{-# LANGUAGE DeriveDataTypeable #-} ----------------------------------------------------------------------------- -- | -- Module : Data.Summary.Double -- Copyright : Copyright (c) 2010, Patrick Perry -- License : BSD3 -- Maintainer : Patrick Perry -- Stability : experimental -- -- Summary statistics for @Double@s. -- module Data.Summary.Double ( -- * Summary type Summary, -- * Properties -- ** Sum size, sum, sumSquaredErrors, -- ** Mean mean, meanSE, meanCI, -- ** Scale stddev, variance, -- ** Extremes maximum, minimum, -- * Construction empty, singleton, -- * Insertion insert, insertWith, -- * Combination union, unions, -- * Conversion -- ** Lists fromList, fromListWith, -- ** Statistics toStats, fromStats, ) where import Prelude hiding (maximum, minimum, sum) import qualified Prelude as P import Data.Data( Data, Typeable ) import Data.List( foldl' ) import Data.Monoid import Text.Printf import Data.Summary.Utils( interval ) -- | A type for storing summary statistics for a data set including -- sample size, min and max values, and first and second moments. data Summary = S {-# UNPACK #-} !Int -- sample size {-# UNPACK #-} !Double -- sample mean {-# UNPACK #-} !Double -- sum of squares {-# UNPACK #-} !Double -- sample min {-# UNPACK #-} !Double -- sample max deriving(Eq, Data, Typeable) instance Show Summary where show s@(S n mu _ l h) = printf " sample size: %d" n ++ printf "\n minimum: %g" l ++ printf "\n maximum: %g" h ++ printf "\n mean: %g" mu ++ printf "\n SE: %g" (meanSE s) ++ printf "\n 99%% CI: (%g, %g)" c1 c2 where (c1,c2) = meanCI 0.99 s instance Monoid Summary where mempty = empty mappend = union -- | Number of observations. size :: Summary -> Int size (S n _ _ _ _) = n -- | Sum of values. sum :: Summary -> Double sum s = (fromIntegral $ size s) * (mean s) -- | Mean value. mean :: Summary -> Double mean (S _ m _ _ _) = m -- | Standard error of the mean. meanSE :: Summary -> Double meanSE s = sqrt (variance s / fromIntegral (size s)) -- | Get a Central Limit Theorem based confidence interval for the -- population mean with the specified coverage level. The level must -- be in the range @(0,1)@. meanCI :: Double -> Summary -> (Double,Double) meanCI level s = interval level (mean s) (meanSE s) -- | Sample standard deviation. stddev :: Summary -> Double stddev s = sqrt (variance s) -- | Sample variance. variance :: Summary -> Double variance s = (sumSquaredErrors s) / fromIntegral (size s - 1) -- | Sum of squared errors @(x[i] - mean)^2@. sumSquaredErrors :: Summary -> Double sumSquaredErrors (S _ _ s _ _) = s -- | Maximum value. maximum :: Summary -> Double maximum (S _ _ _ _ h) = h -- | Minimum value. minimum :: Summary -> Double minimum (S _ _ _ l _) = l -- | An empty summary. empty :: Summary empty = S 0 0 0 (1/0) (-1/0) -- | Summarize a single value. singleton :: Double -> Summary singleton x = S 1 x 0 x x -- | Update the summary with a data point. -- Running mean and variance computed as in Knuth, Vol 2, page 232, -- 3rd edition, see for -- a description. insert :: Double -> Summary -> Summary insert x (S n m s l h) = let n' = n+1 delta = x - m m' = m + delta / fromIntegral n' s' = s + delta*(x - m') l' = if x < l then x else l h' = if x > h then x else h in S n' m' s' l' h' -- | Apply a function and update the summary with the result. insertWith :: (a -> Double) -> a -> Summary -> Summary insertWith f a = insert (f a) -- | Take the union of two summaries. -- Use the updating rules from Chan et al. \"Updating Formulae and a Pairwise -- Algorithm for Computing Sample Variances,\" available at -- . union :: Summary -> Summary -> Summary union (S na ma sa la ha) (S nb mb sb lb hb) = let delta = mb - ma (na', nb') = (fromIntegral na, fromIntegral nb) n = na + nb n' = fromIntegral n weightedDelta = delta*nb'/n' m | n == 0 = 0 | otherwise = ma + weightedDelta s | n == 0 = 0 | otherwise = sa + sb + delta*na'*weightedDelta l = P.min la lb h = P.max ha hb in S n m s l h -- | Take the union of a list of summaries. unions :: [Summary] -> Summary unions = foldl' union empty -- | Get a summary of a list of values. fromList :: [Double] -> Summary fromList = foldl' (flip insert) empty -- | Map a function over a list of values and summarize the results. fromListWith :: (a -> Double) -> [a] -> Summary fromListWith f = fromList . map f -- | Convert to (size, mean, sumSquaredErrors, minimum, maximum). toStats :: Summary -> (Int,Double,Double,Double,Double) toStats (S n m s l u) = (n,m,s,l,u) -- | Convert from (size, mean, sumSquaredErrors, minimum, maximum). -- No validation is performed. fromStats :: Int -> Double -> Double -> Double -> Double -> Summary fromStats = S