{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveFoldable #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveTraversable #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE ViewPatterns #-} -- | -- Monoids for calculating various statistics in constant space module Data.Monoid.Statistics.Numeric ( -- * Mean & Variance -- ** Number of elements CountG(..) , Count , asCount -- ** Mean algorithms -- ** Default algorithms , Mean , asMean , WMean , asWMean -- *** Mean , MeanNaive(..) , asMeanNaive , MeanKBN(..) , asMeanKBN -- *** Weighted mean , WMeanNaive(..) , asWMeanNaive , WMeanKBN(..) , asWMeanKBN -- ** Variance , Variance(..) , asVariance -- * Maximum and minimum , Max(..) , Min(..) , MaxD(..) , MinD(..) -- * Binomial trials , BinomAcc(..) , asBinomAcc -- * Rest , Weighted(..) -- * References -- $references ) where import Control.Monad.Catch (MonadThrow(..)) import Data.Data (Typeable,Data) import Data.Vector.Unboxed (Unbox) import Data.Vector.Unboxed.Deriving (derivingUnbox) import Numeric.Sum import GHC.Generics (Generic) import Data.Monoid.Statistics.Class ---------------------------------------------------------------- -- Statistical monoids ---------------------------------------------------------------- -- | Calculate number of elements in the sample. newtype CountG a = CountG { calcCountN :: a } deriving (Show,Eq,Ord,Typeable) type Count = CountG Int -- | Type restricted 'id' asCount :: CountG a -> CountG a asCount = id instance Integral a => Semigroup (CountG a) where CountG i <> CountG j = CountG (i + j) instance Integral a => Monoid (CountG a) where mempty = CountG 0 mappend = (<>) instance (Integral a) => StatMonoid (CountG a) b where singletonMonoid _ = CountG 1 addValue (CountG n) _ = CountG (n + 1) instance CalcCount (CountG Int) where calcCount = calcCountN ---------------------------------------------------------------- -- | Type alias for currently recommended algorithms for calculation -- of mean. It should be default choice type Mean = MeanKBN asMean :: Mean -> Mean asMean = id -- | Type alias for currently recommended algorithms for calculation -- of weighted mean. It should be default choice type WMean = WMeanKBN asWMean :: WMean -> WMean asWMean = id ---------------------------------------------------------------- -- | Incremental calculation of mean. It tracks separately number of -- elements and running sum. Note that summation of floating point -- numbers loses precision and genrally use 'MeanKBN' is -- recommended. data MeanNaive = MeanNaive !Int !Double deriving (Show,Eq,Typeable,Data,Generic) asMeanNaive :: MeanNaive -> MeanNaive asMeanNaive = id instance Semigroup MeanNaive where MeanNaive 0 _ <> m = m m <> MeanNaive 0 _ = m MeanNaive n1 s1 <> MeanNaive n2 s2 = MeanNaive (n1+n2) (s1 + s2) instance Monoid MeanNaive where mempty = MeanNaive 0 0 mappend = (<>) instance Real a => StatMonoid MeanNaive a where addValue (MeanNaive n m) x = MeanNaive (n+1) (m + realToFrac x) {-# INLINE addValue #-} instance CalcCount MeanNaive where calcCount (MeanNaive n _) = n instance CalcMean MeanNaive where calcMean (MeanNaive 0 _) = throwM $ EmptySample "Data.Monoid.Statistics.Numeric.MeanNaive: calcMean" calcMean (MeanNaive n s) = return (s / fromIntegral n) ---------------------------------------------------------------- -- | Incremental calculation of mean. It tracks separately number of -- elements and running sum. It uses algorithm for compensated -- summation which works with mantissa of double size at cost of -- doing more operations. This means that it's usually possible to -- compute sum (and therefore mean) within 1 ulp. data MeanKBN = MeanKBN !Int {-# UNPACK #-} !KBNSum deriving (Show,Eq,Typeable,Data,Generic) asMeanKBN :: MeanKBN -> MeanKBN asMeanKBN = id instance Semigroup MeanKBN where MeanKBN 0 _ <> m = m m <> MeanKBN 0 _ = m MeanKBN n1 s1 <> MeanKBN n2 s2 = MeanKBN (n1+n2) (s1 <> s2) instance Monoid MeanKBN where mempty = MeanKBN 0 mempty mappend = (<>) instance Real a => StatMonoid MeanKBN a where addValue (MeanKBN n m) x = MeanKBN (n+1) (addValue m x) {-# INLINE addValue #-} instance CalcCount MeanKBN where calcCount (MeanKBN n _) = n instance CalcMean MeanKBN where calcMean (MeanKBN 0 _) = throwM $ EmptySample "Data.Monoid.Statistics.Numeric.MeanKBN: calcMean" calcMean (MeanKBN n s) = return (kbn s / fromIntegral n) ---------------------------------------------------------------- -- | Incremental calculation of weighed mean. data WMeanNaive = WMeanNaive !Double -- Weight !Double -- Weighted sum deriving (Show,Eq,Typeable,Data,Generic) asWMeanNaive :: WMeanNaive -> WMeanNaive asWMeanNaive = id instance Semigroup WMeanNaive where WMeanNaive w1 s1 <> WMeanNaive w2 s2 = WMeanNaive (w1 + w2) (s1 + s2) instance Monoid WMeanNaive where mempty = WMeanNaive 0 0 mappend = (<>) instance (Real w, Real a) => StatMonoid WMeanNaive (Weighted w a) where addValue (WMeanNaive n s) (Weighted w a) = WMeanNaive (n + w') (s + (w' * a')) where w' = realToFrac w a' = realToFrac a {-# INLINE addValue #-} instance CalcMean WMeanNaive where calcMean (WMeanNaive w s) | w <= 0 = throwM $ EmptySample "Data.Monoid.Statistics.Numeric.WMeanNaive: calcMean" | otherwise = return (s / w) ---------------------------------------------------------------- -- | Incremental calculation of weighed mean. Sum of both weights and -- elements is calculated using Kahan-Babuška-Neumaier summation. data WMeanKBN = WMeanKBN {-# UNPACK #-} !KBNSum -- Weight {-# UNPACK #-} !KBNSum -- Weighted sum deriving (Show,Eq,Typeable,Data,Generic) asWMeanKBN :: WMeanKBN -> WMeanKBN asWMeanKBN = id instance Semigroup WMeanKBN where WMeanKBN n1 s1 <> WMeanKBN n2 s2 = WMeanKBN (n1 <> n2) (s1 <> s2) instance Monoid WMeanKBN where mempty = WMeanKBN mempty mempty mappend = (<>) instance (Real w, Real a) => StatMonoid WMeanKBN (Weighted w a) where addValue (WMeanKBN n m) (Weighted w a) = WMeanKBN (add n w') (add m (w' * a')) where w' = realToFrac w :: Double a' = realToFrac a :: Double {-# INLINE addValue #-} instance CalcMean WMeanKBN where calcMean (WMeanKBN (kbn -> w) (kbn -> s)) | w <= 0 = throwM $ EmptySample "Data.Monoid.Statistics.Numeric.WMeanKBN: calcMean" | otherwise = return (s / w) ---------------------------------------------------------------- -- | This is algorithm for estimation of mean and variance of sample -- which uses modified Welford algorithm. It uses KBN summation and -- provides approximately 2 additional decimal digits data VarWelfordKBN = VarWelfordKBN {-# UNPACK #-} !Int -- Number of elements in the sample {-# UNPACK #-} !KBNSum -- Current sum of elements of sample {-# UNPACK #-} !KBNSum -- Current sum of squares of deviations from current mean asVarWelfordKBN :: VarWelfordKBN -> VarWelfordKBN asVarWelfordKBN = id -- | Incremental algorithms for calculation the standard deviation [Chan1979]. data Variance = Variance {-# UNPACK #-} !Int -- Number of elements in the sample {-# UNPACK #-} !Double -- Current sum of elements of sample {-# UNPACK #-} !Double -- Current sum of squares of deviations from current mean deriving (Show,Eq,Typeable) -- | Type restricted 'id ' asVariance :: Variance -> Variance asVariance = id instance Semigroup Variance where Variance n1 ta sa <> Variance n2 tb sb = Variance (n1+n2) (ta+tb) sumsq where na = fromIntegral n1 nb = fromIntegral n2 nom = sqr (ta * nb - tb * na) sumsq | n1 == 0 = sb | n2 == 0 = sa | otherwise = sa + sb + nom / ((na + nb) * na * nb) instance Monoid Variance where mempty = Variance 0 0 0 mappend = (<>) instance Real a => StatMonoid Variance a where addValue (Variance 0 _ _) x = singletonMonoid x addValue (Variance n t s) (realToFrac -> x) = Variance (n + 1) (t + x) (s + sqr (t - n' * x) / (n' * (n'+1))) where n' = fromIntegral n {-# INLINE addValue #-} singletonMonoid x = Variance 1 (realToFrac x) 0 {-# INLINE singletonMonoid #-} instance CalcCount Variance where calcCount (Variance n _ _) = n instance CalcMean Variance where calcMean (Variance 0 _ _) = throwM $ EmptySample "Data.Monoid.Statistics.Numeric.Variance: calcMean" calcMean (Variance n s _) = return (s / fromIntegral n) instance CalcVariance Variance where calcVariance (Variance n _ s) | n < 2 = throwM $ InvalidSample "Data.Monoid.Statistics.Numeric.Variance: calcVariance" "Need at least 2 elements" | otherwise = return $! s / fromIntegral (n - 1) calcVarianceML (Variance n _ s) | n < 1 = throwM $ InvalidSample "Data.Monoid.Statistics.Numeric.Variance: calcVarianceML" "Need at least 1 element" | otherwise = return $! s / fromIntegral n ---------------------------------------------------------------- -- | Calculate minimum of sample newtype Min a = Min { calcMin :: Maybe a } deriving (Show,Eq,Ord,Typeable,Data,Generic) instance Ord a => Semigroup (Min a) where Min (Just a) <> Min (Just b) = Min (Just $! min a b) Min a <> Min Nothing = Min a Min Nothing <> Min b = Min b instance Ord a => Monoid (Min a) where mempty = Min Nothing mappend = (<>) instance (Ord a, a ~ a') => StatMonoid (Min a) a' where singletonMonoid a = Min (Just a) ---------------------------------------------------------------- -- | Calculate maximum of sample newtype Max a = Max { calcMax :: Maybe a } deriving (Show,Eq,Ord,Typeable,Data,Generic) instance Ord a => Semigroup (Max a) where Max (Just a) <> Max (Just b) = Max (Just $! max a b) Max a <> Max Nothing = Max a Max Nothing <> Max b = Max b instance Ord a => Monoid (Max a) where mempty = Max Nothing mappend = (<>) instance (Ord a, a ~ a') => StatMonoid (Max a) a' where singletonMonoid a = Max (Just a) ---------------------------------------------------------------- -- | Calculate minimum of sample of Doubles. For empty sample returns NaN. Any -- NaN encountered will be ignored. newtype MinD = MinD { calcMinD :: Double } deriving (Show,Typeable,Data,Generic) instance Eq MinD where MinD a == MinD b | isNaN a && isNaN b = True | otherwise = a == b instance Semigroup MinD where MinD x <> MinD y | isNaN x = MinD y | isNaN y = MinD x | otherwise = MinD (min x y) -- N.B. forall (x :: Double) (x <= NaN) == False instance Monoid MinD where mempty = MinD (0/0) mappend = (<>) instance a ~ Double => StatMonoid MinD a where singletonMonoid = MinD -- | Calculate maximum of sample. For empty sample returns NaN. Any -- NaN encountered will be ignored. newtype MaxD = MaxD { calcMaxD :: Double } deriving (Show,Typeable,Data,Generic) instance Eq MaxD where MaxD a == MaxD b | isNaN a && isNaN b = True | otherwise = a == b instance Semigroup MaxD where MaxD x <> MaxD y | isNaN x = MaxD y | isNaN y = MaxD x | otherwise = MaxD (max x y) instance Monoid MaxD where mempty = MaxD (0/0) mappend = (<>) instance a ~ Double => StatMonoid MaxD a where singletonMonoid = MaxD ---------------------------------------------------------------- -- | Accumulator for binomial trials. data BinomAcc = BinomAcc { binomAccSuccess :: !Int , binomAccTotal :: !Int } deriving (Show,Eq,Ord,Typeable,Data,Generic) -- | Type restricted 'id' asBinomAcc :: BinomAcc -> BinomAcc asBinomAcc = id instance Semigroup BinomAcc where BinomAcc n1 m1 <> BinomAcc n2 m2 = BinomAcc (n1+n2) (m1+m2) instance Monoid BinomAcc where mempty = BinomAcc 0 0 mappend = (<>) instance StatMonoid BinomAcc Bool where addValue (BinomAcc nS nT) True = BinomAcc (nS+1) (nT+1) addValue (BinomAcc nS nT) False = BinomAcc nS (nT+1) -- | Value @a@ weighted by weight @w@ data Weighted w a = Weighted w a deriving (Show,Eq,Ord,Typeable,Data,Generic,Functor,Foldable,Traversable) ---------------------------------------------------------------- -- Helpers ---------------------------------------------------------------- sqr :: Double -> Double sqr x = x * x {-# INLINE sqr #-} ---------------------------------------------------------------- -- Unboxed instances ---------------------------------------------------------------- derivingUnbox "CountG" [t| forall a. Unbox a => CountG a -> a |] [| calcCountN |] [| CountG |] derivingUnbox "MeanKBN" [t| MeanKBN -> (Int,Double,Double) |] [| \(MeanKBN a (KBNSum b c)) -> (a,b,c) |] [| \(a,b,c) -> MeanKBN a (KBNSum b c) |] derivingUnbox "Variance" [t| Variance -> (Int,Double,Double) |] [| \(Variance a b c) -> (a,b,c) |] [| \(a,b,c) -> Variance a b c |] derivingUnbox "MinD" [t| MinD -> Double |] [| calcMinD |] [| MinD |] derivingUnbox "MaxD" [t| MaxD -> Double |] [| calcMaxD |] [| MaxD |] derivingUnbox "Weighted" [t| forall w a. (Unbox w, Unbox a) => Weighted w a -> (w,a) |] [| \(Weighted w a) -> (w,a) |] [| \(w,a) -> Weighted w a |] -- $references -- -- * [Welford1962] Welford, B.P. (1962) Note on a method for -- calculating corrected sums of squares and -- products. /Technometrics/ -- 4(3):419-420. -- -- * [Chan1979] Chan, Tony F.; Golub, Gene H.; LeVeque, Randall -- J. (1979), Updating Formulae and a Pairwise Algorithm for -- Computing Sample Variances., Technical Report STAN-CS-79-773, -- Department of Computer Science, Stanford University. Page 4.