{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE BangPatterns #-} -- | -- Module : Data.Monoid.Statistics -- Copyright : Copyright (c) 2010, Alexey Khudyakov -- License : BSD3 -- Maintainer : Alexey Khudyakov -- Stability : experimental -- module Data.Monoid.Statistics ( StatMonoid(..) , evalStatistic -- * Statistic monoids , Count(..) , Mean(..) -- * Additional information -- $info ) where import Data.Int (Int8, Int16, Int32, Int64) import Data.Word (Word8,Word16,Word32,Word64,Word) import Data.Monoid import qualified Data.Foldable as F import GHC.Float (float2Double) -- | Monoid which corresponds to some stattics. In order to do so it -- must be commutative. In many cases it's not practical to -- construct monoids for each element so 'papennd' was added. -- First parameter of type class is monoidal accumulator. Second is -- type of element over which statistic is calculated. -- -- Statistic could be calculated with fold over sample. Since -- accumulator is 'Monoid' such fold could be easily parralelized. -- -- Instance must satisfy following law: -- -- > pappend x (pappend y mempty) == pappend x mempty `mappend` pappend y mempty -- > mappend x y == mappend y x class Monoid m => StatMonoid m a where -- | Add one element to monoid accumulator. P stands for point in -- analogy for Pointed. pappend :: a -> m -> m -- | Calculate statistic over 'Foldable'. It's implemented in terms of -- foldl'. evalStatistic :: (F.Foldable d, StatMonoid m a) => d a -> m evalStatistic = F.foldl' (flip pappend) mempty ---------------------------------------------------------------- -- Data types ---------------------------------------------------------------- -- | Simplest statistics. Number of elements in the sample newtype Count a = Count { calcCount :: a } deriving Show instance Integral a => Monoid (Count a) where mempty = Count 0 (Count i) `mappend` (Count j) = Count (i + j) {-# INLINE mempty #-} {-# INLINE mappend #-} instance (Integral a) => StatMonoid (Count a) b where pappend _ !(Count n) = Count (n + 1) {-# INLINE pappend #-} -- | Mean of sample. Samples of Double,Float and bui;t-in integral -- types are supported -- -- Numeric stability of 'mappend' is not proven. data Mean = Mean { calcMean :: Double -- ^ Current mean , calcCountMean :: Int -- ^ Number of entries } deriving Show instance Monoid Mean where mempty = Mean 0 0 mappend !(Mean x n) !(Mean y k) = Mean ((x*n' + y*k') / (n' + k')) (n + k) where n' = fromIntegral n k' = fromIntegral k {-# INLINE mempty #-} {-# INLINE mappend #-} -- Add one sample elemnt to Mean addValueToMean :: (a -> Double) -> a -> Mean -> Mean addValueToMean f !x !(Mean m n) = Mean (m + (f x - m) / fromIntegral n') n' where n' = n+1 {-# INLINE addValueToMean #-} -- Floating point instance StatMonoid Mean Double where pappend = addValueToMean id {-# INLINE pappend #-} instance StatMonoid Mean Float where pappend = addValueToMean float2Double {-# INLINE pappend #-} -- Basic integrals instance StatMonoid Mean Integer where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} instance StatMonoid Mean Int where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} instance StatMonoid Mean Word where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} -- Fixed size ints instance StatMonoid Mean Int8 where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} instance StatMonoid Mean Int16 where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} instance StatMonoid Mean Int32 where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} instance StatMonoid Mean Int64 where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} -- Fixed size Words instance StatMonoid Mean Word8 where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} instance StatMonoid Mean Word16 where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} instance StatMonoid Mean Word32 where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} instance StatMonoid Mean Word64 where pappend = addValueToMean fromIntegral {-# INLINE pappend #-} ---------------------------------------------------------------- -- Generic monoids ---------------------------------------------------------------- -- | Monoid which allows to calculate two statistics in parralel data TwoStats a b = TwoStats { calcStat1 :: a , calcStat2 :: b } instance (Monoid a, Monoid b) => Monoid (TwoStats a b) where mempty = TwoStats mempty mempty mappend !(TwoStats x y) !(TwoStats x' y') = TwoStats (mappend x x') (mappend y y') {-# INLINE mempty #-} {-# INLINE mappend #-} instance (StatMonoid a x, StatMonoid b x) => StatMonoid (TwoStats a b) x where pappend !x !(TwoStats a b) = TwoStats (pappend x a) (pappend x b) {-# INLINE pappend #-} -- $info -- -- Statistic is function of a sample which does not depend on order of -- elements in a sample. For each statistics corresponding monoid -- could be constructed: -- -- > f :: [A] -> B -- > -- > data F = F [A] -- > -- > evalF (F xs) = f xs -- > -- > instance Monoid F here -- > mempty = F [] -- > (F a) `mappend` (F b) = F (a ++ b) -- -- This indeed proves that monoid could be constructed. Monoid above -- is completely impractical. It runs in O(n) space. However for some -- statistics monoids which runs in O(1) space could be -- implemented. For example mean. -- -- On the other hand some statistics could not be implemented in such -- way. For example calculation of median require O(n) space. Variance -- could be implemented in O(1) but such implementation won't be -- numerically stable.