module Statistics.EM.TwoGaussian
( emFix
, emStarts
) where
import qualified Data.Vector.Unboxed as VU
import Statistics.Distribution
import Statistics.Distribution.Normal
import Control.Monad.Fix (fix)
import Data.List (sort,maximumBy,tails)
import Data.Ord
type Weight = Double
type Normal = (Double,Double)
emStep :: VU.Vector Double -> (Weight,Normal,Normal) -> (Weight,Normal,Normal)
emStep xs (w,(mu1,s1),(mu2,s2)) = (w',(mu1',s1'),(mu2',s2')) where
ys = VU.map responsibility xs
n1 = normalDistr mu1 s1
n2 = normalDistr mu2 s2
responsibility y = let
n1y = density n1 y
n2y = density n2 y
in w * n2y / ((1w) * n1y + w * n2y)
w' = VU.sum ys / fromIntegral (VU.length ys)
div1 = VU.sum . VU.map (\y -> 1y) $ ys
mu1' = (VU.sum $ VU.zipWith (\x y -> (1y) * x) xs ys) / div1
s1' = (VU.sum $ VU.zipWith (\x y -> (1y) * (xmu1')^2) xs ys) / div1
div2 = VU.sum $ ys
mu2' = (VU.sum $ VU.zipWith (\x y -> y * x) xs ys) / div2
s2' = (VU.sum $ VU.zipWith (\x y -> y * (xmu2')^2) xs ys) / div2
emIter :: VU.Vector Double -> (Weight,Normal,Normal) -> [(Weight,Normal,Normal)]
emIter xs theta = iterate (emStep xs) theta
emFix :: VU.Vector Double -> (Weight,Normal,Normal) -> (Weight,Normal,Normal)
emFix xs theta = last . map fst . takeWhile f $ zip zs (tail zs) where
zs = emIter xs theta
f ( (w1,(mu11,s11),(mu12,s12)) , (w2,(mu21,s21),(mu22,s22)) ) = w1 =/= w2 && mu11 =/= mu21 && mu12 =/= mu22 && s11 =/= s21 && s12 =/= s22
a =/= b = abs (ab) > epsilon
epsilon = 10.0 ^^ (10)
emStarts :: VU.Vector Double -> (Weight,Normal,Normal)
emStarts xs = maximumBy (comparing loglikelihood) . map (emFix xs) $ [f xs mu1 mu2 | mu1 <- VU.toList xs, mu2 <- VU.toList xs, mu1<mu2] where
f xs mu1 mu2 = (0.5,(mu1,sampleVar),(mu2,sampleVar))
loglikelihood (w,(mu1,s1),(mu2,s2)) = VU.sum . VU.map ll $ xs where
ll x = log $ (1w) * density n1 x + w * density n2 x
n1 = normalDistr mu1 s1
n2 = normalDistr mu2 s2
sampleMu = VU.sum xs / (fromIntegral $ VU.length xs)
sampleVar = (VU.sum . VU.map (\x -> (xsampleMu)^2) $ xs) / (fromIntegral $ VU.length xs)