-- | EM for a mixture of k one-dimensional Gaussians. This procedure tends to
-- produce "NaN"s whenever more Gaussians are being selected than are called
-- for. This is rather convenient. ;-)
--
-- TODO cite paper
module Statistics.EM.GMM
( emFix
, emStarts
) where
import Control.Monad.Fix (fix)
import Data.List (sort,maximumBy,tails,inits,genericLength)
import Data.Ord
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import Statistics.Distribution
import Statistics.Distribution.Normal
import Data.Tuple.Select (sel2)
type Data = VU.Vector Double
type Theta = VU.Vector (Double,Double,Double) -- weight, mean, variance
type ThetaL = [(Double,Double,Double)] -- weight, mean, variance
-- | Perform one EM step given the data. In General, emSteps should be iterated
-- until some convergence criterion is met.
emStep :: Data -> Theta -> Theta
emStep xs ts = ts' where
-- E-step
-- TODO this could be made easier by using hmatrix with enabled "vector"
resps = V.map (\i -> calcResV (ts `VU.unsafeIndex` i)) $ V.enumFromN 0 tlen
calcResV t = VU.map (calcRes t) xs
calcRes (w,mu,s) x = (w * density (normalDistr mu s) x) / (VU.sum . VU.map (\(w',mu',s') -> w'* density (normalDistr mu' s') x) $ ts)
ns = VU.map (\i -> VU.sum $ resps `V.unsafeIndex` i) $ VU.enumFromN 0 tlen
-- M-step
ws = VU.map (\w -> w / fromIntegral (VU.length xs)) ns
mus = VU.map (\i ->
(1/ ns `VU.unsafeIndex` i) *
(VU.sum $ VU.zipWith (*) (resps `V.unsafeIndex` i) xs))
$ VU.enumFromN 0 tlen
ss = VU.map (\i ->
(1/ ns `VU.unsafeIndex` i) *
(VU.sum $ VU.zipWith (*) (resps `V.unsafeIndex` i) (VU.map (\x -> (x - (mus `VU.unsafeIndex` i))^2) xs)))
$ VU.enumFromN 0 tlen
ts' = VU.zip3 ws mus ss
tlen = VU.length ts
-- | Produces an infinite list of 'Theta's that will (should) convergence
-- toward a local optimum.
emIter :: Data -> Theta -> [Theta]
emIter xs ts = iterate (emStep xs) ts
-- | Find an optimal set of parameters 'Theta'. The additional "takeWhile (not
-- . isnan . fst)" makes sure that in cases of overfitting, 'emFix' does
-- terminate. Due to the way we check and take, in case of NaNs, the returned
-- values will be NaNs (checking fst, returning snd).
emFix :: Data -> Theta -> Theta
emFix xs ts = res where
res = last . map snd . takeWhile (not . isnan . fst) . takeWhile (not . converged) $ zip ys zs
ys = emIter xs ts
zs = tail ys
converged (y,z) = abs (logLikelihood y xs - logLikelihood z xs) < epsilon
epsilon = 10 ^^ (-10)
isnan ns = let (ws,_,_) = VU.unzip3 ns in VU.any isNaN ws
-- | Calculate the log-likelihood for a given set of parameters 'Theta' and
-- some data 'Data'. Used by 'emFix' to estimate if convergence is reached.
--
-- TODO could be useful in a more general setting within StatisticalMethods.
logLikelihood :: Theta -> Data -> Double
logLikelihood ts xs = (VU.sum . VU.map lls $ xs) / (fromIntegral $ VU.length xs) where
lls x = VU.sum . VU.map (\t -> ll t x) $ ts
ll (w,mu,s) x = w * density (normalDistr mu s) x
-- | Given a set of 'Data' and a number 'k' of Gaussian peaks, try to find the
-- optimal GMM. This is done by trying each data point as mu for each Gaussian.
-- Note that this will be rather slow for larger 'k' (larger than, say 2 or 3).
-- In that case, a random-drawing method should be chosen.
--
-- TODO xs' -> xs sorting makes me cry!
emStarts :: Int -> Data -> Theta
emStarts k xs' = maximumBy (comparing (\t -> logLikelihood t xs)) . map (emFix xs) $ ts where
ts = map VU.fromList . f k . VU.toList $ xs
mkT mu = (w,mu,sampleVar)
f l zs
| l< 1 = error "emStarts called with k<1"
| l==1 = map (\z -> [mkT z]) zs
| l> 1 = [mkT y : ys | y <- zs, ys <- f (l-1) (dropWhile ( (x-sampleMu)^2) $ xs) / (fromIntegral $ VU.length xs)
w = 1 / fromIntegral k
xs = VU.fromList . sort . VU.toList $ xs'