---------------------------------------------------------------------------
-- | Module    : Math.Statistics.Dirichlet.Density
-- Copyright   : (c) 2009-2012 Felipe Lessa
-- License     : BSD3
--
-- Maintainer  : felipe.lessa@gmail.com
-- Stability   : experimental
-- Portability : portable
--
--------------------------------------------------------------------------

module Math.Statistics.Dirichlet.Density
    ( DirichletDensity(..)
    , empty
    , fromList
    , toList
    , derive
    , cost
    ) where

import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

import Control.DeepSeq (NFData(..))
import Numeric.GSL.Special.Gamma (lngamma)
import Numeric.GSL.Special.Psi (psi)

import Math.Statistics.Dirichlet.Options
import Math.Statistics.Dirichlet.Util



-- | A Dirichlet density.  The wrapped unboxed vector holds the
-- alpha (concentration) parameters, one per component.  The
-- constructor does not validate its argument; alphas are assumed
-- to be strictly positive — TODO confirm callers guarantee this.
newtype DirichletDensity = DD {unDD :: U.Vector Double} deriving (Eq)

-- | Rendered as @fromList [...]@, parenthesised whenever the
-- surrounding precedence context (@> 10@) requires it.
instance Show DirichletDensity where
    showsPrec d (DD alphas) = showParen (d > 10) body
      where
        body = showString "fromList " . showsPrec 11 (U.toList alphas)

-- | Accepts the format produced by 'show': @fromList [...]@,
-- optionally wrapped in parentheses.  Malformed input yields no
-- parses (an empty list), as the 'readsPrec' contract requires,
-- instead of raising a pattern-match exception as previous
-- versions did.
instance Read DirichletDensity where
    readsPrec p ('(':xs) =
        case break (== ')') xs of
          (ys, ')':zs) -> map (\(x,s) -> (x, s ++ zs)) (readsPrec p ys)
          _            -> []  -- unmatched '(': no parse
    readsPrec p xs =
        case lex xs of
          [("fromList", rest)] -> map (\(x,s) -> (fromList x, s)) (readsPrec p rest)
          _                    -> []  -- missing "fromList" keyword: no parse

-- | The payload is an unboxed vector, which holds no thunks, so
-- there is nothing beyond the constructor left to force.
instance NFData DirichletDensity where
    rnf (DD _) = ()

-- | @empty n x@ is an \"empty\" Dirichlet density with size
-- @n@ and all alphas set to @x@.
empty :: Int -> Double -> DirichletDensity
-- Written pointed: the previous @(DD .) . U.replicate@ form was
-- operator soup that obscured a trivial definition.
empty n x = DD (U.replicate n x)
{-# INLINE empty #-}

-- | @fromList xs@ constructs a Dirichlet density whose alpha
-- values are taken, in order, from the list @xs@.
fromList :: [Double] -> DirichletDensity
fromList alphas = DD (U.fromList alphas)
{-# INLINE fromList #-}

-- | @toList d@ deconstructs a Dirichlet density into its list of
-- alpha values.
toList :: DirichletDensity -> [Double]
toList = U.toList . unDD
{-# INLINE toList #-}

-- | Derive a Dirichlet density using a maximum likelihood method
-- as described by Karplus et al (equation 26).  All training
-- vectors should have the same length, however this is not
-- verified.
--
-- Internally each alpha is optimised in log-space: the state is a
-- vector of @(log alpha, alpha)@ pairs, the gradient step is
-- applied to @log alpha@, and exponentiation keeps every alpha
-- strictly positive.
derive :: DirichletDensity -> Predicate -> StepSize
         -> TrainingVectors -> Result DirichletDensity
derive (DD initial) (Pred maxIter' minDelta_ deltaSteps' _ _)
             (Step step) trainingData
    -- Validate all parameters before starting to iterate.
    | V.length trainingData == 0 = err "empty training data"
    | U.length initial < 1       = err "empty initial vector"
    | maxIter' < 1               = err "non-positive maxIter"
    | minDelta_ < 0              = err "negative minDelta"
    | deltaSteps' < 1            = err "non-positive deltaSteps"
    | step <= 0                  = err "non-positive step"
    | step >= 1                  = err "step greater than one"
    | otherwise                  = train
    where
      err = error . ("Dirichlet.derive: " ++)

      -- Compensate the different deltaSteps.  The cost (and hence
      -- the delta) is only recomputed every deltaSteps' iterations,
      -- so the convergence threshold is scaled accordingly.
      !minDelta'    = minDelta_ * fromIntegral deltaSteps'

      -- Number of training sequences.
      !trainingSize = fromIntegral $ V.length trainingData

      -- Sums of each training sequence.
      trainingSums :: U.Vector Double
      !trainingSums = G.unstream $ G.stream $ V.map U.sum trainingData

      -- Functions that work on the alphas only (and not their logs).
      calcSumAs = U.sum . snd . U.unzip
      finish    = DD    . snd . U.unzip

      -- Start training in the zero-th iteration and with
      -- infinite initial cost.
      train = train' 1 infinity (U.sum initial) $
              U.map (\x -> (log x, x)) initial

      -- One optimisation step.  sumAs caches the sum of the current
      -- alphas so it is not recomputed per component.
      train' !iter !oldCost !sumAs !alphas =
        -- Reestimate alpha's.
        let !alphas'  = U.imap calculateAlphas alphas
            !psiSumAs = psi sumAs
            !psiSums  = U.sum $ U.map (\sumT -> psi $ sumT + sumAs) trainingSums
            -- Gradient step on log alpha (Karplus et al, eq. 26);
            -- the new alpha is recovered by exponentiation.
            calculateAlphas !i (!w, !a) =
              let !s1 = trainingSize * (psiSumAs - psi a)
                  !s2 = V.sum $ V.map (\t -> psi $ t U.! i + a) trainingData
                  !w' = w + step * a * (s1 + s2 - psiSums)
                  !a' = exp w'
              in (w', a')

        -- Recalculate constants.
            !sumAs'   = calcSumAs alphas'
            -- The cost is only evaluated every deltaSteps'
            -- iterations; in between, the old cost is carried over
            -- (making delta zero below).
            !calcCost = iter `mod` deltaSteps' == 0
            !cost'    = if calcCost then newCost else oldCost
             where newCost = costWorker (snd $ U.unzip alphas') sumAs'
                                        trainingData trainingSums
            !delta    = abs (cost' - oldCost)

        -- Verify convergence.  Even with MaxIter we only stop
        -- iterating if the delta was calculated.  Otherwise we
        -- wouldn't be able to tell the caller why the delta was
        -- still big when we reached the limit.
        in case (calcCost, delta <= minDelta', iter >= maxIter') of
             (True, True, _) -> Result Delta   iter delta cost' $ finish alphas'
             (True, _, True) -> Result MaxIter iter delta cost' $ finish alphas'
             _               -> train' (iter+1) cost' sumAs' alphas'

-- | Cost function for deriving a Dirichlet density (equation
-- 18).  This function is minimized by 'derive'.
cost :: TrainingVectors -> DirichletDensity -> Double
cost trainingData (DD alphas) =
    costWorker alphas (U.sum alphas) trainingData perVectorSums
  where
    -- Sum of each training vector, converted to an unboxed vector
    -- via stream fusion.
    perVectorSums = G.unstream (G.stream (V.map U.sum trainingData))

-- | 'cost' needs to calculate the sum of all training vectors.
-- This function avoids recalculating that quantity in 'derive'
-- multiple times.  It is used by both 'cost' and 'derive'.
costWorker :: U.Vector Double -> Double -> TrainingVectors -> U.Vector Double -> Double
costWorker !alphas !sumAs !trainingData !trainingSums =
    negate (perCount + perTotal + constant)
  where
    -- Term for every individual count of every training vector.
    perCount = V.sum (V.map vectorTerm trainingData)
    vectorTerm t = U.sum (U.zipWith countTerm t alphas)
      where countTerm ti ai = lngamma (ti + ai) - lngamma (ti + 1) - lngamma ai

    -- Term for each training vector's total count.
    perTotal = U.sum (U.map totalTerm trainingSums)
    totalTerm sumT = lngamma (sumT + 1) - lngamma (sumT + sumAs)

    -- One @lngamma sumAs@ contribution per training vector.
    constant = lngamma sumAs * fromIntegral (U.length trainingSums)
{-# INLINE costWorker #-}