{-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards #-}
---------------------------------------------------------------------------
-- | Module    : Math.Statistics.Dirichlet.Mixture
-- Copyright   : (c) 2009-2012 Felipe Lessa
-- License     : BSD3
--
-- Maintainer  : felipe.lessa@gmail.com
-- Stability   : experimental
-- Portability : portable
--
---------------------------------------------------------------------------

module Math.Statistics.Dirichlet.Mixture
    ( -- * Data types
      DirichletMixture(..)
    , dmComponents
    , dmParameters
    , dmDensitiesL
    , (!!!)
    , empty
    , Component
    , fromList
    , toList
    , fromDD
      -- * Training data
    , TrainingData
    , prepareTraining
      -- * Functions
    , derive
    , cost
    , del_cost_w
    ) where

import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

import Control.DeepSeq (NFData(..))
import Control.Monad.ST
import Data.Bits
import Data.Function (fix)
import Numeric.GSL.Special.Gamma (lngamma)
import Numeric.GSL.Special.Psi (psi)

import qualified Numeric.Optimization.Algorithms.HagerZhang05 as CG

import qualified Math.Statistics.Dirichlet.Density as D
import qualified Math.Statistics.Dirichlet.Matrix as M
import Math.Statistics.Dirichlet.Density (DirichletDensity(..))
import Math.Statistics.Dirichlet.Matrix (Matrix (..))
import Math.Statistics.Dirichlet.Options
import Math.Statistics.Dirichlet.Util



-- | A Dirichlet mixture.
data DirichletMixture =
    DM { dmWeights    :: !(U.Vector Double)
         -- ^ Weight of each density.
       , dmDensities  :: !M.Matrix
         -- ^ Values of all parameters of all densities.  This
         -- matrix has @length dmWeights@ rows.
       } deriving (Eq)

instance Show DirichletMixture where
    showsPrec prec dm =
      showParen (prec > 10) $
      showString "fromList " .
      showsPrec 11 (toList dm)

instance Read DirichletMixture where
    readsPrec p ('(':xs) = let (ys,')':zs) = break (== ')') xs
                           in map (\(x,s) -> (x,s++zs)) $
                              readsPrec p ys
    readsPrec p xs = let [("fromList",list)] = lex xs
                     in map (\(x,s) -> (fromList x,s)) $
                        readsPrec p list

instance NFData DirichletMixture where
    rnf DM {} = ()


-- | Number of components in a Dirichlet mixture.
dmComponents :: DirichletMixture -> Int
dmComponents = U.length . dmWeights

-- | Number of parameters each component has.
dmParameters :: DirichletMixture -> Int
dmParameters = mCols . dmDensities

-- | Separated list of densities.
dmDensitiesL :: DirichletMixture -> [DirichletDensity]
dmDensitiesL (DM _ as) = map DD $ V.toList $ M.rows as

-- | @dm !!! i@ is the parameter vector of the @i@-th density.
-- No bounds checks are performed.
(!!!) :: DirichletMixture -> Int -> U.Vector Double
(DM _ as) !!! i = as M.!!! i
{-# INLINE (!!!) #-}





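-- Maps a function over the rows (densities) of a mixture,
-- returning one result per component.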
dmap :: (U.Vector Double -> Double) -> DirichletMixture -> U.Vector Double
dmap f = M.rowmap f . dmDensities



-- | @empty q n x@ is an \"empty\" Dirichlet mixture with @q@
-- components and @n@ parameters.  Each component has size @n@,
-- all alphas set to @x@, and a weight proportional to @q - i@,
-- where @i@ is the component's index; the weights therefore sum
-- to 1 and decrease linearly.
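--
-- For instance, @empty 3 4 0.1@ is expected to have weights
-- @[3/6, 2/6, 1/6]@ and a 3x4 matrix of alphas, all equal to
-- @0.1@.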
empty :: Int -> Int -> Double -> DirichletMixture
empty q n x = let (DD d) = D.empty n x
                  f i    = fromIntegral (q-i) / sum_
                  sum_   = fromIntegral (q*(q+1)`div`2)
              in DM {dmWeights    = U.generate q f
                    ,dmDensities  = M.replicateRows q d}
{-# INLINE empty #-}


-- | A list representation of a component of a Dirichlet mixture.
-- Used by 'fromList' and 'toList' only.
type Component = (Double, [Double])

-- | @fromList xs@ constructs a Dirichlet mixture from a
-- non-empty list of components.  Each component has a weight and
-- a list of alpha values.  The weights must sum to 1, all lists
-- must have the same length, and every value must be
-- non-negative.  None of these preconditions is verified.
fromList :: [Component] -> DirichletMixture
fromList components =
  let -- Vectors
      qs =         U.fromList $       map fst components
      as = M q n $ U.fromList $ concatMap snd components

      -- Properties of the mixture
      q  = length components
      n  = length (snd $ head components)
  in DM qs as

-- | @toList dm@ is the inverse of 'fromList': it constructs a
-- list of components from a Dirichlet mixture.  There are no
-- error conditions, and @toList . fromList == id@.
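--
-- A round-trip sketch (values are only illustrative):
--
-- > let dm = fromList [(0.75, [0.5, 1.5]), (0.25, [2, 2])]
-- > toList dm == [(0.75, [0.5, 1.5]), (0.25, [2.0, 2.0])]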
toList :: DirichletMixture -> [Component]
toList dm =
    let qs' = U.toList $ dmWeights dm
        as' = map (U.toList . unDD) (dmDensitiesL dm)
    in zip qs' as'

-- | Constructs a Dirichlet mixture of one component from a
-- Dirichlet density.
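--
-- For example, @fromDD (D.empty 4 0.5)@ should be a mixture with
-- a single component of weight 1 whose four alphas all equal
-- @0.5@.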
fromDD :: DirichletDensity -> DirichletMixture
fromDD (DD d) = DM (U.singleton 1) (M.replicateRows 1 d)





-- | Prepares training vectors to be used as training data.
-- Anything that depends only on the training vectors is
-- precalculated here.
--
-- We also try to find columns where all training vectors are
-- zero.  Those columns are removed from the derivation process,
-- and every component gets a near-zero value on them.  Note that
-- at least one column must have non-zero entries in the training
-- vectors.
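--
-- A usage sketch (counts are hypothetical):
--
-- > let ns_0 = V.fromList [U.fromList [1,0,2], U.fromList [3,0,1]]
-- > let td   = prepareTraining ns_0
--
-- Here column 1 is zero in both vectors, so @zeroes td == [1]@
-- and the vectors stored in @ns td@ keep only the two remaining
-- columns.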
prepareTraining :: TrainingVectors -> TrainingData
prepareTraining ns_0 =
    let zeroes  = zeroedCols ns_0
        ns      = removeZeroes ns_0 zeroes
        ns_sums = G.unstream $ G.stream $ V.map U.sum ns
        tns     = M.fromVectorT ns
    in TD {..}

-- | Pre-processed training vectors (see 'prepareTraining').
data TrainingData = TD { ns      :: !TrainingVectors
                       , ns_sums :: !(U.Vector Double)
                       , tns     :: !Matrix
                       , zeroes  :: ![Int]}
                    deriving (Eq, Show)

-- | Return the list of columns that are zeroed, counting from zero.
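--
-- For example, with training vectors @[0,1,0]@ and @[0,2,0]@,
-- the result should be @[0,2]@: every vector is zero on those
-- columns.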
zeroedCols :: TrainingVectors -> [Int]
zeroedCols =
    -- We set the i-th bit whenever the i-th column was zeroed.
    let fold (acc, mask) 0 = (acc .|. mask,   shiftL mask 1)
        fold (acc, mask) _ = (acc :: Integer, shiftL mask 1)
        unBits !_ 0 = []
        unBits !i x = (if testBit x 0 then (i:) else id)
                      (unBits (i+1) (shiftR x 1))
    in unBits 0 . V.foldl1' (.&.) . V.map (fst . U.foldl' fold (0,1))

-- | Remove zeroed columns from training vectors.
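--
-- Continuing the example above, removing columns @[0,2]@ from
-- @[0,1,0]@ and @[0,2,0]@ leaves the one-column vectors @[1]@
-- and @[2]@.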
removeZeroes :: TrainingVectors -> [Int] -> TrainingVectors
removeZeroes ns [] = ns
removeZeroes ns zs =
    let cols_orig = U.length (V.head ns)
        cols_new  = U.filter (`notElem` zs) $ U.enumFromN 0 cols_orig
    in V.map (flip U.backpermute cols_new) ns

-- | Remove zeroed columns from a Dirichlet mixture matrix of
-- densities.
removeZeroesM :: [Int] -> Matrix -> Matrix
removeZeroesM [] as = as
removeZeroesM zs as =
    let size      = M.mCols as * M.mRows as
        cols_orig = M.mCols as
        cols_new  = U.filter ((`notElem` zs) . (`rem` cols_orig)) $
                    U.enumFromN 0 size
    in M {mCols = M.mCols as - length zs
         ,mRows = M.mRows as
         ,mData = U.backpermute (M.mData as) cols_new}

-- | Add zeroed columns back to a Dirichlet mixture matrix of
-- densities.
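--
-- This is a near-inverse of 'removeZeroesM': the reinserted
-- columns get the small positive value @0.00001@ rather than an
-- exact zero, keeping every alpha strictly positive.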
addZeroesM :: [Int] -> Matrix -> Matrix
addZeroesM []  = id
addZeroesM zs' = M.fromVector .
                 V.map (U.fromList . add 0 zs' . U.toList) .
                 M.rows
    where
      add !_ []     xs                 = xs
      add  _ zs     []                 = map (const zero) zs
      add  i (z:zs) (x:xs) | i == z    = zero : add (i+1) zs (x:xs)
                           | otherwise = x    : add (i+1) (z:zs) xs
      zero = 0.00001





-- | /Prob(a_j | n, theta)/, defined in equation (16): "the
-- posterior probability of the j-th component of the mixture
-- given the vector of counts n".  We return, for each training
-- vector, the probabilities for all /j/.
--
-- The result is transposed for performance: row /j/ of the
-- resulting matrix contains that component's posterior
-- probability for each of the training vectors.
--
-- Calculated as per equation (39) using 'logBeta'.  If we take
-- the numerator of the right-hand side of equation (39) as /Y_j/
-- and the left-hand side as /P_j/, then /P_j/ is /Y_j/
-- normalized to sum to 1 over /j/.  We may run into underflow or
-- overflow if /Y_j/ is too large or too small.  Using the
-- suggestion from the paper, we may multiply all /Y_j/ by a
-- constant before normalizing everything.  Since we calculate
-- /Y_j/ in log space, this means we may freely add or subtract a
-- constant from the logarithm before applying the exponential
-- function.  This is essential.
prob_a_n_theta :: TrainingVectors -> DirichletMixture -> Matrix
prob_a_n_theta ns dm@(DM qs _) =
    let -- Precalculate logBeta of all components
        !logBetaAlphas = dmap logBeta dm

        -- Calculate the factors for one of the training vectors.
        calc n i lb_a  = let !a = dm !!! i
                         in logBeta (U.zipWith (+) n a) - lb_a
        factors n      = let fs  = U.imap (calc n) logBetaAlphas
                             !c  = U.maximum fs  -- see the note above
                             fs' = U.zipWith (\q f -> q * exp (f - c)) qs fs
                             !total = U.sum fs'
                         in U.map (/ total) fs'
    in M.fromVectorT $ V.map factors ns
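
-- A minimal standalone sketch of the renormalization used in
-- @factors@ above (the helper name is ours):
--
-- > normalizeExp :: U.Vector Double -> U.Vector Double
-- > normalizeExp fs = let c   = U.maximum fs
-- >                       fs' = U.map (exp . subtract c) fs
-- >                   in U.map (/ U.sum fs') fs'
--
-- Subtracting the maximum keeps every exponent at most zero, so
-- @exp@ cannot overflow; the shift cancels out in the division.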


-- | Customized version of 'prob_a_n_theta' used when the weights
-- are being estimated.  Precomputes everything that doesn't
-- depend on the weights.
prob_a_n_theta_weights :: TrainingVectors -> Matrix
                       -> (U.Vector Double -> Matrix)
prob_a_n_theta_weights ns as =
    let -- Precalculate logBeta of all components
        !logBetaAlphas   = M.rowmap logBeta as

        -- Precalculate the factors for one of the training vectors.
        precalc n i lb_a = let !a = as M.!!! i
                           in logBeta (U.zipWith (+) n a) - lb_a
        norm fs          = let !c = U.maximum fs
                           in U.map (exp . subtract c) fs
        !prefactors      = V.map (norm . flip U.imap logBetaAlphas . precalc) ns

    in \qs ->
        let -- Calculate the final factors.
            calc pfs = let fs = U.zipWith (*) pfs qs
                           total = U.sum fs
                       in U.map (/ total) fs
        in M.fromVectorT $ V.map calc prefactors












-- | Cost function for deriving a Dirichlet mixture (equation
-- 18).  This function is minimized by 'derive'.  Calculated
-- using (17) and (54).
cost :: TrainingData -> DirichletMixture -> Double
cost td dm =
    let as_sums = dmap U.sum dm
    in cost_worker td dm as_sums


-- | Worker for the 'cost' function; avoids repeating some
-- computations that are also needed when reestimating the alphas.
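--
-- As a reading aid, @prob_n_a@ below computes, in log space,
-- what we take equation (54) to be:
--
-- > Prob(n | a) = n_sum! * Gamma(a_sum) / Gamma(n_sum + a_sum)
-- >             * prod_i [Gamma(n_i + a_i) / (n_i! * Gamma(a_i))]
--
-- where @n_sum = sum n_i@ and @a_sum = sum a_i@.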
cost_worker :: TrainingData -> DirichletMixture
            -> U.Vector Double -> Double
cost_worker TD {ns, ns_sums} dm@(DM !qs _) !as_sums =
    let -- From the equation (54).
        prob_n_a !n !n_sum !a !a_sum !lngamma_a_sum =
            let !s = lngamma (n_sum+1) + lngamma_a_sum - lngamma (n_sum+a_sum)
                f n_i a_i = lngamma (n_i + a_i) - lngamma (n_i + 1) - lngamma a_i
            in exp $ s + U.sum (U.zipWith f n a)

        -- From equation (17).
        prob_n_theta i n =
            let !n_sum = ns_sums U.! i
            in U.sum $ U.zipWith (*) qs $
               U.izipWith (prob_n_a n n_sum . (dm !!!))
                  as_sums lngamma_as_sums
        !lngamma_as_sums = U.map lngamma as_sums
    in negate $ V.sum $ V.imap ((log .) . prob_n_theta) ns

-- | Version of the 'cost' function that avoids repeating a lot
-- of computations that are done when reestimating the weights.
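--
-- Note the curried type: applying the first three arguments
-- precomputes the per-vector factors, so evaluating the cost for
-- a new weight vector @qs@ costs only one dot product and one
-- logarithm per training vector.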
cost_weight :: TrainingData -> Matrix
            -> U.Vector Double -> (U.Vector Double -> Double)
cost_weight TD {ns, ns_sums} !as !as_sums =
    let -- From the equation (54).
        prob_n_a !n !n_sum !a !a_sum !lngamma_a_sum =
            let !s = lngamma (n_sum+1) + lngamma_a_sum - lngamma (n_sum+a_sum)
                f n_i a_i = lngamma (n_i + a_i) - lngamma (n_i + 1) - lngamma a_i
            in exp $ s + U.sum (U.zipWith f n a)

        -- From equation (17).
        prepare_prob_n_theta i n =
            let !n_sum = ns_sums U.! i
            in {- U.sum $ U.zipWith (*) qs $ -}
               U.izipWith (prob_n_a n n_sum . (as M.!!!))
                  as_sums lngamma_as_sums
        !lngamma_as_sums = U.map lngamma as_sums
        !prepared = V.imap prepare_prob_n_theta ns

        -- Final worker function.
        final qs = log . U.sum . U.zipWith (*) qs
    in \(!qs) -> negate $ V.sum $ V.map (final qs) prepared







-- | Derivative of the cost function with respect to @w_{i,j}@,
-- defined by equation (22).  The result has the same shape and
-- order as the 'dmDensities' matrix.
del_cost_w :: TrainingData -> DirichletMixture -> Matrix
del_cost_w td dm =
    let as_sums = dmap U.sum dm
    in del_cost_w_worker td dm as_sums


-- | Worker function of 'del_cost_w'.
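--
-- Reading the code back as a formula, with @p_{t,j}@ the
-- posterior probabilities from 'prob_a_n_theta' and
-- @S_j = sum_t p_{t,j}@, each entry is taken to be:
--
-- > -a_{i,j} * ( S_j * (psi(a_sum_j) - psi(a_{i,j}))
-- >            + sum_t p_{t,j} * psi(n_{t,i} + a_{i,j})
-- >            - sum_t p_{t,j} * psi(n_sum_t + a_sum_j) )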
del_cost_w_worker :: TrainingData -> DirichletMixture
                  -> U.Vector Double -> Matrix
del_cost_w_worker TD {ns, ns_sums, tns} dm !as_sums =
    let -- Calculate Prob(a | n, theta)
        !probs_a_n   = prob_a_n_theta ns dm

        -- Calculate all S_j's.
        !sjs         = M.rowmap U.sum probs_a_n

        -- @calc j probs@ precomputes everything that depends
        -- only on @j@; the function it returns calculates the
        -- derivative of the cost function with respect to
        -- @w_{i,j}@ given @i@ and @a_i@.
        calc j probs =
          -- Everything that doesn't depend on i, just on j.
          let !a_sum        = as_sums U.! j
              !psi_a_sum    = psi a_sum
              !sum_prob_psi = U.sum $ U.zipWith (*) probs $
                              U.map (psi . (+) a_sum) ns_sums
          -----
          in \i a_i ->
            let !s1 = (sjs U.! j) * (psi_a_sum - psi a_i)
                !s2 = U.sum $ U.zipWith (\p_i n_i -> p_i * psi (n_i + a_i)) probs (tns M.!!! i)
            in - a_i * (s1 + s2 - sum_prob_psi)

    in M.fromVector $ V.imap (\j p_j -> let !f = calc j p_j
                                        in U.imap f (dm !!! j))
                             (M.rows probs_a_n)





-- | Derive a Dirichlet mixture using a maximum likelihood method
-- as described by Karplus et al (equation 25) using the
-- CG_DESCENT method by Hager and Zhang (see
-- "Numeric.Optimization.Algorithms.HagerZhang05").  All training
-- vectors should have the same length; however, this is not
-- verified.
derive :: DirichletMixture -> Predicate -> StepSize
       -> TrainingData -> Result DirichletMixture
derive (DM initial_qs initial_as') (Pred {..}) _ td@(TD {ns,zeroes})
    | V.length ns == 0          = err "empty training data"
    | U.length initial_qs < 1   = err "empty initial weights vector"
    | M.size initial_as < (1,1) = err "empty initial alphas vector"
    | maxIter < 1               = err "non-positive maxIter"
    | minDelta < 0              = err "negative minDelta"
    | jumpDelta < 0             = err "negative jumpDelta"
    | jumpDelta < minDelta      = err "minDelta greater than jumpDelta"
    | otherwise                 = runST train
    where
      err = error . ("Dirichlet.derive: " ++)
      singleDensity = U.length initial_qs == 1

      -- Remove zeroes from initial_as'.
      initial_as = removeZeroesM zeroes initial_as'

      -- Reciprocal of the number of training sequences.
      !recip_m = recip $ fromIntegral $ V.length ns

      -- Calculate the sums of the alphas.
      calc_as_sums = M.rowmap U.sum

      -- Parameters used by CG_DESCENT.
      verbose = False
      parameters = CG.defaultParameters
                     { CG.printFinal    = verbose
                     , CG.printParams   = verbose
                     , CG.verbose       = if verbose then CG.VeryVerbose else CG.Quiet
                     , CG.maxItersFac   = max 1 $ fromIntegral maxIter / 20
                     , CG.estimateError = CG.RelativeEpsilon (1e-6 * s)
                     }
        where (w,h) = M.size initial_as
              s = fromIntegral (w * h * V.length ns)

      -- Convert a U.Vector from/to an M.Matrix with the same
      -- shape as initial_as (this covers both the alphas and
      -- the ws).
      fromMatrix = M.mData
      toMatrix v = initial_as {M.mData = v}

      -- Create specialized functions that are optimized by
      -- CG_DESCENT.  They depend only on @qs@, the weights.
      createFunctions !qs =
        let calc f = \ws -> let !as      = M.map exp (toMatrix ws)
                                !as_sums = calc_as_sums as
                                dm       = DM qs as
                            in f dm as_sums
            grad_worker = ((fromMatrix .) .) . del_cost_w_worker
            func = CG.VFunction $ calc $ cost_worker td
            grad = CG.VGradient $ calc $ grad_worker td
            comb = CG.VCombined $ calc $ \dm as_sums ->
                     (cost_worker td dm as_sums
                     ,grad_worker td dm as_sums)
        in (func, grad, comb)
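
      -- Note that CG_DESCENT optimizes over @ws = log as@: taking
      -- @exp@ keeps every alpha strictly positive without any
      -- explicit constraint on the optimizer.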

      -- Start training in the zeroth iteration and with
      -- infinite initial cost.
      train = trainAlphas 0 infinity initial_qs $ M.map log initial_as

      trainAlphas !iter !oldCost !qs !ws = {-# SCC "trainAlphas" #-} do
        -- Optimize using CG_DESCENT
        let (func, grad, comb) = createFunctions qs
            opt = CG.optimize parameters minDelta (fromMatrix ws)
                              func grad (Just comb)

        (!pre_ws', result, stats) <- unsafeIOToST opt
        let !ws' = toMatrix (G.unstream $ G.stream pre_ws')

        -- Recalculate everything.
        let !as'     = M.map exp ws'
            as_sums' = calc_as_sums as'
            !iter'   = iter + fromIntegral (CG.totalIters stats)
            !cost'   = CG.finalValue stats
            !delta   = abs (cost' - oldCost)
            dm       = DM qs $ addZeroesM zeroes as'

        -- Verify convergence.  Even when MaxIter is reached we
        -- only stop after the delta has been calculated;
        -- otherwise we couldn't tell the caller how big the
        -- delta still was when we hit the limit.
        case (decide result
             ,delta <= minDelta
             ,iter' >= maxIter
             ,singleDensity) of
            (Stop r,_,_,_) -> return $ Result r       iter' delta cost' dm
            (_,True,_,_)   -> return $ Result Delta   iter' delta cost' dm
            (_,_,True,_)   -> return $ Result MaxIter iter' delta cost' dm
            (_,_,_,True)   -> return $ Result Delta   iter' delta cost' dm
            (GoOn,_,_,_)   -> trainWeights iter' cost' qs ws' as' as_sums'

      trainWeights !oldIter !veryOldCost !oldQs !ws !as !as_sums =
        {-# SCC "trainWeights" #-}
        -- Prepare invariant parts.
        let !probs_a_n_mk = prob_a_n_theta_weights ns as
            !cost_mk      = cost_weight td as as_sums
        in ($ oldQs) . ($ veryOldCost) . ($ maxWeightIter) . fix $
               \again !itersLeft !oldCost !qs ->
          -- Reestimate the weights.
          let !probs_a_n = probs_a_n_mk qs
              qs' = M.rowmap ((*) recip_m . U.sum) probs_a_n

          -- Recalculate constants.
              !cost'    = cost_mk qs'
              !delta    = abs (cost' - oldCost)

        -- Check for convergence.  We never return a final result
        -- here: when done, we jump back to 'trainAlphas'.
        in case (delta <= jumpDelta, itersLeft <= 0) of
             (False,False) -> again (itersLeft-1) cost' qs'
             _             -> trainAlphas oldIter cost' qs' ws


-- | Decide what we should do depending on the result of the
-- CG_DESCENT routine.
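--
-- (@ToleranceStatisfied@ is spelled this way in the
-- "Numeric.Optimization.Algorithms.HagerZhang05" library itself.)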
decide :: CG.Result -> Decision
decide CG.ToleranceStatisfied = GoOn
decide CG.FunctionChange      = GoOn
decide CG.MaxTotalIter        = GoOn
decide CG.MaxSecantIter       = GoOn
decide other                  = Stop (CG other)

data Decision = GoOn | Stop Reason