module NLP.LDA
(
runSampler
, pass
, runLDA
, Sampler
, LDA
, Finalized
, Doc
, D
, W
, Z
, docTopics
, wordTopics
, topics
, alphasum
, beta
, topicNum
, vSize
, model
, topicDocs
, topicWords
, initial
, finalize
, docTopicWeights
, compress
, Table2D
, Table1D
)
where
import qualified Data.IntMap as IntMap
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector as V
import qualified Data.List as List
import Prelude hiding (sum)
import GHC.Generics (Generic)
import Data.Random (rvarT)
import Data.RVar
import Data.Random.Distribution.Categorical
import Control.Monad.State
import Data.Random.Source.PureMT (pureMT)
import Data.Word (Word64)
import NLP.LDA.Utils (count)
import NLP.LDA.UnboxedMaybeVector ()
-- | Document index.
type D = Int
-- | Topic index.
type Z = Int
-- | Word (vocabulary) index.
type W = Int
-- | A document: its index paired with its words, each word carrying an
-- optional current topic assignment ('Nothing' before the first pass).
type Doc = (D, U.Vector (W, Maybe Z))
-- | Two-level count table: outer key -> inner key -> count.
type Table2D = IntMap.IntMap Table1D
-- | Count table keyed by 'Int'.
type Table1D = IntMap.IntMap Double
-- | LDA model state: the count tables (sufficient statistics) together
-- with the fixed hyperparameters.
data LDA =
  LDA
  { docTopics  :: Table2D  -- ^ Per-document topic counts: doc -> topic -> count
  , wordTopics :: Table2D  -- ^ Per-word topic counts: word -> topic -> count
  , topics     :: Table1D  -- ^ Global per-topic counts: topic -> count
  , alphasum   :: !Double  -- ^ Total Dirichlet prior mass over topics;
                           --   divided by 'topicNum' for the per-topic alpha
  , beta       :: !Double  -- ^ Per-word Dirichlet smoothing parameter
  , topicNum   :: !Int     -- ^ Number of topics K
  , vSize      :: !Int     -- ^ Number of distinct words seen so far
                           --   (incremented in 'update' on unseen words)
  } deriving (Generic)
-- | A model together with inverted count tables, convenient for
-- querying counts by topic.
data Finalized =
  Finalized
  { model      :: LDA      -- ^ The underlying model
  , topicDocs  :: Table2D  -- ^ 'docTopics' inverted: topic -> doc -> count
  , topicWords :: Table2D  -- ^ 'wordTopics' inverted: topic -> word -> count
  }
  deriving (Generic)
type Sampler a = RVarT (State LDA) a
-- | An empty model for @k@ topics with total topic prior mass @a@ and
-- word smoothing @b@.  All count tables start empty and the vocabulary
-- size starts at zero.
initial :: Int -> Double -> Double -> LDA
initial k a b =
  LDA { topicNum   = k
      , alphasum   = a
      , beta       = b
      , docTopics  = IntMap.empty
      , wordTopics = IntMap.empty
      , topics     = IntMap.empty
      , vSize      = 0
      }
-- | Package a model together with the topic-major inversions of its
-- document and word count tables.
finalize :: LDA -> Finalized
finalize m =
  Finalized { model      = m
            , topicDocs  = invert (docTopics m)
            , topicWords = invert (wordTopics m)
            }
-- | One sweep of Gibbs sampling over every document in the corpus.
pass :: V.Vector Doc -> Sampler (V.Vector Doc)
pass docs = V.mapM passOne docs
-- | Run a sampler computation with the given random seed and initial
-- model, returning its result together with the final model state.
runSampler :: Word64 -> LDA -> Sampler a -> (a, LDA)
runSampler seed m action =
  runState (evalStateT (sampleRVarTWith lift action) (pureMT seed)) m
-- | Run @n@ full Gibbs sampling passes over the corpus, starting from
-- the given seed and model.
runLDA :: Word64
       -> Int
       -> LDA
       -> V.Vector Doc
       -> (V.Vector Doc, LDA)
runLDA seed n m ds =
  runSampler seed m (foldM step ds [1 .. n])
  where
    -- The iteration counter is only there to drive the fold.
    step docs _ = pass docs
-- | Drop zero-valued entries from every inner table.
compress :: IntMap.IntMap (IntMap.IntMap Double)
         -> IntMap.IntMap (IntMap.IntMap Double)
compress table = IntMap.map dezero table
-- | Resample the topic of every word in one document (collapsed Gibbs
-- step).  For each word we first SUBTRACT its current assignment from
-- the count tables (so the word is not conditioned on itself), draw a
-- new topic from the conditional distribution, and add the new
-- assignment back in.
passOne :: Doc -> Sampler Doc
passOne (d, wz) = do
  zs <- U.mapM one wz
  return (d, U.zip (U.map fst wz) (U.map Just zs))
  where
    one (w, mz) = do
      m <- lift get
      -- BUG FIX: was @update (1)@ — the previous assignment must be
      -- decremented, not incremented, before resampling; otherwise
      -- counts grow without bound on every pass.
      let m' = maybe m (update (-1) m d w) mz
      lift $ put m'
      z <- randomZ d w
      -- Record the freshly sampled assignment.
      lift $ put (update 1 m' d w z)
      return z
-- | Draw a topic for word @w@ in document @d@ from the categorical
-- distribution given by the current conditional topic weights.
randomZ :: D -> W -> Sampler Z
randomZ d w = do
  m <- lift get
  let weights = wordTopicWeights m d w
      -- fromWeightedList expects (weight, value) pairs, hence the swap.
      pairs   = U.toList (U.map swap (U.indexed weights))
  sampleCategorical (fromWeightedList pairs)
-- | Unnormalized conditional distribution over the K topics for word
-- @w@ in document @d@ under the current counts:
--
-- > (n_dz + alpha) * (n_wz + beta) / (n_z + V * beta)
--
-- for each topic @z@, where @alpha@ is the symmetric per-topic prior
-- ('alphasum' / K) and @V@ is the vocabulary size seen so far.
wordTopicWeights :: LDA -> D -> W -> U.Vector Double
wordTopicWeights m d w =
  let k  = topicNum m
      a  = alphasum m / fromIntegral k   -- symmetric per-topic alpha
      b  = beta m
      dt = IntMap.findWithDefault IntMap.empty d . docTopics $ m
      wt = IntMap.findWithDefault IntMap.empty w . wordTopics $ m
      v  = fromIntegral . vSize $ m
      weights = [ (count z dt + a)
                  * (count z wt + b)
                  * (1 / (count z (topics m) + v * b))
                  -- BUG FIX: was @[0..k1]@ (undefined @k1@); the weight
                  -- vector must cover exactly the K topics 0..K-1 —
                  -- 'docTopicWeights' accumulates it into a vector of
                  -- length 'topicNum'.
                | z <- [0 .. k - 1] ]
  in U.fromList weights
-- | Per-topic weights for a whole document: the element-wise sum of
-- the conditional topic weights of each of its words.
docTopicWeights :: LDA -> Doc -> U.Vector Double
docTopicWeights m (d, ws) =
  let zeros    = U.replicate (topicNum m) 0
      -- (topic, weight) contributions from every word in the document.
      contribs = U.concatMap (U.indexed . wordTopicWeights m d) (U.map fst ws)
  in U.accumulate (+) zeros contribs
-- | Adjust all count tables by @c@ (+1 to add, -1 to remove) for the
-- assignment of topic @z@ to word @w@ in document @d@.
update :: Double -> LDA -> D -> W -> Z -> LDA
update c m d w z =
  m { docTopics  = upd c (docTopics m) d z
    , wordTopics = upd c (wordTopics m) w z
    , topics     = IntMap.insertWith' (+) z c (topics m)
    , vSize      = vSize m + unseen
    }
  where
    -- 1 when w has never had a topic assignment before, else 0;
    -- keeps 'vSize' equal to the number of distinct words seen.
    unseen = fromEnum (IntMap.notMember w (wordTopics m))
-- | Add @delta@ to the count at @(outer, inner)@ in a two-level table,
-- creating the inner table on first use.
upd :: Double -> Table2D -> Int -> Int -> Table2D
upd delta table outer inner =
  IntMap.insertWith' merge outer (IntMap.singleton inner delta) table
  where
    -- flip so the pre-existing inner map is the left operand.
    merge = flip (IntMap.unionWith (+))
-- | Draw a sample from a categorical distribution over topics.
sampleCategorical :: Categorical Double Z -> Sampler Z
sampleCategorical dist = sampleRVarT (rvarT dist)
-- | Remove zero-count entries from a count table.
dezero :: IntMap.IntMap Double -> IntMap.IntMap Double
dezero table = IntMap.filter (/= 0) table
-- | Swap the two key levels of a nested count table:
-- @invert t ! a ! b == t ! b ! a@ for every present entry.
invert :: Table2D -> Table2D
invert outer =
  List.foldl' insertSwapped IntMap.empty swapped
  where
    insertSwapped acc (a, b, v) = upd v acc a b
    -- Every (outerKey, innerKey, value) triple with the keys flipped.
    swapped = [ (innerKey, outerKey, v)
              | (outerKey, row) <- IntMap.toList outer
              , (innerKey, v)   <- IntMap.toList row ]
-- | Flip a (topic, weight) pair into (weight, topic), forcing both
-- components.
swap :: (Int, Double) -> (Double, Int)
swap (!z, !weight) = (weight, z)