{-# LANGUAGE ImportQualifiedPost #-}

-- LDA model from Anglican
-- (https://bitbucket.org/probprog/anglican-white-paper)

-- This model is just a toy/reference implementation.
-- A more serious one would not store documents as lists of words.
-- The point is just to showcase the model

module LDA where

import Control.Monad qualified as List (replicateM)
import Control.Monad.Bayes.Class
  ( MonadDistribution (categorical, dirichlet, uniformD),
    MonadMeasure,
    factor,
  )
import Control.Monad.Bayes.Sampler.Strict (sampleIO, sampleIOfixed)
import Control.Monad.Bayes.Traced (mh)
import Control.Monad.Bayes.Weighted (unweighted)
import Data.Map qualified as Map
import Data.Text (Text, words)
import Data.Vector as V (Vector, replicate, (!))
import Data.Vector qualified as V hiding (length, mapM, mapM_)
import Numeric.Log (Log (Exp))
import Text.Pretty.Simple (pPrint)
import Prelude hiding (words)

vocabulary :: [Text]
vocabulary = ["bear", "wolf", "python", "prolog"]

topics :: [Text]
topics = ["topic1", "topic2"]

type Documents = [[Text]]

documents :: Documents
documents =
  [ words "bear wolf bear wolf bear wolf python wolf bear wolf",
    words "python prolog python prolog python prolog python prolog python prolog",
    words "bear wolf bear wolf bear wolf bear wolf bear wolf",
    words "python prolog python prolog python prolog python prolog python prolog",
    words "bear wolf bear python bear wolf bear wolf bear wolf"
  ]

wordDistPrior :: MonadDistribution m => m (V.Vector Double)
wordDistPrior = dirichlet $ V.replicate (length vocabulary) 1

topicDistPrior :: MonadDistribution m => m (V.Vector Double)
topicDistPrior = dirichlet $ V.replicate (length topics) 1

wordIndex :: Map.Map Text Int
wordIndex = Map.fromList $ zip vocabulary [0 ..]

lda ::
  MonadMeasure m =>
  Documents ->
  m (Map.Map Text (V.Vector (Text, Double)), [(Text, V.Vector (Text, Double))])
lda docs = do
  word_dist_for_topic <- do
    ts <- List.replicateM (length topics) wordDistPrior
    return $ Map.fromList $ zip topics ts
  let obs doc = do
        topic_dist <- topicDistPrior
        let f word = do
              topic <- (fmap (topics !!) . categorical) topic_dist
              factor $ (Exp . log) $ (word_dist_for_topic Map.! topic) V.! (wordIndex Map.! word)
        mapM_ f doc
        return topic_dist
  td <- mapM obs docs
  return
    ( fmap (V.zip (V.fromList vocabulary)) word_dist_for_topic,
      zip (fmap (foldr1 (\x y -> x <> " " <> y)) docs) (fmap (V.zip $ V.fromList ["topic1", "topic2"]) td)
    )

syntheticData :: MonadDistribution m => Int -> Int -> m [[Text]]
syntheticData d w = List.replicateM d (List.replicateM w syntheticWord)
  where
    syntheticWord = uniformD vocabulary

runLDA :: IO ()
runLDA = do
  s <- sampleIOfixed $ unweighted $ mh 1000 $ lda documents
  pPrint (head s)