{-# LANGUAGE DataKinds #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE UndecidableInstances #-}

module Control.Distributed.Spark.ML.LDA where

import Control.Distributed.Spark.ML.Feature.CountVectorizer
import Control.Distributed.Spark.PairRDD
import Data.Int
import Foreign.C.Types
import Language.Java

-- | Reference to a JVM object of class
-- @org.apache.spark.mllib.clustering.LDA@.
--
-- The derived 'Coercible' instance is what allows values of this type to be
-- created with 'new', used as the target or result of 'call', and turned
-- into a method argument with 'coerce'.
newtype LDA = LDA (J ('Class "org.apache.spark.mllib.clustering.LDA"))
  deriving Coercible

-- | Reference to a JVM object of class
-- @org.apache.spark.mllib.clustering.OnlineLDAOptimizer@.
--
-- In this module it is only created and configured inside 'newLDA', where
-- it is cast to the @LDAOptimizer@ interface before being handed to
-- @setOptimizer@.
newtype OnlineLDAOptimizer = OnlineLDAOptimizer (J ('Class "org.apache.spark.mllib.clustering.OnlineLDAOptimizer"))
  deriving Coercible

-- | Build an MLlib @LDA@ object configured with an @OnlineLDAOptimizer@.
--
-- Each JVM setter is invoked through 'call' and the reference it returns is
-- threaded into the next step. Both concentration parameters are set to
-- @-1@.
--
-- NOTE(review): @-1@ is presumably Spark's sentinel for "pick the value
-- automatically"; confirm against the MLlib LDA documentation.
newLDA :: Double   -- ^ fraction of documents (passed to @setMiniBatchFraction@)
       -> Int32    -- ^ number of topics (passed to @setK@)
       -> Int32    -- ^ maximum number of iterations (passed to @setMaxIterations@)
       -> IO LDA
newLDA frac numTopics maxIterations = do
  bare :: LDA <- new []
  optimizer :: OnlineLDAOptimizer <- new []
  OnlineLDAOptimizer tuned <-
    call optimizer "setMiniBatchFraction" [JDouble frac]
  -- setOptimizer receives the optimizer viewed as the LDAOptimizer
  -- interface, hence the unchecked cast of the reference.
  let asLDAOptimizer :: J ('Iface "org.apache.spark.mllib.clustering.LDAOptimizer")
      asLDAOptimizer = unsafeCast tuned
  withOptimizer :: LDA <- call bare "setOptimizer" [coerce asLDAOptimizer]
  withTopics :: LDA <- call withOptimizer "setK" [JInt numTopics]
  withIterations :: LDA <-
    call withTopics "setMaxIterations" [JInt maxIterations]
  withDocConcentration :: LDA <-
    call withIterations "setDocConcentration" [minusOne]
  call withDocConcentration "setTopicConcentration" [minusOne]
  where
    -- The same JVM double argument is used for both concentration setters.
    minusOne = JDouble (negate 1)

-- | Reference to a JVM object of class
-- @org.apache.spark.mllib.clustering.LDAModel@.
--
-- Values of this type are produced by 'runLDA' and consumed by
-- 'describeResults'.
newtype LDAModel = LDAModel (J ('Class "org.apache.spark.mllib.clustering.LDAModel"))
  deriving Coercible

-- | Invoke the static JVM method @Helper.runLDA@ with the given 'LDA'
-- object and pair RDD, returning the resulting 'LDAModel'.
--
-- NOTE(review): presumably this trains the model on the RDD, with the
-- 'CLong' keys acting as document ids. The Java helper is not visible from
-- this module, so confirm there.
runLDA :: LDA -> PairRDD CLong SparkVector -> IO LDAModel
runLDA lda rdd = callStatic "Helper" "runLDA" jvmArgs
  where
    jvmArgs = [coerce lda, coerce rdd]

-- | Invoke the static JVM method @Helper.describeResults@ with the model,
-- the count-vectorizer model and a term limit. Nothing is returned to the
-- Haskell side.
--
-- NOTE(review): presumably the helper prints up to @maxTerms@ terms per
-- topic, using the vectorizer's vocabulary to name them. Confirm against
-- the Java @Helper@ class.
describeResults :: LDAModel -> CountVectorizerModel -> Int32 -> IO ()
describeResults lm cvm maxTerms = callStatic "Helper" "describeResults" jvmArgs
  where
    jvmArgs = [coerce lm, coerce cvm, JInt maxTerms]