module NLP.Hext.NaiveBayes (FrequencyList,
                            Labeled(..),
                            Classified(..),
                            BayesModel(..),
                            emptyModel,
                            teach,
                            runBayes,
                            -- * Example: Simple Usage
                            -- $simpleExample
                            ) where

import qualified Data.HashMap.Lazy as H
import qualified Data.Set as S
import Data.Maybe
import Data.Char
import Data.Function
import Data.List
import qualified Data.Text.Lazy as T

-- | A hash representing frequency list of words
type FrequencyList = H.HashMap T.Text Int

-- | A frequency list of words that has been assigned a class
data Labeled a = Labeled { hash :: FrequencyList -- ^ a frequency list
                         , label :: a
                         } -- ^ the class label for a piece of text

-- | A class which has a specific probability of occuring
data Classified a = Classified { _class :: a
                               , probability :: Double } deriving (Eq)

-- | A model representing the knowledge that has been given
data BayesModel a = BayesModel { classes :: S.Set a -- ^ a set of user-defined classes
                               , vocab :: FrequencyList -- ^ the frequency list of all vocabulary
                               , material :: [Labeled a] -- ^ a list of all of the classified text
                                     }

instance (Show a) => Show (BayesModel a) where
  show model = show (classes model) ++
        " " ++ show (vocab model)

instance (Eq a) => Ord (Classified a) where compare = compare `on` probability

instance (Show a) => Show (Classified a) where
  show c = show (_class c, probability c)


-- | an empty model to begin teaching
emptyModel :: BayesModel a
emptyModel = BayesModel S.empty H.empty []

-- | teaches the model
teach :: (Ord a) => T.Text -- ^ the sample
                 -> a -- ^ sample's class
                  -> BayesModel a -- ^ the current model
                  -> BayesModel a -- ^ the new model
teach source c model = 
  let fl = vectorize source
      labeled = Labeled fl c
      vc = vocab model
      vc' = H.union fl vc
      cs = classes model
      mat = material model
  in BayesModel (S.insert c cs) vc' (labeled:mat)


-- | Runs a sample string through the Naive Bayes algorithm using
-- a model containing all knowledge from previous learning
runBayes :: (Ord a, Eq a) => BayesModel a  -- ^ a model that has been taught using 'learn'
            -> String -- ^ the sample string to be classified
            -> a -- ^ a datatype representing a class to classify text
runBayes model sample = argmax $ classify model (T.words $ T.pack sample)

classify :: (Ord a, Eq a) => BayesModel a -> [T.Text] -> S.Set (Classified a)
classify model = f where
    cs = classes model
    lengthVocab = H.size $ vocab model
    mat = material model
    prob c ws = 
        let caseC = unions . vecs $ filter ((== c) . label) mat
            n = totalWords caseC
            denom = n + lengthVocab
        in foldl' (\acc word -> (pWordGivenClass word denom caseC) * acc) (pClass c mat) ws
    f wrds = S.map (\c -> Classified c $ prob c wrds) cs

-- the probability of a class occurs,
-- given a set of learning material
pClass :: (Eq a) => a -> [Labeled a] -> Double
pClass cl [] = 0
pClass cl docs =
    let count = length $ filter (\(Labeled fl clas) -> clas == cl) docs
    in (fromIntegral count) / (fromIntegral $ length docs)

-- the probability the word occurs given the class
pWordGivenClass :: T.Text -> Int -> FrequencyList -> Double
pWordGivenClass w denom currentCase =
    (fromIntegral (nk + 1)) / (fromIntegral denom) where
        nk = totalOfWord w currentCase    

-- returns the class that which has the highest probability associated with it
argmax :: (Eq a) => S.Set (Classified a) -> a
argmax = _class . S.findMax

removePunctuation :: T.Text -> T.Text
removePunctuation = T.filter (not . isPunctuation)

-- takes a list of words and makes a frequency list
vectorize :: T.Text -> FrequencyList
vectorize = 
    H.fromListWith (+) . flip zip (repeat 1) . T.words . removePunctuation

-- a list of frequency lists, derived from a set of material
vecs :: [Labeled a] -> [FrequencyList]
vecs = map hash

-- the union of multiple frequency lists
-- adds occurences of each word together
unions :: [FrequencyList] -> FrequencyList
unions = foldl' (\acc hmap -> H.unionWith (+) hmap acc) H.empty 

totalWords :: FrequencyList -> Int
totalWords = H.foldl' (+) 0 

totalOfWord :: T.Text -> FrequencyList -> Int
totalOfWord word doc = H.lookupDefault 0 word doc

{- $simpleExample

In this example a list of sample reviews and their corresponding classes
are zipped into an association list to be passed into the 'makeMaterial' function.
This newly created material is then passed into the 'runBayes' function, along with
a new review. This will classify the new review based on the training material that
has been given.

> data Class = Positive | Negative deriving (Eq, Show)
>
> doc1 = "I loved the movie"
> doc2 = "I hated the movie"
> doc3 = "a great movie. good movie"
> doc4 = "poor acting"
> doc5 = "great acting. a good movie"
>
> docs = [doc1, doc2, doc3, doc4, doc5]
> correspondingClasses = [Positive, Negative, Positive, Negative, Positive]
> classifiedDocs = zip docs correspondingClasses
>
> main :: IO ()
> main = do
>     -- teachMultiple returns a BayesModel Class
>     let teachMultiple = foldl (\m (sample, cl) -> teach (T.pack sample) cl m) emptyModel
>
>     let review = "I hated the poor acting"
>     let result = runBayes (teachMultiple classifiedDocs) review
>     
>     putStrLn $ "The review '" ++ review ++ "' is " ++ show result -- Negative
-}