module NLP.Hext.NaiveBayes (makeMaterial, runBayes, FList, Labeled(..), Material(..), Classified(..), -- * Example: Simple Usage -- $simpleExample ) where import qualified Data.Map.Lazy as M import Data.Maybe import Data.Char import Data.Function import Data.List import qualified Data.Text.Lazy as T -- | A frequency list of words type FList = M.Map T.Text Int -- TODO make hash -- | A frequency list of words that has been assigned a class data Labeled a = Labeled { flist :: FList -- ^ a labeled frequency list , labeledClass :: a -- ^ the class label for a piece of text } -- | A list of labeled data type Material a = [Labeled a] -- | A class which has a specific probability of occuring data Classified a = Classified { classifiedClass :: a , probability :: Double } deriving (Eq) instance (Eq a) => Ord (Classified a) where compare = compare `on` probability -- | Creates learning material for the program combining -- samples and their corresponding classes into -- a 'Labeled' datatype. makeMaterial :: [(String,a)] -- ^ a list of text samples and their corresponding classes -> Material a makeMaterial ((s,c):rest) = (Labeled (vectorize s) c): makeMaterial rest makeMaterial [] = [] -- | Runs a sample string through the Naive Bayes algorithm using -- training material made by 'makeMaterial' runBayes :: (Eq a) => Material a -- ^ learning material made with 'makeMaterial' -> String -- ^ the sample string to be classified -> a -- ^ a datatype representing a class to classify text runBayes trainingMaterial sample = argmax $ classify trainingMaterial (T.words $ T.pack sample) classify :: (Eq a) => Material a -> [T.Text] -> [Classified a] classify mat = f where classes = nub [c | (Labeled f c) <- mat] lengthVocab = totalUniqueWords . unions $ vecs mat prob c s = let caseC = unions . vecs $ filter (\(Labeled fl cl) -> c == cl) mat n = totalWords caseC denom = n + lengthVocab in foldl' (\acc word -> (pWordGivenClass word denom caseC) * acc) (pClass c mat) s f s = [Classified c $ prob c s | c <- classes] -- the probability of a class occurs, -- given a set of learning material pClass :: (Eq a) => a -> Material a -> Double pClass cl [] = error "no material given" pClass cl docs = let count = length $ filter (\(Labeled fl clas) -> clas == cl) docs in (fromIntegral count) / (fromIntegral $ length docs) -- the probability the word occurs given the class pWordGivenClass :: T.Text -> Int -> FList -> Double pWordGivenClass w denom currentCase = (fromIntegral (nk + 1)) / (fromIntegral denom) where nk = totalOfWord w currentCase -- returns the class that which has the highest probability associated with it argmax :: (Eq a) => [Classified a] -> a argmax = classifiedClass . maximum removePunctuation :: T.Text -> T.Text removePunctuation = T.filter (not . isPunctuation) -- takes a list of words and makes a frequency list vectorize :: String -> FList vectorize = M.fromListWith (+) . flip zip (repeat 1) . T.words . removePunctuation . T.pack -- a list of frequency lists, derived from a set of material vecs :: Material a -> [FList] vecs = map flist -- the union of multiple frequency lists -- adds occurences of each word together unions :: [FList] -> FList unions = M.unionsWith (+) totalUniqueWords :: FList -> Int totalUniqueWords = M.size totalWords :: FList -> Int totalWords = M.foldl' (+) 0 totalOfWord :: T.Text -> FList -> Int totalOfWord word doc = M.findWithDefault 0 word doc {- $simpleExample In this example a list of sample reviews and their corresponding classes are zipped into an association list to be passed into the 'makeMaterial' function. This newly created material is then passed into the 'runBayes' function, along with a new review. This will classify the new review based on the training material that has been given. > data Class = Positive | Negative deriving (Eq, Show) > > doc1 = "I loved the movie" > doc2 = "I hated the movie" > doc3 = "a great movie. good movie" > doc4 = "poor acting" > doc5 = "great acting. a good movie" > > docs = [doc1, doc2, doc3, doc4, doc5] > correspondingClasses = [Positive, Negative, Positive, Negative, Positive] > classifiedDocs = zip docs correspondingClasses > > main :: IO () > main = do > let material = makeMaterial classifiedDocs > let review = "I loved the great acting" > let result = runBayes material review > > putStrLn $ "The review '" ++ review ++ "' is " ++ show result -}