module NLP.Similarity.VectorSim where

import Data.DefaultMap (DefaultMap)
import qualified Data.DefaultMap as DM
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as T
import Data.List (elemIndices)

import NLP.Types

-- | An efficient (ish) representation for documents in the "bag of
-- words" sense: a map from term to weight.  The default value of the
-- map supplies the weight for absent terms (0 as built by 'mkVector').
type TermVector = DefaultMap Text Double

-- | Generate a `TermVector` from a tokenized document.
--
-- Each distinct term in the document is weighted by its tf*idf score
-- against the given corpus; absent terms default to 0.
mkVector :: Corpus -> [Text] -> TermVector
mkVector corpus doc =
  DM.fromList 0 [ (term, tf_idf term doc corpus) | term <- uniqueTerms ]
  where
    -- De-duplicate the document's tokens before scoring.
    uniqueTerms = Set.toList (Set.fromList doc)


-- | Invokes similarity on full strings, using `T.words` for
-- tokenization, and no stemming.
--
-- There *must* be at least one document in the corpus.
sim :: Corpus -> Text -> Text -> Double
sim corpus doc1 doc2 = similarity corpus (tokenize doc1) (tokenize doc2)
  where
    -- Whitespace tokenization only; no stemming or case folding.
    tokenize = T.words

-- | Determine how similar two documents are.
--
-- This function assumes that each document has been tokenized and (if
-- desired) stemmed/case-normalized.
--
-- This is a wrapper around `tvSim`, which is a *much* more efficient
-- implementation.  If you need to run similarity against any single
-- document more than once, then you should create `TermVector`s for
-- each of your documents and use `tvSim` instead of `similarity`.
--
-- There *must* be at least one document in the corpus.
similarity :: Corpus -> [Text] -> [Text] -> Double
similarity corpus doc1 doc2 =
  tvSim (mkVector corpus doc1) (mkVector corpus doc2)

-- | Determine how similar two documents are.
--
-- Calculates the similarity between two documents, represented as
-- `TermVectors`.  A NaN cosine (e.g. when a vector has zero
-- magnitude) is reported as 0.
tvSim :: TermVector -> TermVector -> Double
tvSim doc1 doc2
  | isNaN theCos = 0
  | otherwise    = theCos
  where
    theCos = cosVec doc1 doc2

-- | Return the raw frequency of a term in a body of text.
--
-- The first argument is the term to find, the second is a tokenized
-- document. This function does not do any stemming or additional text
-- modification.
tf :: Eq a => a -> [a] -> Int
-- Counting matches directly avoids building the intermediate index
-- list that `elemIndices` produces.
tf term = length . filter (== term)

-- | Calculate the inverse document frequency.
--
-- The IDF is, roughly speaking, a measure of how popular a term is.
-- The document count for the term is smoothed with (+1) so the
-- denominator is never zero.
idf :: Text -> Corpus -> Double
idf term corpus =
  log (fromIntegral docCount / fromIntegral containedInCount)
  where
    docCount         = corpLength corpus
    containedInCount = 1 + termCounts corpus term

-- | Calculate the tf*idf measure for a term given a document and a
-- corpus.
--
-- The document is added to the corpus before computing the IDF, so
-- the term is guaranteed to occur in at least one corpus document.
tf_idf :: Text -> [Text] -> Corpus -> Double
tf_idf term doc corp
  | freq == 0 = 0
  | otherwise = fromIntegral freq * idf term corpus
  where
    freq   = tf term doc
    corpus = addDocument corp doc

-- | Cosine of the angle between two term vectors.
--
-- NOTE(review): if either vector has zero magnitude the result is
-- NaN; callers such as 'tvSim' are expected to handle that.
cosVec :: TermVector -> TermVector -> Double
cosVec vec1 vec2 = dotProd vec1 vec2 / (magnitude vec1 * magnitude vec2)

-- | Calculate the magnitude (Euclidean norm) of a vector.
magnitude :: TermVector -> Double
magnitude v = sqrt $ DM.foldl acc 0 v
  where
    -- Square by self-multiplication: `x * x` is exact and cheaper
    -- than the floating-point power operator (`x ** 2`).
    acc :: Double -> Double -> Double
    acc cur new = cur + (new * new)

-- | Find the dot product of two vectors.
--
-- Sums x_t * y_t over the union of both vectors' keys; terms missing
-- from a vector contribute that vector's default value via
-- 'DM.lookup'.
--
-- Fixed: the previous implementation built a @Set Double@ of the
-- per-term products via 'Set.map' before summing.  Because sets
-- de-duplicate equal elements, any two terms whose products happened
-- to be equal were counted only once, under-reporting the dot
-- product.  Folding the sum directly over the term set avoids that.
dotProd :: TermVector -> TermVector -> Double
dotProd xs ys = Set.foldl' (\acc t -> acc + DM.lookup t xs * DM.lookup t ys) 0 terms
  where
    terms = Set.fromList (DM.keys xs) `Set.union` Set.fromList (DM.keys ys)