module NLP.Similarity.VectorSim where
import Control.Applicative ((<$>))
import Prelude hiding (lookup)
import Data.DefaultMap (DefaultMap)
import Test.QuickCheck (Arbitrary(..))
import qualified Data.HashMap.Strict as HM
import qualified Data.DefaultMap as DM
import qualified Data.HashSet as HSet
import Data.Text (Text)
import qualified Data.Text as T
import Data.List (elemIndices)
import GHC.Generics
import NLP.Types
import Control.DeepSeq (NFData(..), deepseq)
newtype TermVector = TermVector (DefaultMap Text Double)
deriving (Read, Show, Eq, Generic, NFData)
instance Arbitrary TermVector where
arbitrary = do
theMap <- arbitrary
let zeroMap = theMap { DM.defDefault = 0 }
return $ TermVector zeroMap
data Document = Document { docTermFrequencies :: HM.HashMap Text Int
, docTokens :: [Text]
}
deriving (Read, Show, Eq, Generic)
instance NFData Document where
rnf (Document f m) = f `deepseq` m `deepseq` ()
instance Arbitrary Document where
arbitrary = mkDocument <$> arbitrary
mkDocument :: [Text] -> Document
mkDocument ts = Document mp ts
where
mp = foldr (\t -> HM.insertWith (+) t 1) HM.empty ts
fromTV :: TermVector -> DefaultMap Text Double
fromTV (TermVector dm) = dm
mkVector :: Corpus -> Document -> TermVector
mkVector corpus doc =
TermVector $ DM.DefMap { DM.defDefault = 0
, DM.defMap = HM.mapWithKey (\t _ -> tf_idf t doc corpus) (docTermFrequencies doc)
}
sim :: Corpus -> Text -> Text -> Double
sim corpus doc1 doc2 = similarity corpus (T.words doc1) (T.words doc2)
similarity :: Corpus -> [Text] -> [Text] -> Double
similarity corpus doc1 doc2 = let
vec1 = mkVector corpus $ mkDocument doc1
vec2 = mkVector corpus $ mkDocument doc2
in tvSim vec1 vec2
tvSim :: TermVector -> TermVector -> Double
tvSim doc1 doc2 = let
theCos = cosVec doc1 doc2
in if isNaN theCos then 0 else theCos
tf :: Text -> Document -> Int
tf term doc = HM.lookupDefault 0 term (docTermFrequencies doc)
idf :: Text -> Corpus -> Double
idf term corpus = let
docCount = 1 + corpLength corpus
containedInCount = 2 + termCounts corpus term
in log (fromIntegral docCount / fromIntegral containedInCount)
tf_idf :: Text -> Document -> Corpus -> Double
tf_idf term doc corpus = let
freq = tf term doc
result | freq == 0 = 0
| otherwise = (fromIntegral freq) * idf term corpus
in result
cosVec :: TermVector -> TermVector -> Double
cosVec vec1 vec2 = let
dp = dotProd vec1 vec2
mag = (magnitude vec1 * magnitude vec2)
in dp / mag
addVectors :: TermVector -> TermVector -> TermVector
addVectors vec1 vec2 = TermVector (DM.unionWith (+) 0 (fromTV vec1) (fromTV vec2))
zeroVector :: TermVector
zeroVector = TermVector (DM.empty 0)
negate :: TermVector -> TermVector
negate vec = TermVector (DM.map ((1) *) $ fromTV vec)
sum :: [TermVector] -> TermVector
sum = foldr addVectors zeroVector
magnitude :: TermVector -> Double
magnitude v = sqrt $ DM.foldl acc 0 $ fromTV v
where
acc :: Double -> Double -> Double
acc cur new = cur + (new ** 2)
dotProd :: TermVector -> TermVector -> Double
dotProd xs ys = let
terms = vectorToSet xs `HSet.union` vectorToSet ys
in HSet.foldl' (+) 0 (HSet.map (\t -> (lookup t xs) * (lookup t ys)) terms)
where
vectorToSet = HSet.fromMap . (HM.map (const ())) . DM.defMap . fromTV
keys :: TermVector -> [Text]
keys tv = DM.keys $ fromTV tv
lookup :: Text -> TermVector -> Double
lookup key tv = DM.lookup key $ fromTV tv