Safe Haskell	None
Language	Haskell2010

NLP.Similarity.VectorSim

Synopsis

Documentation

newtype TermVector Source #

An efficient (ish) representation for documents in the "bag of words" sense.

Constructors

TermVector (DefaultMap Text Double)

Instances

Eq TermVector Source #
Methods (==) :: TermVector -> TermVector -> Bool # (/=) :: TermVector -> TermVector -> Bool #
Read TermVector Source #
Methods readsPrec :: Int -> ReadS TermVector # readList :: ReadS [TermVector] # readPrec :: ReadPrec TermVector # readListPrec :: ReadPrec [TermVector] #
Show TermVector Source #
Methods showsPrec :: Int -> TermVector -> ShowS # show :: TermVector -> String # showList :: [TermVector] -> ShowS #
Generic TermVector Source #
Associated Types type Rep TermVector :: * -> * # Methods from :: TermVector -> Rep TermVector x # to :: Rep TermVector x -> TermVector #
Arbitrary TermVector Source #
Methods arbitrary :: Gen TermVector # shrink :: TermVector -> [TermVector] #
NFData TermVector Source #
Methods rnf :: TermVector -> () #
type Rep TermVector Source #
type Rep TermVector = D1 (MetaData "TermVector" "NLP.Similarity.VectorSim" "chatter-0.9.1.0-CnWxxDeMROyIxVsZb3fGkc" True) (C1 (MetaCons "TermVector" PrefixI False) (S1 (MetaSel (Nothing Symbol) NoSourceUnpackedness NoSourceStrictness DecidedLazy) (Rec0 (DefaultMap Text Double))))

data Document Source #

Constructors

Document
Fields docTermFrequencies :: HashMap Text Int docTokens :: [Text]

Instances

Eq Document Source #
Methods (==) :: Document -> Document -> Bool # (/=) :: Document -> Document -> Bool #
Read Document Source #
Methods readsPrec :: Int -> ReadS Document # readList :: ReadS [Document] # readPrec :: ReadPrec Document # readListPrec :: ReadPrec [Document] #
Show Document Source #
Methods showsPrec :: Int -> Document -> ShowS # show :: Document -> String # showList :: [Document] -> ShowS #
Generic Document Source #
Associated Types type Rep Document :: * -> * # Methods from :: Document -> Rep Document x # to :: Rep Document x -> Document #
Arbitrary Document Source #
Methods arbitrary :: Gen Document # shrink :: Document -> [Document] #
NFData Document Source #
Methods rnf :: Document -> () #
type Rep Document Source #
type Rep Document = D1 (MetaData "Document" "NLP.Similarity.VectorSim" "chatter-0.9.1.0-CnWxxDeMROyIxVsZb3fGkc" False) (C1 (MetaCons "Document" PrefixI True) ((:*:) (S1 (MetaSel (Just Symbol "docTermFrequencies") NoSourceUnpackedness NoSourceStrictness DecidedLazy) (Rec0 (HashMap Text Int))) (S1 (MetaSel (Just Symbol "docTokens") NoSourceUnpackedness NoSourceStrictness DecidedLazy) (Rec0 [Text]))))

mkDocument :: [Text] -> Document Source #

Make a document from a list of tokens.

fromTV :: TermVector -> DefaultMap Text Double Source #

Access the underlying DefaultMap used to store term vector details.

mkVector :: Corpus -> Document -> TermVector Source #

Generate a TermVector from a tokenized document.

sim :: Corpus -> Text -> Text -> Double Source #

Invokes similarity on full strings, using words for tokenization and no stemming. The return value will be in the range [0, 1].

There *must* be at least one document in the corpus.

similarity :: Corpus -> [Text] -> [Text] -> Double Source #

Determine how similar two documents are.

This function assumes that each document has been tokenized and (if desired) stemmed/case-normalized.

This is a wrapper around tvSim, which is a *much* more efficient implementation. If you need to run similarity against any single document more than once, then you should create TermVectors for each of your documents and use tvSim instead of similarity.

The return value will be in the range [0, 1].

There *must* be at least one document in the corpus.

tvSim :: TermVector -> TermVector -> Double Source #

Determine how similar two documents are.

Calculates the similarity between two documents, represented as TermVectors, returning a double in the range [0, 1] where 1 represents "most similar".

tf :: Text -> Document -> Int Source #

Return the raw frequency of a term in a body of text.

The first argument is the term to find; the second is a tokenized document. This function does not do any stemming or additional text modification.

idf :: Text -> Corpus -> Double Source #

Calculate the inverse document frequency.

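The IDF is, roughly speaking, an inverse measure of how popular a term is across the corpus: terms that appear in many documents receive a lower value than terms that appear in only a few.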

tf_idf :: Text -> Document -> Corpus -> Double Source #

Calculate the tf*idf measure for a term given a document and a corpus.

cosVec :: TermVector -> TermVector -> Double Source #

addVectors :: TermVector -> TermVector -> TermVector Source #

Add two term vectors. When a term is added, its value in each vector is used (or that vector's default value is used if the term is absent from the vector). The new term vector resulting from the addition always uses a default value of zero.

zeroVector :: TermVector Source #

A "zero vector" term vector (i.e. addVectors v zeroVector = v).

negate :: TermVector -> TermVector Source #

Negate a term vector.

sum :: [TermVector] -> TermVector Source #

Add a list of term vectors.

magnitude :: TermVector -> Double Source #

Calculate the magnitude of a vector.

dotProd :: TermVector -> TermVector -> Double Source #

Find the dot product of two vectors.

keys :: TermVector -> [Text] Source #

lookup :: Text -> TermVector -> Double Source #