-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/

-- | A library of simple NLP algorithms.
--
-- chatter is a collection of simple Natural Language Processing
-- algorithms.
--
-- Chatter supports:
--
-- | Predict the best label for the given features, by dot-producting the
-- features with the current weights.
--
-- Ported from Python:
--
-- def predict(self, features):
--     '''Dot-product the features and current weights and return the best label.'''
--     scores = defaultdict(float)
--     for feat, value in features.items():
--         if feat not in self.weights or value == 0:
--             continue
--         weights = self.weights[feat]
--         for label, weight in weights.items():
--             scores[label] += value * weight
--     # Do a secondary alphabetic sort, for stability
--     return max(self.classes, key=lambda label: (scores[label], label))
--
predict :: Perceptron -> Map Feature Int -> Maybe Class
train :: Int -> Perceptron -> [(Map Feature Int, Class)] -> IO Perceptron
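-- A hedged usage sketch for predict and train (not part of the original
-- documentation; feats1, feats2, cls1, and cls2 stand in for whatever Feature
-- maps and Class values your application builds, and emptyPerceptron is the
-- blank model documented further below):
--
-- > do let examples = [(feats1, cls1), (feats2, cls2)]
-- >    p <- train 5 emptyPerceptron examples  -- 5 training iterations
-- >    print (predict p feats1)               -- prints the predicted Class, if any
--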
-- | Update the perceptron with a new example.
--
-- update(self, truth, guess, features)
--     ...
--     self.i += 1
--     if truth == guess:
--         return None
--     for f in features:
--         weights = self.weights.setdefault(f, {})  # setdefault is Map.findWithDefault, and destructive.
--         upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
--         upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
--     return None
--
update :: Perceptron -> Class -> Class -> [Feature] -> Perceptron
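-- The core of update, minus the timestamp and count bookkeeping, is a pair of
-- nested-Map adjustments: +1.0 for the true class and -1.0 for the guess, for
-- every feature. A standalone sketch, assuming a nested-Map weight
-- representation (the actual Perceptron internals are not shown here, and
-- updFeat is a hypothetical helper):
--
-- > import Data.Map (Map)
-- > import qualified Data.Map as Map
-- >
-- > -- Add delta to the weight of class c for feature f, inserting the
-- > -- feature and class if they have not been seen before.
-- > updFeat :: Class -> Feature -> Double
-- >         -> Map Feature (Map Class Double) -> Map Feature (Map Class Double)
-- > updFeat c f delta = Map.insertWith (Map.unionWith (+)) f (Map.singleton c delta)
--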
-- | Average the weights
--
-- Ported from Python:
--
--
-- def average_weights(self):
--     for feat, weights in self.weights.items():
--         new_feat_weights = {}
--         for clas, weight in weights.items():
--             param = (feat, clas)
--             total = self._totals[param]
--             total += (self.i - self._tstamps[param]) * weight
--             averaged = round(total / float(self.i), 3)
--             if averaged:
--                 new_feat_weights[clas] = averaged
--         self.weights[feat] = new_feat_weights
--     return None
--
averageWeights :: Perceptron -> Perceptron
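-- The per-weight arithmetic of the averaging step, separated from the
-- Perceptron record (avgWeight is a hypothetical helper that mirrors the
-- Python above):
--
-- > -- i is the update counter, tstamp the weight's last-update tick,
-- > -- total the accumulated weight, w the current weight.
-- > avgWeight :: Int -> Int -> Double -> Double -> Double
-- > avgWeight i tstamp total w =
-- >   let total' = total + fromIntegral (i - tstamp) * w
-- >   in fromIntegral (round (total' / fromIntegral i * 1000) :: Integer) / 1000
--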
instance Read Feature
instance Show Feature
instance Eq Feature
instance Ord Feature
instance Generic Feature
instance Read Class
instance Show Class
instance Eq Class
instance Ord Class
instance Generic Class
instance Read Perceptron
instance Show Perceptron
instance Eq Perceptron
instance Generic Perceptron
instance Datatype D1Feature
instance Constructor C1_0Feature
instance Datatype D1Class
instance Constructor C1_0Class
instance Datatype D1Perceptron
instance Constructor C1_0Perceptron
instance Selector S1_0_0Perceptron
instance Selector S1_0_1Perceptron
instance Selector S1_0_2Perceptron
instance Selector S1_0_3Perceptron
instance NFData Perceptron
instance Serialize Perceptron
instance Serialize Class
instance Serialize Feature
module NLP.Similarity.VectorSim
-- | An efficient (ish) representation for documents in the "bag of words"
-- sense.
type TermVector = DefaultMap Text Double
-- | Generate a TermVector from a tokenized document.
mkVector :: Corpus -> [Text] -> TermVector
-- | Invokes similarity on full strings, using words for
-- tokenization, and no stemming.
--
-- There *must* be at least one document in the corpus.
sim :: Corpus -> Text -> Text -> Double
-- | Determine how similar two documents are.
--
-- This function assumes that each document has been tokenized and (if
-- desired) stemmed/case-normalized.
--
-- This is a wrapper around tvSim, which is a *much* more
-- efficient implementation. If you need to run similarity against any
-- single document more than once, then you should create
-- TermVectors for each of your documents and use tvSim
-- instead of similarity.
--
-- There *must* be at least one document in the corpus.
similarity :: Corpus -> [Text] -> [Text] -> Double
-- | Determine how similar two documents are.
--
-- Calculates the similarity between two documents, represented as
-- TermVectors
tvSim :: TermVector -> TermVector -> Double
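-- A hedged usage sketch: given a Corpus value (built elsewhere with the
-- library's corpus constructor) and two pre-tokenized documents, build the
-- TermVectors once and compare them with tvSim (corpus, docA, and docB are
-- placeholder names; string literals assume OverloadedStrings):
--
-- > let docA = mkVector corpus ["the", "dog", "jumped"]
-- >     docB = mkVector corpus ["the", "cat", "slept"]
-- > in tvSim docA docB
--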
-- | Return the raw frequency of a term in a body of text.
--
-- The first argument is the term to find, the second is a tokenized
-- document. This function does not do any stemming or additional text
-- modification.
tf :: Eq a => a -> [a] -> Int
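-- For example (an illustrative call, not a doctest from the package):
--
-- >>> tf "dog" ["the", "dog", "saw", "the", "dog"]
-- 2
--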
-- | Calculate the inverse document frequency.
--
-- The IDF is, roughly speaking, a measure of how rare a term is across the
-- corpus: the rarer the term, the higher its IDF.
idf :: Text -> Corpus -> Double
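-- A common formulation of the idea, as a sketch only (the exact counting and
-- smoothing chatter uses is not shown in this documentation):
--
-- > idfSketch :: Double -> Double -> Double
-- > idfSketch totalDocs docsWithTerm = log (totalDocs / docsWithTerm)
--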
-- | Calculate the tf*idf measure for a term given a document and a corpus.
tf_idf :: Text -> [Text] -> Corpus -> Double
cosVec :: TermVector -> TermVector -> Double
-- | Calculate the magnitude of a vector.
magnitude :: TermVector -> Double
-- | Find the dot product of two vectors.
dotProd :: TermVector -> TermVector -> Double
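-- cosVec is presumably the cosine similarity of the two vectors; its
-- relationship to the two functions above, as a sketch (cosVecSketch is a
-- hypothetical name, and the zero-magnitude case is ignored):
--
-- > cosVecSketch :: TermVector -> TermVector -> Double
-- > cosVecSketch v1 v2 = dotProd v1 v2 / (magnitude v1 * magnitude v2)
--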
-- | This is a very simple wrapper around Parsec for writing Information
-- Extraction patterns.
--
-- Because the particular tags/tokens to parse depends on the training
-- corpus (for POS tagging) and the domain, this module only provides
-- basic extractors. You can, for example, create an extractor to find
-- noun phrases by combining the components provided here:
--
-- nounPhrase :: Extractor (Text, Tag)
-- nounPhrase = do
--   nlist <- many1 (try (posTok $ Tag "NN")
--              <|> try (posTok $ Tag "DT")
--              <|> (posTok $ Tag "JJ"))
--   let term = T.intercalate " " (map fst nlist)
--   return (term, Tag "n-phr")
--
module NLP.Extraction.Parsec
-- | A Parsec parser.
--
-- Example usage:
--
-- > :set -XOverloadedStrings
-- > import Text.Parsec.Prim
-- > parse myExtractor "interactive repl" someTaggedSentence
--
type Extractor = Parsec TaggedSentence ()
-- | Consume a token with the given POS Tag.
posTok :: Tag -> Extractor (Text, Tag)
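-- A hedged illustration of composing these extractors (detNoun is a
-- hypothetical name, the Tag values depend on your training corpus, and
-- T is Data.Text qualified, as in the module example above):
--
-- > detNoun :: Extractor (Text, Tag)
-- > detNoun = do
-- >   (det, _)  <- posTok $ Tag "DT"
-- >   (noun, t) <- posTok $ Tag "NN"
-- >   return (T.unwords [det, noun], t)
--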
-- | Consume a token with the specified POS prefix.
--
-- > parse (posPrefix "n") "ghci" [("Bob", Tag "np")]
-- Right [("Bob", Tag "np")]
--
posPrefix :: Text -> Extractor (Text, Tag)
-- | Text equality matching with optional case sensitivity.
matches :: CaseSensitive -> Text -> Text -> Bool
-- | Consume a token with the given lexical representation.
txtTok :: CaseSensitive -> Text -> Extractor (Text, Tag)
-- | Consume any one non-empty token.
anyToken :: Extractor (Text, Tag)
oneOf :: CaseSensitive -> [Text] -> Extractor (Text, Tag)
-- | Skips any number of fill tokens, ending with the end parser, and
-- returning the last parsed result.
--
-- This is useful when you know what you're looking for and (for
-- instance) don't care what comes first.
followedBy :: Extractor b -> Extractor a -> Extractor a
module NLP.Extraction.Examples.ParsecExamples
-- | Create a chunked tag from a set of incoming tagged tokens.
chunk :: [(Text, Tag)] -> Tag -> (Text, Tag)
prepPhrase :: Extractor (Text, Tag)
nounPhrase :: Extractor (Text, Tag)
clause :: Extractor (Text, Tag)
verbPhrase :: Extractor (Text, Tag)
module NLP.Corpora.Parsing
-- | Read a POS-tagged corpus out of a Text string of the form: "token/tag
-- token/tag..."
--
-- >>> readPOS "Dear/jj Sirs/nns :/: Let/vb"
-- [("Dear",JJ),("Sirs",NNS),(":",Other ":"),("Let",VB)]
--
readPOS :: Text -> TaggedSentence
-- | Returns all but the last element of a string, unless the string is
-- empty, in which case it returns that string.
safeInit :: Text -> Text
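-- For example (illustrative only, not a doctest from the package):
--
-- >>> safeInit "Dear."
-- "Dear"
--
-- >>> safeInit ""
-- ""
--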
-- | Averaged Perceptron Tagger
--
-- Adapted from the Python implementation found here:
--
--
module NLP.POS.AvgPerceptronTagger
-- | Create an Averaged Perceptron Tagger using the specified back-off
-- tagger as a fall-back, if one is specified.
--
-- This uses a tokenizer adapted from the tokenize package, and Erik
-- Kow's fullstop sentence segmenter
-- (http://hackage.haskell.org/package/fullstop) as a sentence
-- splitter.
mkTagger :: Perceptron -> Maybe POSTagger -> POSTagger
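-- A hedged usage sketch (perceptron stands for a model produced by trainNew,
-- train, or trainOnFiles below; Nothing means no back-off tagger):
--
-- > let posTagger = mkTagger perceptron Nothing
--
-- The resulting POSTagger can then be used with the functions in NLP.POS
-- (tag, tagStr, saveTagger, and so on).
--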
-- | Train a new Perceptron.
--
-- The training corpus should be a collection of sentences, one sentence
-- on each line, with each token tagged with a part of speech.
--
-- For example, the input:
--
-- "The/DT dog/NN jumped/VB ./.\nThe/DT cat/NN slept/VB ./."
--
-- defines two training sentences.
--
-- >>> tagger <- trainNew "Dear/jj Sirs/nns :/: Let/vb\nUs/nn begin/vb\n"
--
-- >>> tag tagger $ map T.words $ T.lines "Dear sir"
-- "Dear/jj Sirs/nns :/: Let/vb"
--
trainNew :: Text -> IO Perceptron
-- | Train a new Perceptron on a corpus of files.
trainOnFiles :: [FilePath] -> IO Perceptron
-- | Add training examples to a perceptron.
--
-- >>> tagger <- train emptyPerceptron "Dear/jj Sirs/nns :/: Let/vb\nUs/nn begin/vb\n"
--
-- >>> tag tagger $ map T.words $ T.lines "Dear sir"
-- "Dear/jj Sirs/nns :/: Let/vb"
--
-- If you're using multiple input files, this can be useful to improve
-- performance (by folding over the files). For example, see
-- trainOnFiles
train :: Perceptron -> Text -> IO Perceptron
-- | Train a model from sentences.
--
-- Ported from Python:
--
-- def train(self, sentences, save_loc=None, nr_iter=5):
--     self._make_tagdict(sentences)
--     self.model.classes = self.classes
--     prev, prev2 = START
--     for iter_ in range(nr_iter):
--         c = 0
--         n = 0
--         for words, tags in sentences:
--             context = START + [self._normalize(w) for w in words] + END
--             for i, word in enumerate(words):
--                 guess = self.tagdict.get(word)
--                 if not guess:
--                     feats = self._get_features(i, word, context, prev, prev2)
--                     guess = self.model.predict(feats)
--                     self.model.update(tags[i], guess, feats)
--                 prev2 = prev; prev = guess
--                 c += guess == tags[i]
--                 n += 1
--         random.shuffle(sentences)
--         logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
--     self.model.average_weights()
--     # Pickle as a binary file
--     if save_loc is not None:
--         pickle.dump((self.model.weights, self.tagdict, self.classes),
--                     open(save_loc, 'wb'), -1)
--     return None
--
trainInt :: Int -> Perceptron -> [TaggedSentence] -> IO Perceptron
-- | Tag a document (represented as a list of Sentences) with a
-- trained Perceptron
--
-- Ported from Python:
--
--
-- def tag(self, corpus, tokenize=True):
--     '''Tags a string `corpus`.'''
--     # Assume untokenized corpus has \n between sentences and ' ' between words
--     s_split = nltk.sent_tokenize if tokenize else lambda t: t.split('\n')
--     w_split = nltk.word_tokenize if tokenize else lambda s: s.split()
--     def split_sents(corpus):
--         for s in s_split(corpus):
--             yield w_split(s)
--     prev, prev2 = self.START
--     tokens = []
--     for words in split_sents(corpus):
--         context = self.START + [self._normalize(w) for w in words] + self.END
--         for i, word in enumerate(words):
--             tag = self.tagdict.get(word)
--             if not tag:
--                 features = self._get_features(i, word, context, prev, prev2)
--                 tag = self.model.predict(features)
--             tokens.append((word, tag))
--             prev2 = prev
--             prev = tag
--     return tokens
--
tag :: Perceptron -> [Sentence] -> [TaggedSentence]
-- | Tag a single sentence.
tagSentence :: Perceptron -> Sentence -> TaggedSentence
-- | An empty perceptron, used to start training.
emptyPerceptron :: Perceptron
taggerID :: ByteString
readTagger :: ByteString -> Maybe POSTagger -> Either String POSTagger
-- | This module aims to make tagging text with parts of speech trivially
-- easy.
--
-- If you're new to chatter and POS-tagging, then I suggest you
-- simply try:
--
--
-- >>> tagger <- defaultTagger
--
-- >>> tagStr tagger "This is a sample sentence."
-- "This/dt is/bez a/at sample/nn sentence/nn ./."
--
-- Note that we used tagStr, instead of tag, or
-- tagText. Many people don't (yet!) use Data.Text by
-- default, so there is a wrapper around tag that packs and
-- unpacks the String. This is inefficient, but it's just to get
-- you started, and tagStr can be very handy when you're debugging
-- a tagger in ghci (or cabal repl).
--
-- tag exposes more details of the tokenization and tagging, since
-- it returns a list of TaggedSentences, but it doesn't print
-- results as nicely.
module NLP.POS
-- | Tag a chunk of input text with part-of-speech tags, using the sentence
-- splitter, tokenizer, and tagger contained in the POSTagger.
tag :: POSTagger -> Text -> [TaggedSentence]
-- | Tag the tokens in a string.
--
-- Returns a space-separated string of tokens, each token suffixed with
-- the part of speech. For example:
--
-- >>> tagStr tagger "the dog jumped ."
-- "the/at dog/nn jumped/vbd ./."
--
tagStr :: POSTagger -> String -> String
-- | Text version of tagStr
tagText :: POSTagger -> Text -> Text
-- | Train a POSTagger on a corpus of sentences.
--
-- This will recurse through the POSTagger stack, training all the
-- backoff taggers as well. In order to do that, this function has to be
-- generic to the kind of taggers used, so it is not possible to train up
-- a new POSTagger from nothing: train wouldn't know what tagger
-- to create.
--
-- To get around that restriction, you can use the various
-- mkTagger implementations, such as mkTagger or
-- NLP.POS.AvgPerceptronTagger.mkTagger. For example:
--
-- import NLP.POS.AvgPerceptronTagger as APT
--
-- let newTagger = APT.mkTagger APT.emptyPerceptron Nothing
-- posTgr <- train newTagger trainingExamples
--
train :: POSTagger -> [TaggedSentence] -> IO POSTagger
-- | Train a tagger on string input in the standard form for POS tagged
-- corpora:
--
-- trainStr tagger "the/at dog/nn jumped/vbd ./."
--
trainStr :: POSTagger -> String -> IO POSTagger
-- | The Text version of trainStr
trainText :: POSTagger -> Text -> IO POSTagger
tagTokens :: POSTagger -> [Sentence] -> [TaggedSentence]
-- | Evaluate a POSTagger.
--
-- Measures accuracy over all tags in the test corpus.
--
-- Accuracy is calculated as:
--
-- |tokens tagged correctly| / |all tokens|
--
eval :: POSTagger -> [TaggedSentence] -> Double
serialize :: POSTagger -> ByteString
deserialize :: Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger) -> ByteString -> Either String POSTagger
-- | The default table of tagger IDs to readTagger functions. Each tagger
-- packaged with Chatter should have an entry here. By convention, the
-- IDs used are the fully qualified module name of the tagger package.
taggerTable :: Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger)
-- | Store a POSTagger to a file.
saveTagger :: POSTagger -> FilePath -> IO ()
-- | Load a tagger, using the internal taggerTable. If you need to
-- specify your own mappings for new composite taggers, you should use
-- deserialize.
--
-- This function checks the filename to determine if the content should
-- be decompressed. If the file ends with ".gz", then we assume it is a
-- gzipped model.
loadTagger :: FilePath -> IO POSTagger
defaultTagger :: IO POSTagger