-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A library of simple NLP algorithms. -- -- chatter is a collection of simple Natural Language Processing -- algorithms. -- -- Chatter supports: part-of-speech tagging (averaged perceptron, literal, and unambiguous taggers), simple information-extraction patterns built on Parsec, and TF-IDF document similarity. -- -- @package chatter @version 0.2.0.0
module Data.DefaultMap -- | Defaulting Map; a Map that returns a default value when queried for a key that does not exist. data DefaultMap k v DefMap :: v -> Map k v -> DefaultMap k v defDefault :: DefaultMap k v -> v defMap :: DefaultMap k v -> Map k v -- | Create an empty DefaultMap. empty :: v -> DefaultMap k v -- | Query the map for a value. Returns the default if the key is not found. lookup :: Ord k => k -> DefaultMap k v -> v -- | Create a DefaultMap from a default value and a list. fromList :: Ord k => v -> [(k, v)] -> DefaultMap k v -- | Access the keys as a list. keys :: DefaultMap k a -> [k] -- | Fold over the values in the map. -- -- Note that this does *not* fold over the default value -- this fold behaves in the same way as a standard foldl. foldl :: (a -> b -> a) -> a -> DefaultMap k b -> a instance (Ord k, Read k, Read v) => Read (DefaultMap k v) instance (Show k, Show v) => Show (DefaultMap k v) instance (Eq k, Eq v) => Eq (DefaultMap k v) instance (Ord k, Ord v) => Ord (DefaultMap k v) instance Generic (DefaultMap k v) instance Datatype D1DefaultMap instance Constructor C1_0DefaultMap instance Selector S1_0_0DefaultMap instance Selector S1_0_1DefaultMap instance (NFData k, NFData v, Ord k) => NFData (DefaultMap k v) instance (Ord k, Serialize k, Serialize v) => Serialize (DefaultMap k v)
-- | Utilities for reading mailman-style email archives. module NLP.Corpora.Email -- | Path to the directory containing all the PLUG archives. plugDataPath :: FilePath plugArchiveText :: IO [Text] plugArchiveTokens :: IO [[Text]] fullPlugArchive :: IO [Message] readF :: FilePath -> IO Text
module NLP.Types type Sentence = [Text] type TaggedSentence = [(Text, Tag)] flattenText :: TaggedSentence -> Text -- | True if the input sentence contains the given text token. Does not do partial or approximate matching, and compares tokens in a fully case-sensitive manner. contains :: TaggedSentence -> Text -> Bool -- | True if the input sentence contains the given POS tag. Does not do partial matching (such as prefix matching). containsTag :: TaggedSentence -> Tag -> Bool -- | Boolean type to indicate case sensitivity for textual comparisons. data CaseSensitive Sensitive :: CaseSensitive Insensitive :: CaseSensitive -- | Part-of-speech tagger, with backoff tagger. -- -- A sequence of POS taggers can be assembled by using backoff taggers. When tagging text, the first tagger is run on the input, possibly tagging some tokens as unknown ('Tag Unk'). The first backoff tagger is then recursively invoked on the text to fill in the unknown tags, but that may still leave some tokens marked with 'Tag Unk'. This process repeats until no more taggers are found. (The current implementation is not very efficient in this respect.) -- -- Backoff taggers are particularly useful when there is a set of domain-specific vernacular that a general-purpose statistical tagger does not know. A LiteralTagger can be created to map terms to fixed POS tags, delegating the bulk of the text to a statistical backoff tagger, such as an AvgPerceptronTagger.
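-- As a sketch of that composition (the functions used below are documented in this file; the domain term, its tag, and the use of OverloadedStrings string literals are illustrative assumptions):
--
--   {-# LANGUAGE OverloadedStrings #-}
--   import qualified Data.Map as M
--   import qualified NLP.POS.AvgPerceptronTagger as APT
--   import qualified NLP.POS.LiteralTagger as LT
--   import NLP.Types (CaseSensitive(..), parseTag)
--
--   -- A statistical tagger with no backoff of its own:
--   statTagger = APT.mkTagger APT.emptyPerceptron Nothing
--
--   -- A literal tagger that pins down domain vernacular, deferring
--   -- all other tokens to the perceptron:
--   domainTagger = LT.mkTagger (M.fromList [("Haskell", parseTag "nnp")])
--                              Sensitive (Just statTagger)
--   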
-- -- POSTagger values can be serialized and deserialized by using serialize and NLP.POS.deserialize. This is a bit tricky because the POSTagger abstracts away the implementation details of the particular tagging algorithm, and the model for that tagger (if any). To support serialization, each POSTagger value must provide a serialize value that can be used to generate a ByteString representation of the model, as well as a unique id (also a ByteString). Furthermore, that ID must be added to a Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger) that is provided to deserialize. The function in the map takes the output of posSerialize, and possibly a backoff tagger, and reconstitutes the POSTagger that was serialized (assigning the proper functions, setting up closures as needed, etc.). Look at the source for taggerTable and readTagger for examples. data POSTagger POSTagger :: ([Sentence] -> [TaggedSentence]) -> ([TaggedSentence] -> IO POSTagger) -> Maybe POSTagger -> (Text -> Sentence) -> (Text -> [Text]) -> ByteString -> ByteString -> POSTagger -- | The initial part-of-speech tagger. posTagger :: POSTagger -> [Sentence] -> [TaggedSentence] -- | Training function to train the immediate POS tagger. posTrainer :: POSTagger -> [TaggedSentence] -> IO POSTagger -- | A tagger to invoke on unknown tokens. posBackoff :: POSTagger -> Maybe POSTagger -- | A tokenizer (words will work). posTokenizer :: POSTagger -> Text -> Sentence -- | A sentence splitter. If your input is formatted as one sentence per line, then use lines, otherwise try Erik Kow's fullstop library. posSplitter :: POSTagger -> Text -> [Text] -- | Store this POS tagger to a bytestring. This does not serialize the backoff taggers. posSerialize :: POSTagger -> ByteString -- | A unique id that identifies the algorithm used for this POS tagger. This is used in deserialization. posID :: POSTagger -> ByteString -- | Remove the tags from a tagged sentence. stripTags :: TaggedSentence -> Sentence newtype Tag Tag :: Text -> Tag fromTag :: Tag -> Text parseTag :: Text -> Tag -- | Constant tag for "unknown". tagUNK :: Tag -- | Document corpus. -- -- This is a simple hashed corpus; the document content is not stored. data Corpus Corpus :: Int -> Map Text Int -> Corpus -- | The number of documents in the corpus. corpLength :: Corpus -> Int -- | A count of the number of documents each term occurred in. corpTermCounts :: Corpus -> Map Text Int -- | Get the number of documents that a term occurred in. termCounts :: Corpus -> Text -> Int -- | Add a document to the corpus. -- -- This can be dangerous if the documents are pre-processed differently. All corpus-related functions assume that the documents have all been tokenized and the tokens normalized, in the same way. addDocument :: Corpus -> [Text] -> Corpus
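-- A small illustration of building and querying a corpus with mkCorpus and termCounts (assumes OverloadedStrings; the counts follow directly from the definitions in this module):
--
--   >>> let corp = mkCorpus [["a", "dog"], ["a", "cat"]]
--   >>> termCounts corp "a"
--   2
--   >>> termCounts corp "dog"
--   1
--   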
-- | Create a corpus from a list of documents, represented by normalized tokens. mkCorpus :: [[Text]] -> Corpus addTerms :: Map Text Int -> Set Text -> Map Text Int addTerm :: Map Text Int -> Text -> Map Text Int instance Read CaseSensitive instance Show CaseSensitive instance Generic CaseSensitive instance Ord Tag instance Eq Tag instance Read Tag instance Show Tag instance Generic Tag instance Read Corpus instance Show Corpus instance Eq Corpus instance Ord Corpus instance Generic Corpus instance Datatype D1CaseSensitive instance Constructor C1_0CaseSensitive instance Constructor C1_1CaseSensitive instance Datatype D1Tag instance Constructor C1_0Tag instance Datatype D1Corpus instance Constructor C1_0Corpus instance Selector S1_0_0Corpus instance Selector S1_0_1Corpus instance Serialize Corpus instance NFData Corpus instance Serialize Text instance Serialize Tag
module NLP.POS.LiteralTagger tag :: Map Text Tag -> CaseSensitive -> [Sentence] -> [TaggedSentence] tagSentence :: Map Text Tag -> CaseSensitive -> Sentence -> TaggedSentence -- | Create a Literal Tagger using the specified backoff tagger as a fallback, if one is specified. -- -- This uses a tokenizer adapted from the tokenize package, and Erik Kow's fullstop sentence segmenter as a sentence splitter. mkTagger :: Map Text Tag -> CaseSensitive -> Maybe POSTagger -> POSTagger taggerID :: ByteString -- | Deserialization for Literal Taggers. The serialization logic is in the posSerialize record of the POSTagger created in mkTagger. readTagger :: ByteString -> Maybe POSTagger -> Either String POSTagger -- | Boolean type to indicate case sensitivity for textual comparisons. data CaseSensitive Sensitive :: CaseSensitive Insensitive :: CaseSensitive -- | Create a tokenizer that protects the provided terms (to tokenize multi-word terms). protectTerms :: [Text] -> CaseSensitive -> Tokenizer instance Serialize CaseSensitive
-- | This POS tagger deterministically tags tokens. However, if it ever sees multiple tags for the same token, it will forget the tag it has learned. This is useful for creating taggers that have very high precision, but very low recall. -- -- Unambiguous taggers are also useful when defined with a non-deterministic backoff tagger, such as an NLP.POS.AvgPerceptronTagger, since the high-confidence tags will be applied first, followed by the more non-deterministic results of the backoff tagger. module NLP.POS.UnambiguousTagger taggerID :: ByteString readTagger :: ByteString -> Maybe POSTagger -> Either String POSTagger -- | Create an unambiguous tagger, using the supplied Map as a source of tags. mkTagger :: Map Text Tag -> Maybe POSTagger -> POSTagger -- | Trainer method for unambiguous taggers. train :: Map Text Tag -> [TaggedSentence] -> Map Text Tag
-- | Averaged Perceptron implementation of part-of-speech tagging, adapted for Haskell from the Python implementation described in the blog post "A Good Part-of-Speech Tagger in about 200 Lines of Python"; the original Perceptron code can be found on GitHub. module NLP.POS.AvgPerceptron -- | The perceptron model. data Perceptron Perceptron :: Map Feature (Map Class Weight) -> Map (Feature, Class) Weight -> Map (Feature, Class) Int -> Int -> Perceptron -- | Each feature gets its own weight vector, so weights is a dict-of-dicts. weights :: Perceptron -> Map Feature (Map Class Weight) -- | The accumulated values, for the averaging. These will be keyed by feature/class tuples. totals :: Perceptron -> Map (Feature, Class) Weight -- | The last time the feature was changed, for the averaging.
Also keyed by feature/class tuples (tstamps is short for timestamps). tstamps :: Perceptron -> Map (Feature, Class) Int -- | Number of instances seen. instances :: Perceptron -> Int -- | The classes that the perceptron assigns are represented with a newtype-wrapped String. -- -- Eventually, I think this should become a typeclass, so the classes can be defined by the users of the Perceptron (such as custom POS tag ADTs, or more complex classes). newtype Class Class :: String -> Class -- | Typedef for doubles to make the code easier to read, and to make this simple to change if necessary. type Weight = Double newtype Feature Feat :: Text -> Feature -- | An empty perceptron, used to start training. emptyPerceptron :: Perceptron -- | Predict a class given a feature vector. -- -- Ported from python: -- --
--   def predict(self, features):
--       '''Dot-product the features and current weights and return the best label.'''
--       scores = defaultdict(float)
--       for feat, value in features.items():
--           if feat not in self.weights or value == 0:
--               continue
--           weights = self.weights[feat]
--           for label, weight in weights.items():
--               scores[label] += value * weight
--       # Do a secondary alphabetic sort, for stability
--       return max(self.classes, key=lambda label: (scores[label], label))
--   
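-- A rough Haskell analogue of the scoring loop above (an illustrative sketch, not this module's implementation; it omits the secondary alphabetic tie-break):
--
--   import qualified Data.Map as Map
--
--   scores :: Perceptron -> Map Feature Int -> Map Class Weight
--   scores p = Map.foldlWithKey addFeature Map.empty
--     where
--       addFeature acc feat value
--         | value == 0 = acc
--         | otherwise  = case Map.lookup feat (weights p) of
--             Nothing -> acc
--             Just ws -> Map.unionWith (+) acc
--                          (Map.map (* fromIntegral value) ws)
--   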
predict :: Perceptron -> Map Feature Int -> Maybe Class train :: Int -> Perceptron -> [(Map Feature Int, Class)] -> IO Perceptron -- | Update the perceptron with a new example. -- --
--   update(self, truth, guess, features)
--      ...
--           self.i += 1
--           if truth == guess:
--               return None
--           for f in features:
--               weights = self.weights.setdefault(f, {})  # like Map.findWithDefault, but destructive: it also inserts the default
--               upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
--               upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
--           return None
--   
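-- The same control flow in Haskell (an illustrative sketch; updFeat is a hypothetical helper playing the role of upd_feat above, adjusting one (Feature, Class) weight along with its totals and timestamp):
--
--   update' :: Perceptron -> Class -> Class -> [Feature] -> Perceptron
--   update' p truth guess feats
--     | truth == guess = p'
--     | otherwise      = foldl step p' feats
--     where
--       p' = p { instances = instances p + 1 }
--       step per f = updFeat guess f (-1) (updFeat truth f 1 per)
--   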
update :: Perceptron -> Class -> Class -> [Feature] -> Perceptron -- | Average the weights -- -- Ported from Python: -- --
--   def average_weights(self):
--       for feat, weights in self.weights.items():
--           new_feat_weights = {}
--           for clas, weight in weights.items():
--               param = (feat, clas)
--               total = self._totals[param]
--               total += (self.i - self._tstamps[param]) * weight
--               averaged = round(total / float(self.i), 3)
--               if averaged:
--                   new_feat_weights[clas] = averaged
--           self.weights[feat] = new_feat_weights
--       return None
--   
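-- The per-entry computation in Haskell (an illustrative sketch of the averaging and three-decimal rounding above; average_weights then drops entries that round to zero):
--
--   import qualified Data.Map as Map
--
--   averagedWeight :: Perceptron -> Feature -> Class -> Weight -> Weight
--   averagedWeight p feat clas w =
--     let stamp = Map.findWithDefault 0 (feat, clas) (tstamps p)
--         total = Map.findWithDefault 0 (feat, clas) (totals p)
--               + fromIntegral (instances p - stamp) * w
--     in fromIntegral (round (total / fromIntegral (instances p) * 1000) :: Integer) / 1000
--   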
averageWeights :: Perceptron -> Perceptron instance Read Feature instance Show Feature instance Eq Feature instance Ord Feature instance Generic Feature instance Read Class instance Show Class instance Eq Class instance Ord Class instance Generic Class instance Read Perceptron instance Show Perceptron instance Eq Perceptron instance Generic Perceptron instance Datatype D1Feature instance Constructor C1_0Feature instance Datatype D1Class instance Constructor C1_0Class instance Datatype D1Perceptron instance Constructor C1_0Perceptron instance Selector S1_0_0Perceptron instance Selector S1_0_1Perceptron instance Selector S1_0_2Perceptron instance Selector S1_0_3Perceptron instance NFData Perceptron instance Serialize Perceptron instance Serialize Class instance Serialize Feature
module NLP.Similarity.VectorSim -- | An efficient(ish) representation for documents in the "bag of words" sense. type TermVector = DefaultMap Text Double -- | Generate a TermVector from a tokenized document. mkVector :: Corpus -> [Text] -> TermVector -- | Invokes similarity on full strings, using words for tokenization, and no stemming. -- -- There *must* be at least one document in the corpus. sim :: Corpus -> Text -> Text -> Double -- | Determine how similar two documents are. -- -- This function assumes that each document has been tokenized and (if desired) stemmed/case-normalized. -- -- This is a wrapper around tvSim, which is a *much* more efficient implementation. If you need to run similarity against any single document more than once, then you should create TermVectors for each of your documents and use tvSim instead of similarity. -- -- There *must* be at least one document in the corpus. similarity :: Corpus -> [Text] -> [Text] -> Double -- | Determine how similar two documents are. -- -- Calculates the similarity between two documents, represented as TermVectors. tvSim :: TermVector -> TermVector -> Double -- | Return the raw frequency of a term in a body of text. -- -- The first argument is the term to find, the second is a tokenized document. This function does not do any stemming or additional text modification. tf :: Eq a => a -> [a] -> Int -- | Calculate the inverse document frequency. -- -- The IDF is, roughly speaking, a measure of how rare a term is across the corpus: the fewer documents a term appears in, the higher its IDF. idf :: Text -> Corpus -> Double -- | Calculate the tf*idf measure for a term given a document and a corpus. tf_idf :: Text -> [Text] -> Corpus -> Double cosVec :: TermVector -> TermVector -> Double -- | Calculate the magnitude of a vector. magnitude :: TermVector -> Double -- | Find the dot product of two vectors. dotProd :: TermVector -> TermVector -> Double
-- | This is a very simple wrapper around Parsec for writing Information Extraction patterns. -- -- Because the particular tags/tokens to parse depend on the training corpus (for POS tagging) and the domain, this module only provides basic extractors. You can, for example, create an extractor to find noun phrases by combining the components provided here: -- --
--   nounPhrase :: Extractor (Text, Tag)
--   nounPhrase = do
--     nlist <- many1 (try (posTok $ Tag "NN")
--                     <|> try (posTok $ Tag "DT")
--                     <|> (posTok $ Tag "JJ"))
--     let term = T.intercalate " " (map fst nlist)
--     return (term, Tag "n-phr")
--   
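-- Running that extractor over a tagged sentence might look like this (a sketch: the input and result shown follow from the definitions above, but are not a doctest from this package):
--
--   >>> parse nounPhrase "ghci" [("The", Tag "DT"), ("dog", Tag "NN"), ("barked", Tag "VBD")]
--   Right ("The dog",Tag "n-phr")
--   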
module NLP.Extraction.Parsec -- | A Parsec parser. -- -- Example usage: -- --
--   > :set -XOverloadedStrings
--   > import Text.Parsec.Prim
--   > parse myExtractor "interactive repl" someTaggedSentence
--   
type Extractor = Parsec TaggedSentence () -- | Consume a token with the given POS Tag posTok :: Tag -> Extractor (Text, Tag) -- | Consume a token with the specified POS prefix. -- --
--   >>> parse (posPrefix "n") "ghci" [("Bob", Tag "np")]
--   Right ("Bob",Tag "np")
--   
posPrefix :: Text -> Extractor (Text, Tag) -- | Text equality matching with optional case sensitivity. matches :: CaseSensitive -> Text -> Text -> Bool -- | Consume a token with the given lexical representation. txtTok :: CaseSensitive -> Text -> Extractor (Text, Tag) -- | Consume any one non-empty token. anyToken :: Extractor (Text, Tag) oneOf :: CaseSensitive -> [Text] -> Extractor (Text, Tag) -- | Skips any number of fill tokens, ending with the end parser, and returning the last parsed result. -- -- This is useful when you know what you're looking for and (for instance) don't care what comes first. followedBy :: Extractor b -> Extractor a -> Extractor a
module NLP.Corpora.Parsing -- | Read a POS-tagged corpus out of a Text string of the form: "token/tag -- token/tag..." -- --
--   >>> readPOS "Dear/jj Sirs/nns :/: Let/vb"
--   [("Dear",JJ),("Sirs",NNS),(":",Other ":"),("Let",VB)]
--   
readPOS :: Text -> TaggedSentence -- | Returns all but the last element of a string, unless the string is empty, in which case it returns that string. safeInit :: Text -> Text
-- | Averaged Perceptron Tagger, adapted from the Python implementation referenced in NLP.POS.AvgPerceptron. module NLP.POS.AvgPerceptronTagger -- | Create an Averaged Perceptron Tagger using the specified backoff tagger as a fallback, if one is specified. -- -- This uses a tokenizer adapted from the tokenize package, and Erik Kow's fullstop sentence segmenter (http://hackage.haskell.org/package/fullstop) as a sentence splitter. mkTagger :: Perceptron -> Maybe POSTagger -> POSTagger -- | Train a new Perceptron. -- -- The training corpus should be a collection of sentences, one sentence per line, with each token tagged with a part of speech. -- -- For example, the input: -- --
--   "The/DT dog/NN jumped/VB ./.\nThe/DT cat/NN slept/VB ./."
--   
-- -- defines two training sentences. -- --
--   >>> tagger <- trainNew "Dear/jj Sirs/nns :/: Let/vb\nUs/nn begin/vb\n"
--   
--   >>> tag tagger $ map T.words $ T.lines "Dear sir"
--   "Dear/jj Sirs/nns :/: Let/vb"
--   
trainNew :: Text -> IO Perceptron -- | Train a new Perceptron on a corpus of files. trainOnFiles :: [FilePath] -> IO Perceptron -- | Add training examples to a perceptron. -- --
--   >>> tagger <- train emptyPerceptron "Dear/jj Sirs/nns :/: Let/vb\nUs/nn begin/vb\n"
--   
--   >>> tag tagger $ map T.words $ T.lines "Dear sir"
--   "Dear/jj Sirs/nns :/: Let/vb"
--   
-- -- If you're using multiple input files, this can be useful to improve performance (by folding over the files). For example, see trainOnFiles. train :: Perceptron -> Text -> IO Perceptron -- | Train a model from sentences. -- -- Ported from Python: -- --
--   def train(self, sentences, save_loc=None, nr_iter=5):
--       self._make_tagdict(sentences)
--       self.model.classes = self.classes
--       prev, prev2 = START
--       for iter_ in range(nr_iter):
--           c = 0
--           n = 0
--           for words, tags in sentences:
--               context = START + [self._normalize(w) for w in words] + END
--               for i, word in enumerate(words):
--                   guess = self.tagdict.get(word)
--                   if not guess:
--                       feats = self._get_features(i, word, context, prev, prev2)
--                       guess = self.model.predict(feats)
--                       self.model.update(tags[i], guess, feats)
--                   prev2 = prev; prev = guess
--                   c += guess == tags[i]
--                   n += 1
--           random.shuffle(sentences)
--           logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
--       self.model.average_weights()
--       # Pickle as a binary file
--       if save_loc is not None:
--           pickle.dump((self.model.weights, self.tagdict, self.classes),
--                        open(save_loc, 'wb'), -1)
--       return None
--   
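-- A condensed Haskell view of one pass of that training loop (an illustrative sketch: toFeatures is a hypothetical stand-in for _get_features, and the prev/prev2 context threading and tag-dictionary shortcut are omitted):
--
--   import Data.Maybe (fromMaybe)
--   import qualified Data.Map as Map
--   import qualified Data.Text as T
--
--   epoch :: Perceptron -> [TaggedSentence] -> Perceptron
--   epoch = foldl (\per sent -> foldl step per sent)
--     where
--       step per (token, Tag t) =
--         let feats = toFeatures token  -- hypothetical :: Map Feature Int
--             truth = Class (T.unpack t)
--             guess = fromMaybe truth (predict per feats)
--         in update per truth guess (Map.keys feats)
--   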
trainInt :: Int -> Perceptron -> [TaggedSentence] -> IO Perceptron -- | Tag a document (represented as a list of Sentences) with a trained Perceptron. -- -- Ported from Python: -- --
--   def tag(self, corpus, tokenize=True):
--       '''Tags a string `corpus`.'''
--       # Assume untokenized corpus has \n between sentences and ' ' between words
--       s_split = nltk.sent_tokenize if tokenize else lambda t: t.split('\n')
--       w_split = nltk.word_tokenize if tokenize else lambda s: s.split()
--       def split_sents(corpus):
--           for s in s_split(corpus):
--               yield w_split(s)
--       prev, prev2 = self.START
--       tokens = []
--       for words in split_sents(corpus):
--           context = self.START + [self._normalize(w) for w in words] + self.END
--           for i, word in enumerate(words):
--               tag = self.tagdict.get(word)
--               if not tag:
--                   features = self._get_features(i, word, context, prev, prev2)
--                   tag = self.model.predict(features)
--               tokens.append((word, tag))
--               prev2 = prev
--               prev = tag
--       return tokens
--   
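-- The same greedy walk over a single sentence in Haskell (an illustrative sketch: toFeatures is again a hypothetical feature extractor, and the context threading and tag-dictionary shortcut are omitted):
--
--   import Data.Maybe (fromMaybe)
--   import qualified Data.Text as T
--
--   tagGreedy :: Perceptron -> Sentence -> TaggedSentence
--   tagGreedy per = map tagWord
--     where
--       tagWord w =
--         let Class c = fromMaybe (Class "Unk") (predict per (toFeatures w))
--         in (w, Tag (T.pack c))
--   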
tag :: Perceptron -> [Sentence] -> [TaggedSentence] -- | Tag a single sentence. tagSentence :: Perceptron -> Sentence -> TaggedSentence -- | An empty perceptron, used to start training. emptyPerceptron :: Perceptron taggerID :: ByteString readTagger :: ByteString -> Maybe POSTagger -> Either String POSTagger -- | This module aims to make tagging text with parts of speech trivially -- easy. -- -- If you're new to chatter and POS-tagging, then I suggest you -- simply try: -- --
--   >>> tagger <- defaultTagger
--   
--   >>> tagStr tagger "This is a sample sentence."
--   "This/dt is/bez a/at sample/nn sentence/nn ./."
--   
-- -- Note that we used tagStr, instead of tag or tagText. Many people don't (yet!) use Data.Text by default, so there is a wrapper around tag that packs and unpacks the String. This is inefficient, but it's just to get you started, and tagStr can be very handy when you're debugging a tagger in ghci (or cabal repl). -- -- tag exposes more details of the tokenization and tagging, since it returns a list of TaggedSentences, but it doesn't print results as nicely. module NLP.POS -- | Tag a chunk of input text with part-of-speech tags, using the sentence splitter, tokenizer, and tagger contained in the POSTagger. tag :: POSTagger -> Text -> [TaggedSentence] -- | Tag the tokens in a string. -- -- Returns a space-separated string of tokens, each token suffixed with the part of speech. For example: -- --
--   >>> tagStr tagger "the dog jumped ."
--   "the/at dog/nn jumped/vbd ./."
--   
tagStr :: POSTagger -> String -> String -- | Text version of tagStr. tagText :: POSTagger -> Text -> Text -- | Train a POSTagger on a corpus of sentences. -- -- This will recurse through the POSTagger stack, training all the backoff taggers as well. In order to do that, this function has to be generic to the kind of taggers used, so it is not possible to train up a new POSTagger from nothing: train wouldn't know what tagger to create. -- -- To get around that restriction, you can use the various mkTagger implementations, such as NLP.POS.LiteralTagger.mkTagger or NLP.POS.AvgPerceptronTagger.mkTagger. For example: -- --
--   import NLP.POS.AvgPerceptronTagger as APT
--   
--   let newTagger = APT.mkTagger APT.emptyPerceptron Nothing
--   posTgr <- train newTagger trainingExamples
--   
train :: POSTagger -> [TaggedSentence] -> IO POSTagger -- | Train a tagger on string input in the standard form for POS tagged -- corpora: -- --
--   trainStr tagger "the/at dog/nn jumped/vbd ./."
--   
trainStr :: POSTagger -> String -> IO POSTagger -- | The Text version of trainStr. trainText :: POSTagger -> Text -> IO POSTagger tagTokens :: POSTagger -> [Sentence] -> [TaggedSentence] -- | Evaluate a POSTagger. -- -- Measures accuracy over all tags in the test corpus. -- -- Accuracy is calculated as: -- --
--   |tokens tagged correctly| / |all tokens|
--   
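-- For example, a tagger that labels 9 of 10 test tokens correctly evaluates to 0.9.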
eval :: POSTagger -> [TaggedSentence] -> Double serialize :: POSTagger -> ByteString deserialize :: Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger) -> ByteString -> Either String POSTagger -- | The default table of tagger IDs to readTagger functions. Each tagger packaged with Chatter should have an entry here. By convention, the IDs used are the fully qualified module name of the tagger package. taggerTable :: Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger) -- | Store a POSTagger to a file. saveTagger :: POSTagger -> FilePath -> IO () -- | Load a tagger, using the internal taggerTable. If you need to specify your own mappings for new composite taggers, you should use deserialize. -- -- This function checks the filename to determine if the content should be decompressed. If the file ends with ".gz", then we assume it is a gzipped model. loadTagger :: FilePath -> IO POSTagger defaultTagger :: IO POSTagger
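-- A round-trip sketch using the persistence functions above (the model file name is hypothetical):
--
--   >>> tagger <- defaultTagger
--   >>> saveTagger tagger "tagger.model"
--   >>> tagger' <- loadTagger "tagger.model"
--   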