-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A library of simple NLP algorithms. -- -- chatter is a collection of simple Natural Language Processing -- algorithms. -- -- Chatter supports: part-of-speech tagging (averaged perceptron, literal, and unambiguous taggers), simple information-extraction patterns built on Parsec, and TF-IDF document similarity. -- -- @package chatter @version 0.2.0.0
module Data.DefaultMap -- | Defaulting Map; a Map that returns a default value when queried for a key that does not exist. data DefaultMap k v DefMap :: v -> Map k v -> DefaultMap k v defDefault :: DefaultMap k v -> v defMap :: DefaultMap k v -> Map k v -- | Create an empty DefaultMap. empty :: v -> DefaultMap k v -- | Query the map for a value. Returns the default if the key is not found. lookup :: Ord k => k -> DefaultMap k v -> v -- | Create a DefaultMap from a default value and a list. fromList :: Ord k => v -> [(k, v)] -> DefaultMap k v -- | Access the keys as a list. keys :: DefaultMap k a -> [k] -- | Fold over the values in the map. -- -- Note that this does *not* fold over the default value -- this fold behaves in the same way as a standard foldl. foldl :: (a -> b -> a) -> a -> DefaultMap k b -> a instance (Ord k, Read k, Read v) => Read (DefaultMap k v) instance (Show k, Show v) => Show (DefaultMap k v) instance (Eq k, Eq v) => Eq (DefaultMap k v) instance (Ord k, Ord v) => Ord (DefaultMap k v) instance Generic (DefaultMap k v) instance Datatype D1DefaultMap instance Constructor C1_0DefaultMap instance Selector S1_0_0DefaultMap instance Selector S1_0_1DefaultMap instance (NFData k, NFData v, Ord k) => NFData (DefaultMap k v) instance (Ord k, Serialize k, Serialize v) => Serialize (DefaultMap k v)
-- | Utilities for reading mailman-style email archives. module NLP.Corpora.Email -- | Path to the directory containing all the PLUG archives. plugDataPath :: FilePath plugArchiveText :: IO [Text] plugArchiveTokens :: IO [[Text]] fullPlugArchive :: IO [Message] readF :: FilePath -> IO Text
module NLP.Types type Sentence = [Text] type TaggedSentence = [(Text, Tag)] flattenText :: TaggedSentence -> Text -- | True if the input sentence contains the given text token. Does not do partial or approximate matching, and compares tokens in a fully case-sensitive manner. contains :: TaggedSentence -> Text -> Bool -- | True if the input sentence contains the given POS tag. Does not do partial matching (such as prefix matching). containsTag :: TaggedSentence -> Tag -> Bool -- | Boolean type to indicate case sensitivity for textual comparisons. data CaseSensitive Sensitive :: CaseSensitive Insensitive :: CaseSensitive -- | Part-of-speech tagger, with backoff tagger. -- -- A sequence of POS taggers can be assembled by using backoff taggers. When tagging text, the first tagger is run on the input, possibly tagging some tokens as unknown ('Tag Unk'). The first backoff tagger is then recursively invoked on the text to fill in the unknown tags, but that may still leave some tokens marked with 'Tag Unk'. This process repeats until no more taggers are found. (The current implementation is not very efficient in this respect.) -- -- Backoff taggers are particularly useful when there is a set of domain-specific vernacular that a general-purpose statistical tagger does not know. A LiteralTagger can be created to map terms to fixed POS tags, delegating the bulk of the text to a statistical backoff tagger, such as an AvgPerceptronTagger.
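-- As a sketch of that composition (the functions used below are documented in this file; the domain term, its tag, and the use of OverloadedStrings string literals are illustrative assumptions):
--
--   {-# LANGUAGE OverloadedStrings #-}
--   import qualified Data.Map as M
--   import qualified NLP.POS.AvgPerceptronTagger as APT
--   import qualified NLP.POS.LiteralTagger as LT
--   import NLP.Types (CaseSensitive(..), parseTag)
--
--   -- A statistical tagger with no backoff of its own:
--   statTagger = APT.mkTagger APT.emptyPerceptron Nothing
--
--   -- A literal tagger that pins down domain vernacular, deferring
--   -- all other tokens to the perceptron:
--   domainTagger = LT.mkTagger (M.fromList [("Haskell", parseTag "nnp")])
--                              Sensitive (Just statTagger)
--   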
-- -- POSTagger values can be serialized and deserialized by using serialize and NLP.POS.deserialize. This is a bit tricky because the POSTagger abstracts away the implementation details of the particular tagging algorithm, and the model for that tagger (if any). To support serialization, each POSTagger value must provide a serialize value that can be used to generate a ByteString representation of the model, as well as a unique id (also a ByteString). Furthermore, that ID must be added to a Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger) that is provided to deserialize. The function in the map takes the output of posSerialize, and possibly a backoff tagger, and reconstitutes the POSTagger that was serialized (assigning the proper functions, setting up closures as needed, etc.). Look at the source for taggerTable and readTagger for examples. data POSTagger POSTagger :: ([Sentence] -> [TaggedSentence]) -> ([TaggedSentence] -> IO POSTagger) -> Maybe POSTagger -> (Text -> Sentence) -> (Text -> [Text]) -> ByteString -> ByteString -> POSTagger -- | The initial part-of-speech tagger. posTagger :: POSTagger -> [Sentence] -> [TaggedSentence] -- | Training function to train the immediate POS tagger. posTrainer :: POSTagger -> [TaggedSentence] -> IO POSTagger -- | A tagger to invoke on unknown tokens. posBackoff :: POSTagger -> Maybe POSTagger -- | A tokenizer (words will work). posTokenizer :: POSTagger -> Text -> Sentence -- | A sentence splitter. If your input is formatted as one sentence per line, then use lines, otherwise try Erik Kow's fullstop library. posSplitter :: POSTagger -> Text -> [Text] -- | Store this POS tagger to a bytestring. This does not serialize the backoff taggers. posSerialize :: POSTagger -> ByteString -- | A unique id that identifies the algorithm used for this POS tagger. This is used in deserialization. posID :: POSTagger -> ByteString -- | Remove the tags from a tagged sentence. stripTags :: TaggedSentence -> Sentence newtype Tag Tag :: Text -> Tag fromTag :: Tag -> Text parseTag :: Text -> Tag -- | Constant tag for "unknown". tagUNK :: Tag -- | Document corpus. -- -- This is a simple hashed corpus; the document content is not stored. data Corpus Corpus :: Int -> Map Text Int -> Corpus -- | The number of documents in the corpus. corpLength :: Corpus -> Int -- | A count of the number of documents each term occurred in. corpTermCounts :: Corpus -> Map Text Int -- | Get the number of documents that a term occurred in. termCounts :: Corpus -> Text -> Int -- | Add a document to the corpus. -- -- This can be dangerous if the documents are pre-processed differently. All corpus-related functions assume that the documents have all been tokenized and the tokens normalized, in the same way. addDocument :: Corpus -> [Text] -> Corpus
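-- A small illustration of building and querying a corpus with mkCorpus and termCounts (assumes OverloadedStrings; the counts follow directly from the definitions in this module):
--
--   >>> let corp = mkCorpus [["a", "dog"], ["a", "cat"]]
--   >>> termCounts corp "a"
--   2
--   >>> termCounts corp "dog"
--   1
--   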
-- | Create a corpus from a list of documents, represented by normalized tokens. mkCorpus :: [[Text]] -> Corpus addTerms :: Map Text Int -> Set Text -> Map Text Int addTerm :: Map Text Int -> Text -> Map Text Int instance Read CaseSensitive instance Show CaseSensitive instance Generic CaseSensitive instance Ord Tag instance Eq Tag instance Read Tag instance Show Tag instance Generic Tag instance Read Corpus instance Show Corpus instance Eq Corpus instance Ord Corpus instance Generic Corpus instance Datatype D1CaseSensitive instance Constructor C1_0CaseSensitive instance Constructor C1_1CaseSensitive instance Datatype D1Tag instance Constructor C1_0Tag instance Datatype D1Corpus instance Constructor C1_0Corpus instance Selector S1_0_0Corpus instance Selector S1_0_1Corpus instance Serialize Corpus instance NFData Corpus instance Serialize Text instance Serialize Tag
module NLP.POS.LiteralTagger tag :: Map Text Tag -> CaseSensitive -> [Sentence] -> [TaggedSentence] tagSentence :: Map Text Tag -> CaseSensitive -> Sentence -> TaggedSentence -- | Create a Literal Tagger using the specified backoff tagger as a fallback, if one is specified. -- -- This uses a tokenizer adapted from the tokenize package, and Erik Kow's fullstop sentence segmenter as a sentence splitter. mkTagger :: Map Text Tag -> CaseSensitive -> Maybe POSTagger -> POSTagger taggerID :: ByteString -- | Deserialization for Literal Taggers. The serialization logic is in the posSerialize record of the POSTagger created in mkTagger. readTagger :: ByteString -> Maybe POSTagger -> Either String POSTagger -- | Boolean type to indicate case sensitivity for textual comparisons. data CaseSensitive Sensitive :: CaseSensitive Insensitive :: CaseSensitive -- | Create a tokenizer that protects the provided terms (to tokenize multi-word terms). protectTerms :: [Text] -> CaseSensitive -> Tokenizer instance Serialize CaseSensitive
-- | This POS tagger deterministically tags tokens. However, if it ever sees multiple tags for the same token, it will forget the tag it has learned. This is useful for creating taggers that have very high precision, but very low recall. -- -- Unambiguous taggers are also useful when defined with a non-deterministic backoff tagger, such as an NLP.POS.AvgPerceptronTagger, since the high-confidence tags will be applied first, followed by the more non-deterministic results of the backoff tagger. module NLP.POS.UnambiguousTagger taggerID :: ByteString readTagger :: ByteString -> Maybe POSTagger -> Either String POSTagger -- | Create an unambiguous tagger, using the supplied Map as a source of tags. mkTagger :: Map Text Tag -> Maybe POSTagger -> POSTagger -- | Trainer method for unambiguous taggers. train :: Map Text Tag -> [TaggedSentence] -> Map Text Tag
-- | Averaged Perceptron implementation of part-of-speech tagging, adapted for Haskell from the Python implementation described in the blog post "A Good Part-of-Speech Tagger in about 200 Lines of Python"; the original Perceptron code can be found on GitHub. module NLP.POS.AvgPerceptron -- | The perceptron model. data Perceptron Perceptron :: Map Feature (Map Class Weight) -> Map (Feature, Class) Weight -> Map (Feature, Class) Int -> Int -> Perceptron -- | Each feature gets its own weight vector, so weights is a dict-of-dicts. weights :: Perceptron -> Map Feature (Map Class Weight) -- | The accumulated values, for the averaging. These will be keyed by feature/class tuples. totals :: Perceptron -> Map (Feature, Class) Weight -- | The last time the feature was changed, for the averaging.
Also keyed by feature/class tuples (tstamps is short for timestamps). tstamps :: Perceptron -> Map (Feature, Class) Int -- | Number of instances seen. instances :: Perceptron -> Int -- | The classes that the perceptron assigns are represented with a newtype-wrapped String. -- -- Eventually, I think this should become a typeclass, so the classes can be defined by the users of the Perceptron (such as custom POS tag ADTs, or more complex classes). newtype Class Class :: String -> Class -- | Typedef for doubles to make the code easier to read, and to make this simple to change if necessary. type Weight = Double newtype Feature Feat :: Text -> Feature -- | An empty perceptron, used to start training. emptyPerceptron :: Perceptron -- | Predict a class given a feature vector. -- -- Ported from python: -- --
--   def predict(self, features):
--       '''Dot-product the features and current weights and return the best label.'''
--       scores = defaultdict(float)
--       for feat, value in features.items():
--           if feat not in self.weights or value == 0:
--               continue
--           weights = self.weights[feat]
--           for label, weight in weights.items():
--               scores[label] += value * weight
--       # Do a secondary alphabetic sort, for stability
--       return max(self.classes, key=lambda label: (scores[label], label))
--   
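-- A rough Haskell analogue of the scoring loop above (an illustrative sketch, not this module's implementation; it omits the secondary alphabetic tie-break):
--
--   import qualified Data.Map as Map
--
--   scores :: Perceptron -> Map Feature Int -> Map Class Weight
--   scores p = Map.foldlWithKey addFeature Map.empty
--     where
--       addFeature acc feat value
--         | value == 0 = acc
--         | otherwise  = case Map.lookup feat (weights p) of
--             Nothing -> acc
--             Just ws -> Map.unionWith (+) acc
--                          (Map.map (* fromIntegral value) ws)
--   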
predict :: Perceptron -> Map Feature Int -> Maybe Class train :: Int -> Perceptron -> [(Map Feature Int, Class)] -> IO Perceptron -- | Update the perceptron with a new example. -- --
--   update(self, truth, guess, features)
--      ...
--           self.i += 1
--           if truth == guess:
--               return None
--           for f in features:
--               weights = self.weights.setdefault(f, {})  # like Map.findWithDefault, but destructive: it also inserts the default
--               upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
--               upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
--           return None
--   
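-- The same control flow in Haskell (an illustrative sketch; updFeat is a hypothetical helper playing the role of upd_feat above, adjusting one (Feature, Class) weight along with its totals and timestamp):
--
--   update' :: Perceptron -> Class -> Class -> [Feature] -> Perceptron
--   update' p truth guess feats
--     | truth == guess = p'
--     | otherwise      = foldl step p' feats
--     where
--       p' = p { instances = instances p + 1 }
--       step per f = updFeat guess f (-1) (updFeat truth f 1 per)
--   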
update :: Perceptron -> Class -> Class -> [Feature] -> Perceptron -- | Average the weights -- -- Ported from Python: -- --
--   def average_weights(self):
--       for feat, weights in self.weights.items():
--           new_feat_weights = {}
--           for clas, weight in weights.items():
--               param = (feat, clas)
--               total = self._totals[param]
--               total += (self.i - self._tstamps[param]) * weight
--               averaged = round(total / float(self.i), 3)
--               if averaged:
--                   new_feat_weights[clas] = averaged
--           self.weights[feat] = new_feat_weights
--       return None
--   
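-- The per-entry computation in Haskell (an illustrative sketch of the averaging and three-decimal rounding above; average_weights then drops entries that round to zero):
--
--   import qualified Data.Map as Map
--
--   averagedWeight :: Perceptron -> Feature -> Class -> Weight -> Weight
--   averagedWeight p feat clas w =
--     let stamp = Map.findWithDefault 0 (feat, clas) (tstamps p)
--         total = Map.findWithDefault 0 (feat, clas) (totals p)
--               + fromIntegral (instances p - stamp) * w
--     in fromIntegral (round (total / fromIntegral (instances p) * 1000) :: Integer) / 1000
--   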
averageWeights :: Perceptron -> Perceptron instance Read Feature instance Show Feature instance Eq Feature instance Ord Feature instance Generic Feature instance Read Class instance Show Class instance Eq Class instance Ord Class instance Generic Class instance Read Perceptron instance Show Perceptron instance Eq Perceptron instance Generic Perceptron instance Datatype D1Feature instance Constructor C1_0Feature instance Datatype D1Class instance Constructor C1_0Class instance Datatype D1Perceptron instance Constructor C1_0Perceptron instance Selector S1_0_0Perceptron instance Selector S1_0_1Perceptron instance Selector S1_0_2Perceptron instance Selector S1_0_3Perceptron instance NFData Perceptron instance Serialize Perceptron instance Serialize Class instance Serialize Feature
module NLP.Similarity.VectorSim -- | An efficient(ish) representation for documents in the "bag of words" sense. type TermVector = DefaultMap Text Double -- | Generate a TermVector from a tokenized document. mkVector :: Corpus -> [Text] -> TermVector -- | Invokes similarity on full strings, using words for tokenization, and no stemming. -- -- There *must* be at least one document in the corpus. sim :: Corpus -> Text -> Text -> Double -- | Determine how similar two documents are. -- -- This function assumes that each document has been tokenized and (if desired) stemmed/case-normalized. -- -- This is a wrapper around tvSim, which is a *much* more efficient implementation. If you need to run similarity against any single document more than once, then you should create TermVectors for each of your documents and use tvSim instead of similarity. -- -- There *must* be at least one document in the corpus. similarity :: Corpus -> [Text] -> [Text] -> Double -- | Determine how similar two documents are. -- -- Calculates the similarity between two documents, represented as TermVectors. tvSim :: TermVector -> TermVector -> Double -- | Return the raw frequency of a term in a body of text. -- -- The first argument is the term to find, the second is a tokenized document. This function does not do any stemming or additional text modification. tf :: Eq a => a -> [a] -> Int -- | Calculate the inverse document frequency. -- -- The IDF is, roughly speaking, a measure of how rare a term is across the corpus: the fewer documents a term appears in, the higher its IDF. idf :: Text -> Corpus -> Double -- | Calculate the tf*idf measure for a term given a document and a corpus. tf_idf :: Text -> [Text] -> Corpus -> Double cosVec :: TermVector -> TermVector -> Double -- | Calculate the magnitude of a vector. magnitude :: TermVector -> Double -- | Find the dot product of two vectors. dotProd :: TermVector -> TermVector -> Double
-- | This is a very simple wrapper around Parsec for writing Information Extraction patterns. -- -- Because the particular tags/tokens to parse depend on the training corpus (for POS tagging) and the domain, this module only provides basic extractors. You can, for example, create an extractor to find noun phrases by combining the components provided here: -- --
--   nounPhrase :: Extractor (Text, Tag)
--   nounPhrase = do
--     nlist <- many1 (try (posTok $ Tag "NN")
--                     <|> try (posTok $ Tag "DT")
--                     <|> (posTok $ Tag "JJ"))
--     let term = T.intercalate " " (map fst nlist)
--     return (term, Tag "n-phr")
--   
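-- Running that extractor over a tagged sentence might look like this (a sketch: the input and result shown follow from the definitions above, but are not a doctest from this package):
--
--   >>> parse nounPhrase "ghci" [("The", Tag "DT"), ("dog", Tag "NN"), ("barked", Tag "VBD")]
--   Right ("The dog",Tag "n-phr")
--   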
module NLP.Extraction.Parsec -- | A Parsec parser. -- -- Example usage: -- --
--   > :set -XOverloadedStrings
--   > import Text.Parsec.Prim
--   > parse myExtractor "interactive repl" someTaggedSentence
--   
type Extractor = Parsec TaggedSentence () -- | Consume a token with the given POS Tag posTok :: Tag -> Extractor (Text, Tag) -- | Consume a token with the specified POS prefix. -- --
--   >>> parse (posPrefix "n") "ghci" [("Bob", Tag "np")]
--   Right ("Bob",Tag "np")
--   
posPrefix :: Text -> Extractor (Text, Tag) -- | Text equality matching with optional case sensitivity. matches :: CaseSensitive -> Text -> Text -> Bool -- | Consume a token with the given lexical representation. txtTok :: CaseSensitive -> Text -> Extractor (Text, Tag) -- | Consume any one non-empty token. anyToken :: Extractor (Text, Tag) oneOf :: CaseSensitive -> [Text] -> Extractor (Text, Tag) -- | Skips any number of fill tokens, ending with the end parser, and returning the last parsed result. -- -- This is useful when you know what you're looking for and (for instance) don't care what comes first. followedBy :: Extractor b -> Extractor a -> Extractor a
module NLP.Corpora.Parsing -- | Read a POS-tagged corpus out of a Text string of the form: "token/tag -- token/tag..." -- --
--   >>> readPOS "Dear/jj Sirs/nns :/: Let/vb"
--   [("Dear",JJ),("Sirs",NNS),(":",Other ":"),("Let",VB)]
--   
readPOS :: Text -> TaggedSentence -- | Returns all but the last element of a string, unless the string is empty, in which case it returns that string. safeInit :: Text -> Text
-- | Averaged Perceptron Tagger, adapted from the Python implementation referenced in NLP.POS.AvgPerceptron. module NLP.POS.AvgPerceptronTagger -- | Create an Averaged Perceptron Tagger using the specified backoff tagger as a fallback, if one is specified. -- -- This uses a tokenizer adapted from the tokenize package, and Erik Kow's fullstop sentence segmenter (http://hackage.haskell.org/package/fullstop) as a sentence splitter. mkTagger :: Perceptron -> Maybe POSTagger -> POSTagger -- | Train a new Perceptron. -- -- The training corpus should be a collection of sentences, one sentence per line, with each token tagged with a part of speech. -- -- For example, the input: -- --
--   "The/DT dog/NN jumped/VB ./.\nThe/DT cat/NN slept/VB ./."
--   
-- -- defines two training sentences. -- --
--   >>> tagger <- trainNew "Dear/jj Sirs/nns :/: Let/vb\nUs/nn begin/vb\n"
--   
--   >>> tag tagger $ map T.words $ T.lines "Dear sir"
--   "Dear/jj Sirs/nns :/: Let/vb"
--   
trainNew :: Text -> IO Perceptron -- | Train a new Perceptron on a corpus of files. trainOnFiles :: [FilePath] -> IO Perceptron -- | Add training examples to a perceptron. -- --
--   >>> tagger <- train emptyPerceptron "Dear/jj Sirs/nns :/: Let/vb\nUs/nn begin/vb\n"
--   
--   >>> tag tagger $ map T.words $ T.lines "Dear sir"
--   "Dear/jj Sirs/nns :/: Let/vb"
--   
-- -- If you're using multiple input files, this can be useful to improve performance (by folding over the files). For example, see trainOnFiles. train :: Perceptron -> Text -> IO Perceptron -- | Train a model from sentences. -- -- Ported from Python: -- --
--   def train(self, sentences, save_loc=None, nr_iter=5):
--       self._make_tagdict(sentences)
--       self.model.classes = self.classes
--       prev, prev2 = START
--       for iter_ in range(nr_iter):
--           c = 0
--           n = 0
--           for words, tags in sentences:
--               context = START + [self._normalize(w) for w in words] + END
--               for i, word in enumerate(words):
--                   guess = self.tagdict.get(word)
--                   if not guess:
--                       feats = self._get_features(i, word, context, prev, prev2)
--                       guess = self.model.predict(feats)
--                       self.model.update(tags[i], guess, feats)
--                   prev2 = prev; prev = guess
--                   c += guess == tags[i]
--                   n += 1
--           random.shuffle(sentences)
--           logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
--       self.model.average_weights()
--       # Pickle as a binary file
--       if save_loc is not None:
--           pickle.dump((self.model.weights, self.tagdict, self.classes),
--                        open(save_loc, 'wb'), -1)
--       return None
--   
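-- A condensed Haskell view of one pass of that training loop (an illustrative sketch: toFeatures is a hypothetical stand-in for _get_features, and the prev/prev2 context threading and tag-dictionary shortcut are omitted):
--
--   import Data.Maybe (fromMaybe)
--   import qualified Data.Map as Map
--   import qualified Data.Text as T
--
--   epoch :: Perceptron -> [TaggedSentence] -> Perceptron
--   epoch = foldl (\per sent -> foldl step per sent)
--     where
--       step per (token, Tag t) =
--         let feats = toFeatures token  -- hypothetical :: Map Feature Int
--             truth = Class (T.unpack t)
--             guess = fromMaybe truth (predict per feats)
--         in update per truth guess (Map.keys feats)
--   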
trainInt :: Int -> Perceptron -> [TaggedSentence] -> IO Perceptron -- | Tag a document (represented as a list of Sentences) with a trained Perceptron. -- -- Ported from Python: -- --
--   def tag(self, corpus, tokenize=True):
--       '''Tags a string `corpus`.'''
--       # Assume untokenized corpus has \n between sentences and ' ' between words
--       s_split = nltk.sent_tokenize if tokenize else lambda t: t.split('\n')
--       w_split = nltk.word_tokenize if tokenize else lambda s: s.split()
--       def split_sents(corpus):
--           for s in s_split(corpus):
--               yield w_split(s)
--       prev, prev2 = self.START
--       tokens = []
--       for words in split_sents(corpus):
--           context = self.START + [self._normalize(w) for w in words] + self.END
--           for i, word in enumerate(words):
--               tag = self.tagdict.get(word)
--               if not tag:
--                   features = self._get_features(i, word, context, prev, prev2)
--                   tag = self.model.predict(features)
--               tokens.append((word, tag))
--               prev2 = prev
--               prev = tag
--       return tokens
--   
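-- The same greedy walk over a single sentence in Haskell (an illustrative sketch: toFeatures is again a hypothetical feature extractor, and the context threading and tag-dictionary shortcut are omitted):
--
--   import Data.Maybe (fromMaybe)
--   import qualified Data.Text as T
--
--   tagGreedy :: Perceptron -> Sentence -> TaggedSentence
--   tagGreedy per = map tagWord
--     where
--       tagWord w =
--         let Class c = fromMaybe (Class "Unk") (predict per (toFeatures w))
--         in (w, Tag (T.pack c))
--   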
tag :: Perceptron -> [Sentence] -> [TaggedSentence] -- | Tag a single sentence. tagSentence :: Perceptron -> Sentence -> TaggedSentence -- | An empty perceptron, used to start training. emptyPerceptron :: Perceptron taggerID :: ByteString readTagger :: ByteString -> Maybe POSTagger -> Either String POSTagger -- | This module aims to make tagging text with parts of speech trivially -- easy. -- -- If you're new to chatter and POS-tagging, then I suggest you -- simply try: -- --
--   >>> tagger <- defaultTagger
--   
--   >>> tagStr tagger "This is a sample sentence."
--   "This/dt is/bez a/at sample/nn sentence/nn ./."
--   
-- -- Note that we used tagStr, instead of tag or tagText. Many people don't (yet!) use Data.Text by default, so there is a wrapper around tag that packs and unpacks the String. This is inefficient, but it's just to get you started, and tagStr can be very handy when you're debugging a tagger in ghci (or cabal repl). -- -- tag exposes more details of the tokenization and tagging, since it returns a list of TaggedSentences, but it doesn't print results as nicely. module NLP.POS -- | Tag a chunk of input text with part-of-speech tags, using the sentence splitter, tokenizer, and tagger contained in the POSTagger. tag :: POSTagger -> Text -> [TaggedSentence] -- | Tag the tokens in a string. -- -- Returns a space-separated string of tokens, each token suffixed with the part of speech. For example: -- --
--   >>> tagStr tagger "the dog jumped ."
--   "the/at dog/nn jumped/vbd ./."
--   
tagStr :: POSTagger -> String -> String -- | Text version of tagStr. tagText :: POSTagger -> Text -> Text -- | Train a POSTagger on a corpus of sentences. -- -- This will recurse through the POSTagger stack, training all the backoff taggers as well. In order to do that, this function has to be generic to the kind of taggers used, so it is not possible to train up a new POSTagger from nothing: train wouldn't know what tagger to create. -- -- To get around that restriction, you can use the various mkTagger implementations, such as NLP.POS.LiteralTagger.mkTagger or NLP.POS.AvgPerceptronTagger.mkTagger. For example: -- --
--   import NLP.POS.AvgPerceptronTagger as APT
--   
--   let newTagger = APT.mkTagger APT.emptyPerceptron Nothing
--   posTgr <- train newTagger trainingExamples
--   
train :: POSTagger -> [TaggedSentence] -> IO POSTagger -- | Train a tagger on string input in the standard form for POS tagged -- corpora: -- --
--   trainStr tagger "the/at dog/nn jumped/vbd ./."
--   
trainStr :: POSTagger -> String -> IO POSTagger -- | The Text version of trainStr. trainText :: POSTagger -> Text -> IO POSTagger tagTokens :: POSTagger -> [Sentence] -> [TaggedSentence] -- | Evaluate a POSTagger. -- -- Measures accuracy over all tags in the test corpus. -- -- Accuracy is calculated as: -- --
--   |tokens tagged correctly| / |all tokens|
--   
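-- For example, a tagger that labels 9 of 10 test tokens correctly evaluates to 0.9.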
eval :: POSTagger -> [TaggedSentence] -> Double serialize :: POSTagger -> ByteString deserialize :: Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger) -> ByteString -> Either String POSTagger -- | The default table of tagger IDs to readTagger functions. Each tagger packaged with Chatter should have an entry here. By convention, the IDs used are the fully qualified module name of the tagger package. taggerTable :: Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger) -- | Store a POSTagger to a file. saveTagger :: POSTagger -> FilePath -> IO () -- | Load a tagger, using the internal taggerTable. If you need to specify your own mappings for new composite taggers, you should use deserialize. -- -- This function checks the filename to determine if the content should be decompressed. If the file ends with ".gz", then we assume it is a gzipped model. loadTagger :: FilePath -> IO POSTagger defaultTagger :: IO POSTagger
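-- A round-trip sketch using the persistence functions above (the model file name is hypothetical):
--
--   >>> tagger <- defaultTagger
--   >>> saveTagger tagger "tagger.model"
--   >>> tagger' <- loadTagger "tagger.model"
--   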