-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Language-agnostic analyzer for positional morphosyntactic tags
--   
--   Implementation of a space-efficient morphosyntactic analyzer. It
--   solves the problem of providing a set of possible tags for a given word.
@package moan
@version 0.2.0.2


-- | Implementation of a space-efficient morphosyntactic analyzer.
--   
--   It solves the problem of providing a set of possible tags for a given
--   word. Instead of just matching on the word-set pair, one can assume
--   that suffixes of an unknown word also hold some information about the
--   set.
--   
--   This library provides the functionality of that kind of analysis. One
--   example of where this might be useful is the <tt>concraft</tt> tagging
--   library. Before POS-tagging, one needs to have a set of possible
--   tags for a word from which the correct one is disambiguated.
--   
--   For a sufficiently large construction corpus this analyzer might only
--   benefit from additional regular expressions for punctuation and number
--   matching. There is a possibility of returning a set of possible tags
--   that isn't complete, i.e. a set that doesn't contain the correct tag. If
--   the construction corpus isn't sufficiently large, there might be a fair
--   amount of incomplete sets on unseen named entities (person names,
--   corporation names etc.).
--   
--   If one needs the analyzer to be less aggressive, it is recommended to
--   extend the functionality and remove the sets of possible tags from
--   words which might be named (e.g. capitalized words in the middle of a
--   sentence). This is relevant mostly in use cases where part-of-speech
--   tags of a language contain information whether a word represents a
--   named entity or not, so if this is not the case, there will be no need
--   to extend the current functionality.
--   
--   A simple example of using <tt>GHCi</tt> for construction:
--   
--   <pre>
--   :set -XOverloadedStrings
--   import qualified Data.Map as M
--   import qualified Data.Text as T
--   import qualified Data.Text.IO as T
--   import qualified Data.Tagset.Positional as P
--   f &lt;- readFile "tagset.cfg"
--   let tset = P.parseTagset "tagset1" f
--   f &lt;- T.readFile "fulldict.txt"
--   let train = map (\(word:tags) -&gt; (word, map (P.parseTag tset) tags)) . map T.words . filter (not . T.null) . T.lines $ f
--   let an = create tset (AConf 3 [] M.empty) train
--   save "analyzer.gz" an
--   </pre>
--   
--   It is assumed that tag attributes are separated with <tt>:</tt> for
--   <a>parseTag</a>. One could write a different parsing function.
module NLP.Morphosyntax.Analyzer

-- | Representation of the analyzer.
data Analyzer

-- | Checks whether a word is in the analyzer. If it is, the set of tags
--   returned by <a>getTags</a> will be non-empty.
elem :: Text -> Analyzer -> Bool

-- | Gives a set of possible tags for a given word. It is possible that the
--   set of possible tags is empty.
getTags :: Analyzer -> Text -> Set Tag

-- | Save analyzer in a file. Data is compressed using the gzip format.
save :: FilePath -> Analyzer -> IO ()

-- | Load analyzer from a file.
load :: FilePath -> IO Analyzer

-- | Creates a morphological analyzer given a tagset, a configuration
--   (smallest suffix length, matchers for additional matching and a
--   separation layout) and a construction corpus.
create :: Tagset -> AConf -> [(Text, [Tag])] -> Analyzer

-- | Can be used for dummy analyzer building.
emptyConf :: AConf

-- | Removes the need to write regular expressions for simple matching.
--   Supports matching on punctuation, numbers, alphanumeric, upper-case,
--   lower-case and capitalized tokens, as well as on regular expressions.
data Matcher

-- | Matches a token with all punctuation characters.
Punct :: Matcher

-- | Matches a token with all Unicode numeral characters.
Number :: Matcher

-- | Matches a token with all alphanumeric characters.
AlphaNum :: Matcher

-- | Matches a token with at least one uppercase character.
AnyUpper :: Matcher

-- | Matches a token with all uppercase characters.
AllUpper :: Matcher

-- | Matches a token with at least one lowercase character.
AnyLower :: Matcher

-- | Matches a token with all lowercase characters.
AllLower :: Matcher

-- | Matches a capitalized token.
Capital :: Matcher

-- | Matches on a regular expression.
RegExpr :: Text -> Matcher

-- | Configuration for the analyzer.
data AConf
AConf :: Int -> [(Matcher, Set Tag)] -> Map POS (Set POS) -> AConf

-- | If a word isn't known, this is the smallest suffix length that will be
--   matched.
suffixLen :: AConf -> Int

-- | A list of matchers (e.g. POSIX regular expressions) and their
--   accompanying sets of tags. If a word matches a matcher, the
--   accompanying set of tags will be given as the set of possible tags.
regexMatch :: AConf -> [(Matcher, Set Tag)]

-- | Provides the analyzer with the ability to analyze the word on a single
--   <a>POS</a>-tag in case an incomplete construction corpus is present
--   (e.g. Croatian adjectives and pronouns). It might be the case that words
--   that can be adjectives can also be pronouns. If the analyzer isn't
--   thorough enough (the provided construction data doesn't have all cases
--   covered), one would also like words that are adjectives to be
--   interpreted as pronouns. What can happen is that an unknown word has
--   a very long suffix that matches an adjective, but it can also be a
--   pronoun. In that case one would like pronoun tags too. If your
--   construction data is very large, this doesn't have to be used.
separationLayout :: AConf -> Map POS (Set POS)
instance Eq Matcher
instance Show Matcher
instance Ord Matcher
instance Eq AConf
instance Show AConf
instance Eq ConstLayout
instance Show ConstLayout
instance Eq Analyzer
instance Binary Analyzer
instance Binary ConstLayout
instance Binary AConf
instance Binary Matcher