-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Nerf, the named entity recognition tool based on linear-chain CRFs
--   
--   The package provides the named entity recognition (NER) tool divided
--   into a back-end library (see the <a>NLP.Nerf</a> module) and the
--   front-end tool nerf. Using the library you can model and recognize
--   named entities (NEs) which, for a particular sentence, take the form
--   of a forest with NE category values kept in internal nodes and sentence
--   words kept in forest leaves.
--   
--   To model NE forests we combine two different techniques. The IOB codec
--   is used to translate to and fro between the original, forest
--   representation of NEs and the sequence of atomic labels. In other
--   words, it provides two isomorphic functions for encoding and decoding
--   between both representations. Linear-chain conditional random fields,
--   on the other hand, provide the framework for label modelling and
--   tagging.
@package nerf
@version 0.1.0


-- | Basic types for dictionary handling.
module NLP.Nerf.Dict.Base

-- | An orthographic form.
type Form = Text

-- | Is the form a multiword one?
isMultiWord :: Form -> Bool

-- | A type of named entity.
type NeType = Text

-- | A Named Entity entry from the LMF dictionary.
data Entry
Entry :: !Form -> !NeType -> Entry

-- | Orthographic form of the NE
neOrth :: Entry -> !Form

-- | Type of the NE
neType :: Entry -> !NeType

-- | A NeDict is a map from forms to NE types. Each NE may be annotated
--   with multiple types.
type NeDict = Map Form (Set NeType)

-- | Construct the dictionary from the list of entries.
mkDict :: [Entry] -> NeDict

-- | Remove dictionary entries which do not satisfy the predicate.
siftDict :: (Form -> Set NeType -> Bool) -> NeDict -> NeDict

-- | Save the dictionary in the file.
saveDict :: FilePath -> NeDict -> IO ()

-- | Load the dictionary from the file.
loadDict :: FilePath -> IO NeDict

-- | Merge dictionary resources.
merge :: [NeDict] -> NeDict

-- | Differentiate labels from separate dictionaries using
--   dictionary-unique prefixes.
diff :: [NeDict] -> [NeDict]
instance Show Entry
instance Read Entry
instance Eq Entry
instance Ord Entry


-- | Parsing the Gazetteer for Polish Named Entities (used formerly within
--   the SProUT platform) in the LMF format.
module NLP.Nerf.Dict.PNEG

-- | Parse the dictionary to the list of entries.
parsePNEG :: Text -> [Entry]

-- | Read the dictionary from the file.
readPNEG :: FilePath -> IO [Entry]


-- | Handling the NELexicon dictionary.
module NLP.Nerf.Dict.NELexicon

-- | Parse the NELexicon into a list of entries.
parseNELexicon :: Text -> [Entry]

-- | Read the dictionary from the file.
readNELexicon :: FilePath -> IO [Entry]

module NLP.Nerf.Dict

-- | Parse the PNEG dictionary and save it in a binary form into the output
--   file.
preparePNEG :: FilePath -> FilePath -> IO ()

-- | Parse the NELexicon, merge it with the PoliMorf and serialize into a
--   binary, DAWG form.
prepareNELexicon :: FilePath -> FilePath -> FilePath -> IO ()


-- | Basic types.
module NLP.Nerf.Types

-- | A word.
type Word = Text

-- | A named entity.
type NE = Text

-- | An observation consists of an index (of list type) and an actual
--   observation value.
type Ob = ([Int], Text)

-- | A label is created by encoding the named entity forest using the IOB
--   method.
type Lb = Label NE


-- | Observation schema blocks for Nerf.
module NLP.Nerf.Schema

-- | The Ox monad specialized to word token type and text observations.
type Ox a = Ox Word Text a

-- | A schema is a block of the Ox computation performed within the context
--   of the sentence and the absolute sentence position.
type Schema a = Vector Word -> Int -> Ox a

-- | A dummy schema block.
void :: a -> Schema a

-- | Sequence the list of schemas and discard individual values.
sequenceS_ :: [Schema a] -> Schema ()

-- | Use the schema to extract observations from the sentence.
schematize :: Schema a -> [Word] -> Sent Ob

-- | Configuration of the schema. All configuration elements specify the
--   range over which a particular observation type should be taken into
--   account. For example, the <tt>[-1, 0, 2]</tt> range means that
--   observations of a particular type will be extracted with respect to
--   the previous (<tt>k - 1</tt>), the current (<tt>k</tt>) and the one
--   after the next (<tt>k + 2</tt>) positions when identifying the
--   observation set for position <tt>k</tt> in the input sentence.
data SchemaCfg
SchemaCfg :: [Int] -> [Int] -> [Int] -> [Int] -> [Int] -> Maybe (NeDict, [Int]) -> SchemaCfg

-- | The <a>orthS</a> schema block
orthC :: SchemaCfg -> [Int]

-- | The <a>lemmaS</a> schema block
lemmaC :: SchemaCfg -> [Int]

-- | The <a>shapeS</a> schema block
shapeC :: SchemaCfg -> [Int]

-- | The <a>shapePairS</a> schema block
shapePairC :: SchemaCfg -> [Int]

-- | The <a>suffixS</a> schema block
suffixC :: SchemaCfg -> [Int]

-- | The <a>searchS</a> schema block
dictC :: SchemaCfg -> Maybe (NeDict, [Int])

-- | Default configuration for Nerf observation schema.
defaultCfg :: FilePath -> IO SchemaCfg

-- | Build the schema based on the configuration.
fromCfg :: SchemaCfg -> Schema ()

-- | A block is a chunk of the Ox computation performed within the context
--   of the sentence and the list of absolute sentence positions.
type Block a = Vector Word -> [Int] -> Ox a

-- | Transform the block to the schema dependent on the list of relative
--   sentence positions.
fromBlock :: Block a -> [Int] -> Schema a

-- | Orthographic observations determined with respect to the list of
--   relative positions.
orthS :: Block ()

-- | Lemma substitute determined with respect to the list of relative
--   positions.
lemmaS :: Block ()

-- | Shape and packed shape determined with respect to the list of relative
--   positions.
shapeS :: Block ()

-- | Shape pairs determined with respect to the list of relative positions.
shapePairS :: Block ()

-- | Several suffixes determined with respect to the list of relative
--   positions.
suffixS :: Block ()

-- | Plain dictionary search determined with respect to the list of
--   relative positions.
searchS :: NeDict -> Block ()
instance Binary SchemaCfg


-- | Main module of the Nerf tool.
module NLP.Nerf

-- | A Nerf consists of the observation schema configuration and the CRF
--   model.
data Nerf
Nerf :: SchemaCfg -> CRF Ob Lb -> Nerf
schemaCfg :: Nerf -> SchemaCfg
crf :: Nerf -> CRF Ob Lb

-- | Train Nerf on the input data using the SGD method.
train :: SgdArgs -> SchemaCfg -> FilePath -> Maybe FilePath -> IO Nerf

-- | Perform named entity recognition (NER) using the Nerf.
ner :: Nerf -> [Word] -> NeForest NE Word

-- | Show results of observation extraction on the input ENAMEX file.
tryOx :: SchemaCfg -> FilePath -> IO ()
instance Binary Nerf