-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Nerf, the named entity recognition tool based on linear-chain CRFs
--
-- The package provides the named entity recognition (NER) tool divided
-- into a back-end library (see the NLP.Nerf module) and the
-- front-end tool nerf. Using the library you can model and recognize
-- named entities (NEs) which, for a particular sentence, take the form
-- of a forest with NE category values kept in internal nodes and
-- sentence words kept in forest leaves.
--
-- To model NE forests we combine two different techniques. The IOB codec
-- is used to translate to and fro between the original, forest
-- representation of NEs and the sequence of atomic labels. In other
-- words, it provides two isomorphic functions for encoding and decoding
-- between both representations. Linear-chain conditional random fields,
-- on the other hand, provide the framework for label modelling and
-- tagging.
@package nerf
@version 0.1.0
-- | Basic types and operations for named entity (NE) dictionary handling.
module NLP.Nerf.Dict.Base
-- | An orthographic form (an alias for Text).
type Form = Text
-- | Is the form a multiword one?
isMultiWord :: Form -> Bool
-- | A type of named entity (an alias for Text).
type NeType = Text
-- | A Named Entity entry from the LMF dictionary. An entry pairs an
-- orthographic form with an NE type.
data Entry
Entry :: !Form -> !NeType -> Entry
-- | Orthographic form of the NE.
neOrth :: Entry -> !Form
-- | Type of the NE.
neType :: Entry -> !NeType
-- | A NeDict is a map from forms to sets of NE types. Each NE may be
-- annotated with multiple types.
type NeDict = Map Form (Set NeType)
-- | Construct the dictionary from the list of entries.
mkDict :: [Entry] -> NeDict
-- | Remove dictionary entries which do not satisfy the predicate. The
-- predicate is given the form and the set of NE types assigned to it.
siftDict :: (Form -> Set NeType -> Bool) -> NeDict -> NeDict
-- | Save the dictionary in the given file.
saveDict :: FilePath -> NeDict -> IO ()
-- | Load the dictionary from the given file.
loadDict :: FilePath -> IO NeDict
-- | Merge a list of dictionary resources into a single dictionary.
merge :: [NeDict] -> NeDict
-- | Differentiate labels from separate dictionaries using
-- dictionary-unique prefixes.
diff :: [NeDict] -> [NeDict]
instance Show Entry
instance Read Entry
instance Eq Entry
instance Ord Entry
-- | Parsing the Gazetteer for Polish Named Entities (PNEG, formerly used
-- within the SProUT platform) in the LMF format.
module NLP.Nerf.Dict.PNEG
-- | Parse the PNEG dictionary into a list of entries.
parsePNEG :: Text -> [Entry]
-- | Read the PNEG dictionary entries from the file.
readPNEG :: FilePath -> IO [Entry]
-- | Handling the NELexicon dictionary.
module NLP.Nerf.Dict.NELexicon
-- | Parse the NELexicon into a list of entries.
parseNELexicon :: Text -> [Entry]
-- | Read the NELexicon entries from the file.
readNELexicon :: FilePath -> IO [Entry]
-- | Preparing the dictionary resources (PNEG and NELexicon) in a binary
-- form.
module NLP.Nerf.Dict
-- | Parse the PNEG dictionary and save it in a binary form into the output
-- file.
--
-- NOTE(review): the first path is presumably the PNEG source and the
-- second the output file -- confirm against the implementation.
preparePNEG :: FilePath -> FilePath -> IO ()
-- | Parse the NELexicon, merge it with the PoliMorf and serialize into a
-- binary, DAWG form.
--
-- NOTE(review): the three paths are presumably the NELexicon, the
-- PoliMorf and the output file, in that order -- confirm against the
-- implementation.
prepareNELexicon :: FilePath -> FilePath -> FilePath -> IO ()
-- | Basic types shared by the Nerf modules.
module NLP.Nerf.Types
-- | A word (an alias for Text).
type Word = Text
-- | A named entity (an alias for Text).
type NE = Text
-- | An observation consists of an index (a list of Ints) and the actual
-- observation value (a Text).
type Ob = ([Int], Text)
-- | A label is created by encoding the named entity forest using the IOB
-- method.
type Lb = Label NE
-- | Observation schema blocks for Nerf.
module NLP.Nerf.Schema
-- | The Ox monad specialized to the word token type and text observations.
type Ox a = Ox Word Text a
-- | A schema is a block of the Ox computation performed within the context
-- of the sentence and the absolute sentence position.
type Schema a = Vector Word -> Int -> Ox a
-- | A dummy schema block.
void :: a -> Schema a
-- | Sequence the list of schemas and discard individual values.
sequenceS_ :: [Schema a] -> Schema ()
-- | Use the schema to extract observations from the sentence.
schematize :: Schema a -> [Word] -> Sent Ob
-- | Configuration of the schema. All configuration elements specify the
-- range over which a particular observation type should be taken into
-- account. For example, the [-1, 0, 2] range means that observations
-- of a particular type will be extracted with respect to the previous
-- (k - 1), the current (k) and the one after the next (k + 2)
-- positions when identifying the observation set for position k in
-- the input sentence.
data SchemaCfg
SchemaCfg :: [Int] -> [Int] -> [Int] -> [Int] -> [Int] -> Maybe (NeDict, [Int]) -> SchemaCfg
-- | Range of relative positions for the orthS schema block.
orthC :: SchemaCfg -> [Int]
-- | Range of relative positions for the lemmaS schema block.
lemmaC :: SchemaCfg -> [Int]
-- | Range of relative positions for the shapeS schema block.
shapeC :: SchemaCfg -> [Int]
-- | Range of relative positions for the shapePairS schema block.
shapePairC :: SchemaCfg -> [Int]
-- | Range of relative positions for the suffixS schema block.
suffixC :: SchemaCfg -> [Int]
-- | Optional NE dictionary together with the range of relative
-- positions for the searchS schema block.
dictC :: SchemaCfg -> Maybe (NeDict, [Int])
-- | Default configuration for Nerf observation schema.
--
-- NOTE(review): the FilePath presumably points to the NE dictionary
-- stored in the resulting configuration -- confirm against the
-- implementation.
defaultCfg :: FilePath -> IO SchemaCfg
-- | Build the schema based on the configuration.
fromCfg :: SchemaCfg -> Schema ()
-- | A block is a chunk of the Ox computation performed within the context
-- of the sentence and the list of absolute sentence positions.
type Block a = Vector Word -> [Int] -> Ox a
-- | Transform the block to the schema dependent on the list of relative
-- sentence positions.
fromBlock :: Block a -> [Int] -> Schema a
-- | Orthographic observations determined with respect to the list of
-- relative positions.
orthS :: Block ()
-- | Lemma substitute determined with respect to the list of relative
-- positions.
lemmaS :: Block ()
-- | Shape and packed shape determined with respect to the list of relative
-- positions.
shapeS :: Block ()
-- | Shape pairs determined with respect to the list of relative positions.
shapePairS :: Block ()
-- | Several suffixes determined with respect to the list of relative
-- positions.
suffixS :: Block ()
-- | Plain dictionary search, using the given NE dictionary, determined
-- with respect to the list of relative positions.
searchS :: NeDict -> Block ()
instance Binary SchemaCfg
-- | Main module of the Nerf tool.
module NLP.Nerf
-- | A Nerf consists of the observation schema configuration and the CRF
-- model.
data Nerf
Nerf :: SchemaCfg -> CRF Ob Lb -> Nerf
-- | The observation schema configuration.
schemaCfg :: Nerf -> SchemaCfg
-- | The CRF model over observations ('Ob') and labels ('Lb').
crf :: Nerf -> CRF Ob Lb
-- | Train Nerf on the input data using the SGD method.
--
-- NOTE(review): the FilePath is presumably the training data file and
-- the optional FilePath an evaluation data file -- confirm against the
-- implementation.
train :: SgdArgs -> SchemaCfg -> FilePath -> Maybe FilePath -> IO Nerf
-- | Perform named entity recognition (NER) on the list of words using
-- the Nerf. The result is a forest of named entities over the words.
ner :: Nerf -> [Word] -> NeForest NE Word
-- | Show results of observation extraction on the input ENAMEX file.
tryOx :: SchemaCfg -> FilePath -> IO ()
instance Binary Nerf