-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Nerf, the named entity recognition tool based on linear-chain CRFs
--
-- The package provides the named entity recognition (NER) tool divided
-- into a back-end library (see the NLP.Nerf module) and the
-- front-end tool nerf. Using the library you can model and recognize
-- named entities (NEs) which, for a particular sentence, take the form
-- of a forest with NE category values kept in internal nodes and sentence
-- words kept in forest leaves.
--
-- To model NE forests we combine two different techniques. The IOB codec
-- is used to translate to and fro between the original, forest
-- representation of NEs and the sequence of atomic labels. In other
-- words, it provides two isomorphic functions for encoding and decoding
-- between both representations. Linear-chain conditional random fields,
-- on the other hand, provide the framework for label modelling and
-- tagging.
@package nerf
@version 0.4.0
-- | Polish Named Entity Triggers (PNET) dictionary; see
-- http://zil.ipipan.waw.pl/PNET
module NLP.Nerf.Dict.PNET
-- | Parse dictionary into a list of entries.
parsePNET :: Text -> [Entry]
-- | Read dictionary from the file.
readPNET :: FilePath -> IO [Entry]
-- | Trigger type.
data Typ
Internal :: Typ
External :: Typ
-- | Does the entry represent a trigger of the given type?
hasTyp :: Typ -> Entry -> Bool
-- | PNET entry.
data Entry
Entry :: Text -> Text -> Text -> Typ -> Text -> Text -> Entry
orth :: Entry -> Text
base :: Entry -> Text
tag :: Entry -> Text
typ :: Entry -> Typ
neTyp :: Entry -> Text
example :: Entry -> Text
instance Show Typ
instance Eq Typ
instance Ord Typ
-- | Basic types for dictionary handling.
module NLP.Nerf.Dict.Base
-- | A type of named entity.
type NeType = Text
-- | An orthographic form.
type Form = Text
-- | Is the form a multiword one?
isMultiWord :: Form -> Bool
-- | A Named Entity entry from the LMF dictionary.
data Entry
Entry :: !Form -> !NeType -> Entry
-- | Orthographic form of the NE
neOrth :: Entry -> !Form
-- | Type of the NE
neType :: Entry -> !NeType
-- | Dictionary label.
type Label = Text
-- | A Dict is a map from forms to labels. Each form may be
-- annotated with multiple labels. The map is represented using a
-- directed acyclic word graph (DAWG) whose values are sets of labels.
type DAWG = DAWG Trans Char ()
type Dict = DAWG (Set Label)
-- | Construct dictionary from the list of form/label pairs.
fromPairs :: [(Form, Label)] -> Dict
-- | Construct dictionary from the list of entries.
fromEntries :: [Entry] -> Dict
-- | Remove dictionary entries which do not satisfy the predicate.
siftDict :: (Form -> Set Label -> Bool) -> Dict -> Dict
-- | Save the dictionary in the file.
saveDict :: FilePath -> Dict -> IO ()
-- | Load the dictionary from the file.
loadDict :: FilePath -> IO Dict
-- | Merge dictionary resources.
merge :: [Dict] -> Dict
-- | Differentiate labels from separate dictionaries using
-- dictionary-unique prefixes.
diff :: [Dict] -> [Dict]
instance Show Entry
instance Read Entry
instance Eq Entry
instance Ord Entry
-- | Parsing the Gazetteer for Polish Named Entities (used formerly within
-- the SProUT platform) in the LMF format.
module NLP.Nerf.Dict.PNEG
-- | Parse the dictionary into a list of entries.
parsePNEG :: Text -> [Entry]
-- | Read the dictionary from the file.
readPNEG :: FilePath -> IO [Entry]
-- | Handling the NELexicon dictionary.
module NLP.Nerf.Dict.NELexicon
-- | Parse the NELexicon into a list of entries.
parseNELexicon :: Text -> [Entry]
-- | Read the dictionary from the file.
readNELexicon :: FilePath -> IO [Entry]
-- | Handling Prolexbase dictionaries, both with the same storage format.
module NLP.Nerf.Dict.Prolexbase
-- | Parse dictionary into a list of entries.
parseProlexbase :: Text -> [Entry]
-- | Read the dictionary from the file.
readProlexbase :: FilePath -> IO [Entry]
-- | Extraction utilities for various dictionary resources.
module NLP.Nerf.Dict
-- | Extract NEs dictionary from PoliMorf.
extractPoliMorf :: FilePath -> IO Dict
-- | Extract NEs dictionary from PNEG.
extractPNEG :: FilePath -> IO Dict
-- | Extract NEs dictionary from NELexicon.
extractNELexicon :: FilePath -> IO Dict
-- | Extract NEs dictionary from Prolexbase.
extractProlexbase :: FilePath -> IO Dict
-- | Extract internal triggers from PNET dictionary.
extractIntTriggers :: FilePath -> IO Dict
-- | Extract external triggers from PNET dictionary.
extractExtTriggers :: FilePath -> IO Dict
-- | The module implements the tokenization used within Nerf and some other
-- tokenization-related stuff.
module NLP.Nerf.Tokenize
-- | Tokenize sentence using the default tokenizer.
tokenize :: String -> [String]
-- | A class of objects with size.
class Word a
word :: Word a => a -> String
-- | Synchronize named entities with tokenization represented by the second
-- function argument. Of course, both arguments should relate to the same
-- sentence.
moveNEs :: (Word b, Word c) => NeForest a b -> [c] -> NeForest a c
instance Word Text
instance Word Text
instance Word String
-- | Basic types.
module NLP.Nerf.Types
-- | A word.
type Word = Text
-- | A named entity.
type NE = Text
-- | An observation consists of an index (of list type) and an actual
-- observation value.
type Ob = ([Int], Text)
-- | A label is created by encoding the named entity forest using the IOB
-- method.
type Lb = Label NE
-- | Observation schema blocks for Nerf.
module NLP.Nerf.Schema
-- | The Ox monad specialized to word token type and text observations.
type Ox a = Ox Word Text a
-- | A schema is a block of the Ox computation performed within the context
-- of the sentence and the absolute sentence position.
type Schema a = Vector Word -> Int -> Ox a
-- | A dummy schema block.
void :: a -> Schema a
-- | Sequence the list of schemas (or blocks) and discard individual
-- values.
sequenceS_ :: [Vector Word -> a -> Ox b] -> Vector Word -> a -> Ox ()
-- | Use the schema to extract observations from the sentence.
schematize :: Schema a -> [Word] -> Sent Ob
-- | Body of configuration entry.
data Body a
Body :: [Int] -> a -> Body a
-- | Range argument for the schema block.
range :: Body a -> [Int]
-- | Additional arguments for the schema block.
args :: Body a -> a
-- | Maybe entry.
type Entry a = Maybe (Body a)
-- | Plain entry with no additional arguments.
entry :: [Int] -> Entry ()
-- | Entry with additional arguments.
entryWith :: a -> [Int] -> Entry a
-- | Configuration of the schema. All configuration elements specify the
-- range over which a particular observation type should be taken into
-- account. For example, the [-1, 0, 2] range means that
-- observations of a particular type will be extracted with respect to
-- the previous (k - 1), the current (k) and the one after the next
-- (k + 2) positions when identifying the observation set for
-- position k in the input sentence.
data SchemaConf
SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry Int -> Entry () -> Entry () -> Entry () -> Entry () -> Entry [Dict] -> Entry Dict -> Entry Dict -> SchemaConf
-- | The orthB schema block.
orthC :: SchemaConf -> Entry ()
-- | The splitOrthB schema block.
splitOrthC :: SchemaConf -> Entry ()
-- | The lowPrefixesB schema block. The first list of ints
-- represents lengths of prefixes.
lowPrefixesC :: SchemaConf -> Entry [Int]
-- | The lowSuffixesB schema block. The first list of ints
-- represents lengths of suffixes.
lowSuffixesC :: SchemaConf -> Entry [Int]
-- | The lemmaB schema block.
lemmaC :: SchemaConf -> Entry Int
-- | The shapeB schema block.
shapeC :: SchemaConf -> Entry ()
-- | The packedB schema block.
packedC :: SchemaConf -> Entry ()
-- | The shapePairB schema block.
shapePairC :: SchemaConf -> Entry ()
-- | The packedPairB schema block.
packedPairC :: SchemaConf -> Entry ()
-- | Dictionaries of NEs (dictB schema block).
dictC :: SchemaConf -> Entry [Dict]
-- | Dictionary of internal triggers.
intTrigsC :: SchemaConf -> Entry Dict
-- | Dictionary of external triggers.
extTrigsC :: SchemaConf -> Entry Dict
-- | Null configuration of the observation schema.
nullConf :: SchemaConf
-- | Default configuration of the observation schema.
defaultConf :: [Dict] -> Maybe Dict -> Maybe Dict -> IO SchemaConf
-- | Build the schema based on the configuration.
fromConf :: SchemaConf -> Schema ()
-- | A block is a chunk of the Ox computation performed within the context
-- of the sentence and the list of absolute sentence positions.
type Block a = Vector Word -> [Int] -> Ox a
-- | Transform the block to the schema depending on the list of relative
-- sentence positions.
fromBlock :: Block a -> [Int] -> Schema a
-- | Orthographic form at the current position.
orthB :: Block ()
-- | Orthographic form split into two observations: the lowercased form and
-- the original form (only when different than the lowercased one).
splitOrthB :: Block ()
-- | List of lowercased prefixes of given lengths.
lowPrefixesB :: [Int] -> Block ()
-- | List of lowercased suffixes of given lengths.
lowSuffixesB :: [Int] -> Block ()
-- | Lemma substitute parametrized by the number specifying the span over
-- which lowercased prefixes and suffixes will be saved. For
-- example, lemmaB 2 will take affixes of lengths 0, -1
-- and -2 into account.
lemmaB :: Int -> Block ()
-- | Shape of the word.
shapeB :: Block ()
-- | Packed shape of the word.
packedB :: Block ()
-- | Combined shapes of two consecutive (at k-1 and k
-- positions) words.
shapePairB :: Block ()
-- | Combined packed shapes of two consecutive (at k-1 and
-- k positions) words.
packedPairB :: Block ()
-- | Plain dictionary search determined with respect to the list of
-- relative positions.
dictB :: Dict -> Block ()
instance Show a => Show (Body a)
instance Show SchemaConf
instance Binary SchemaConf
instance Binary a => Binary (Body a)
-- | Main module of the Nerf tool.
module NLP.Nerf
-- | A Nerf consists of the observation schema configuration and the CRF
-- model.
data Nerf
Nerf :: SchemaConf -> CRF Ob Lb -> Nerf
schemaConf :: Nerf -> SchemaConf
crf :: Nerf -> CRF Ob Lb
-- | Train Nerf on the input data using the SGD method.
train :: SgdArgs -> SchemaConf -> FilePath -> Maybe FilePath -> IO Nerf
-- | Perform named entity recognition (NER) using the Nerf.
ner :: Nerf -> String -> NeForest NE Word
-- | Show results of observation extraction on the input ENAMEX file.
tryOx :: SchemaConf -> FilePath -> IO ()
instance Binary Nerf