-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Nerf, the named entity recognition tool based on linear-chain CRFs -- -- The package provides the named entity recognition (NER) tool divided -- into a back-end library (see the NLP.Nerf module) and the -- front-end tool nerf. Using the library you can model and recognize -- named entities (NEs) which, for a particular sentence, take the form -- of a forest with NE category values kept in internal nodes and sentence -- words kept in forest leaves. -- -- To model NE forests we combine two different techniques. The IOB codec -- is used to translate to and fro between the original, forest -- representation of NEs and the sequence of atomic labels. In other -- words, it provides two isomorphic functions for encoding and decoding -- between both representations. Linear-chain conditional random fields, -- on the other hand, provide the framework for label modelling and -- tagging. @package nerf @version 0.3.0 -- | Polish Named Entity Triggers http://zil.ipipan.waw.pl/PNET -- dictionary. module NLP.Nerf.Dict.PNET -- | Parse dictionary into a list of entries. parsePNET :: Text -> [Entry] -- | Read dictionary from the file. readPNET :: FilePath -> IO [Entry] -- | Trigger type. data Typ Internal :: Typ External :: Typ -- | Does the entry represent a trigger of the given type? hasTyp :: Typ -> Entry -> Bool -- | PNET entry. data Entry Entry :: Text -> Text -> Text -> Typ -> Text -> Text -> Entry orth :: Entry -> Text base :: Entry -> Text tag :: Entry -> Text typ :: Entry -> Typ neTyp :: Entry -> Text example :: Entry -> Text instance Show Typ instance Eq Typ instance Ord Typ -- | Basic types for dictionary handling. module NLP.Nerf.Dict.Base -- | A type of named entity. type NeType = Text -- | An orthographic form. type Form = Text -- | Is the form a multiword one? isMultiWord :: Form -> Bool -- | A Named Entity entry from the LMF dictionary. 
data Entry Entry :: !Form -> !NeType -> Entry -- | Orthographic form of the NE neOrth :: Entry -> !Form -- | Type of the NE neType :: Entry -> !NeType -- | Dictionary label. type Label = Text -- | A Dict is a map from forms to labels. Each form may be -- annotated with multiple labels. The map is represented using the -- directed acyclic word graph. type Dict = D.DAWG (S.Set Label) type DAWG = DAWG Trans Char () type Dict = DAWG (Set Label) -- | Construct dictionary from the list of form/label pairs. fromPairs :: [(Form, Label)] -> Dict -- | Construct dictionary from the list of entries. fromEntries :: [Entry] -> Dict -- | Remove dictionary entries which do not satisfy the predicate. siftDict :: (Form -> Set Label -> Bool) -> Dict -> Dict -- | Save the dictionary in the file. saveDict :: FilePath -> Dict -> IO () -- | Load the dictionary from the file. loadDict :: FilePath -> IO Dict -- | Merge dictionary resources. merge :: [Dict] -> Dict -- | Differentiate labels from separate dictionaries using -- dictionary-unique prefixes. diff :: [Dict] -> [Dict] instance Show Entry instance Read Entry instance Eq Entry instance Ord Entry -- | Parsing the Gazetteer for Polish Named Entities (used formerly within -- the SProUT platform) in the LMF format. module NLP.Nerf.Dict.PNEG -- | Parse the dictionary to the list of entries. parsePNEG :: Text -> [Entry] -- | Read the dictionary from the file. readPNEG :: FilePath -> IO [Entry] -- | Handling the NELexicon dictionary. module NLP.Nerf.Dict.NELexicon -- | Parse the NELexicon into a list of entries. parseNELexicon :: Text -> [Entry] -- | Read the dictionary from the file. readNELexicon :: FilePath -> IO [Entry] -- | Handling Prolexbase dictionaries, both with the same storage format. module NLP.Nerf.Dict.Prolexbase -- | Parse dictionary into a list of entries. parseProlexbase :: Text -> [Entry] -- | Read the dictionary from the file. 
readProlexbase :: FilePath -> IO [Entry] -- | Extraction utilities for various dictionary resources. module NLP.Nerf.Dict -- | Extract NEs dictionary from PoliMorf. extractPoliMorf :: FilePath -> IO Dict -- | Extract NEs dictionary from PNEG. extractPNEG :: FilePath -> IO Dict -- | Extract NEs dictionary from NELexicon. extractNELexicon :: FilePath -> IO Dict -- | Extract NEs dictionary from Prolexbase. extractProlexbase :: FilePath -> IO Dict -- | Extract internal triggers from PNET dictionary. extractIntTriggers :: FilePath -> IO Dict -- | Extract external triggers from PNET dictionary. extractExtTriggers :: FilePath -> IO Dict -- | The module implements the tokenization used within Nerf and some other -- tokenization-related stuff. module NLP.Nerf.Tokenize -- | Tokenize sentence using the default tokenizer. tokenize :: String -> [String] -- | A class of objects with size. class Word a -- | Synchronize named entities with tokenization represented by the second -- function argument. Of course, both arguments should relate to the same -- sentence. moveNEs :: Word b => NeForest a b -> [b] -> NeForest a b instance Word Text instance Word Text instance Word String -- | Basic types. module NLP.Nerf.Types -- | A word. type Word = Text -- | A named entity. type NE = Text -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) -- | A label is created by encoding the named entity forest using the IOB -- method. type Lb = Label NE -- | Observation schema blocks for Nerf. module NLP.Nerf.Schema -- | The Ox monad specialized to word token type and text observations. type Ox a = Ox Word Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema a = Vector Word -> Int -> Ox a -- | A dummy schema block. void :: a -> Schema a -- | Sequence the list of schemas (or blocks) and discard individual -- values. 
sequenceS_ :: [Vector Word -> a -> Ox b] -> Vector Word -> a -> Ox () -- | Use the schema to extract observations from the sentence. schematize :: Schema a -> [Word] -> Sent Ob -- | Body of configuration entry. data Body a Body :: [Int] -> a -> Body a -- | Range argument for the schema block. range :: Body a -> [Int] -- | Additional arguments for the schema block. args :: Body a -> a -- | Maybe entry. type Entry a = Maybe (Body a) -- | Plain entry with no additional arguments. entry :: [Int] -> Entry () -- | Entry with additional arguments. entryWith :: a -> [Int] -> Entry a -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken into -- account. For example, the [-1, 0, 2] range means that -- observations of particular type will be extracted with respect to -- previous (k - 1), current (k) and after the next -- (k + 2) positions when identifying the observation set for -- position k in the input sentence. data SchemaConf SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry Int -> Entry () -> Entry () -> Entry () -> Entry () -> Entry [Dict] -> Entry Dict -> Entry Dict -> SchemaConf -- | The orthB schema block. orthC :: SchemaConf -> Entry () -- | The splitOrthB schema block. splitOrthC :: SchemaConf -> Entry () -- | The lowPrefixesB schema block. The first list of ints -- represents lengths of prefixes. lowPrefixesC :: SchemaConf -> Entry [Int] -- | The lowSuffixesB schema block. The first list of ints -- represents lengths of suffixes. lowSuffixesC :: SchemaConf -> Entry [Int] -- | The lemmaB schema block. lemmaC :: SchemaConf -> Entry Int -- | The shapeB schema block. shapeC :: SchemaConf -> Entry () -- | The packedB schema block. packedC :: SchemaConf -> Entry () -- | The shapePairB schema block. shapePairC :: SchemaConf -> Entry () -- | The packedPairB schema block. packedPairC :: SchemaConf -> Entry () -- | Dictionaries of NEs (dictB schema block). 
dictC :: SchemaConf -> Entry [Dict] -- | Dictionary of internal triggers. intTrigsC :: SchemaConf -> Entry Dict -- | Dictionary of external triggers. extTrigsC :: SchemaConf -> Entry Dict -- | Null configuration of the observation schema. nullConf :: SchemaConf -- | Default configuration of the observation schema. defaultConf :: [Dict] -> Maybe Dict -> Maybe Dict -> IO SchemaConf -- | Build the schema based on the configuration. fromConf :: SchemaConf -> Schema () -- | A block is a chunk of the Ox computation performed within the context -- of the sentence and the list of absolute sentence positions. type Block a = Vector Word -> [Int] -> Ox a -- | Transform the block to the schema depending on the list of relative -- sentence positions. fromBlock :: Block a -> [Int] -> Schema a -- | Orthographic form at the current position. orthB :: Block () -- | Orthographic form split into two observations: the lowercased form and -- the original form (only when different than the lowercased one). splitOrthB :: Block () -- | List of lowercased prefixes of given lengths. lowPrefixesB :: [Int] -> Block () -- | List of lowercased suffixes of given lengths. lowSuffixesB :: [Int] -> Block () -- | Lemma substitute parametrized by the number specifying the span over -- which lowercased prefixes and suffixes will be saved. For -- example, lemmaB 2 will take affixes of lengths 0, -1 -- and -2 into account. lemmaB :: Int -> Block () -- | Shape of the word. shapeB :: Block () -- | Packed shape of the word. packedB :: Block () -- | Combined shapes of two consecutive (at k-1 and k -- positions) words. shapePairB :: Block () -- | Combined packed shapes of two consecutive (at k-1 and -- k positions) words. packedPairB :: Block () -- | Plain dictionary search determined with respect to the list of -- relative positions. 
dictB :: Dict -> Block () instance Show a => Show (Body a) instance Show SchemaConf instance Binary SchemaConf instance Binary a => Binary (Body a) -- | Main module of the Nerf tool. module NLP.Nerf -- | A Nerf consists of the observation schema configuration and the CRF -- model. data Nerf Nerf :: SchemaConf -> CRF Ob Lb -> Nerf schemaConf :: Nerf -> SchemaConf crf :: Nerf -> CRF Ob Lb -- | Train Nerf on the input data using the SGD method. train :: SgdArgs -> SchemaConf -> FilePath -> Maybe FilePath -> IO Nerf -- | Perform named entity recognition (NER) using the Nerf. ner :: Nerf -> [Word] -> NeForest NE Word -- | Show results of observation extraction on the input ENAMEX file. tryOx :: SchemaConf -> FilePath -> IO () instance Binary Nerf