-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Nerf, the named entity recognition tool based on linear-chain CRFs -- -- The package provides the named entity recognition (NER) tool divided -- into a back-end library (see the NLP.Nerf module) and the -- front-end tool nerf. Using the library you can model and recognize -- named entities (NEs) which, for a particular sentence, take the form -- of a forest with NE category values kept in internal nodes and sentence -- words kept in forest leaves. -- -- To model NE forests we combine two different techniques. The IOB codec -- is used to translate to and fro between the original, forest -- representation of NEs and the sequence of atomic labels. In other -- words, it provides two isomorphic functions for encoding and decoding -- between both representations. Linear-chain conditional random fields, -- on the other hand, provide the framework for label modelling and -- tagging. @package nerf @version 0.1.0 -- | Basic types for dictionary handling. module NLP.Nerf.Dict.Base -- | An orthographic form. type Form = Text -- | Is the form a multiword one? isMultiWord :: Form -> Bool -- | A type of named entity. type NeType = Text -- | A Named Entity entry from the LMF dictionary. data Entry Entry :: !Form -> !NeType -> Entry -- | Orthographic form of the NE neOrth :: Entry -> !Form -- | Type of the NE neType :: Entry -> !NeType -- | A NeDict is a map from forms to NE types. Each NE may be annotated -- with multiple types. type NeDict = Map Form (Set NeType) -- | Construct the dictionary from the list of entries. mkDict :: [Entry] -> NeDict -- | Remove dictionary entries which do not satisfy the predicate. siftDict :: (Form -> Set NeType -> Bool) -> NeDict -> NeDict -- | Save the dictionary in the file. saveDict :: FilePath -> NeDict -> IO () -- | Load the dictionary from the file. loadDict :: FilePath -> IO NeDict -- | Merge dictionary resources. 
merge :: [NeDict] -> NeDict -- | Differentiate labels from separate dictionaries using -- dictionary-unique prefixes. diff :: [NeDict] -> [NeDict] instance Show Entry instance Read Entry instance Eq Entry instance Ord Entry -- | Parsing the Gazetteer for Polish Named Entities (used formerly within -- the SProUT platform) in the LMF format. module NLP.Nerf.Dict.PNEG -- | Parse the dictionary to the list of entries. parsePNEG :: Text -> [Entry] -- | Read the dictionary from the file. readPNEG :: FilePath -> IO [Entry] -- | Handling the NELexicon dictionary. module NLP.Nerf.Dict.NELexicon -- | Parse the NELexicon into a list of entries. parseNELexicon :: Text -> [Entry] -- | Read the dictionary from the file. readNELexicon :: FilePath -> IO [Entry] module NLP.Nerf.Dict -- | Parse the PNEG dictionary and save it in a binary form into the output -- file. preparePNEG :: FilePath -> FilePath -> IO () -- | Parse the NELexicon, merge it with the PoliMorf and serialize into a -- binary, DAWG form. prepareNELexicon :: FilePath -> FilePath -> FilePath -> IO () -- | Basic types. module NLP.Nerf.Types -- | A word. type Word = Text -- | A named entity. type NE = Text -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) -- | A label is created by encoding the named entity forest using the IOB -- method. type Lb = Label NE -- | Observation schema blocks for Nerf. module NLP.Nerf.Schema -- | The Ox monad specialized to word token type and text observations. type Ox a = Ox Word Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema a = Vector Word -> Int -> Ox a -- | A dummy schema block. void :: a -> Schema a -- | Sequence the list of schemas and discard individual values. sequenceS_ :: [Schema a] -> Schema () -- | Use the schema to extract observations from the sentence. 
schematize :: Schema a -> [Word] -> Sent Ob -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken into -- account. For example, the [-1, 0, 2] range means that -- observations of particular type will be extracted with respect to -- previous (k - 1), current (k) and after the next -- (k + 2) positions when identifying the observation set for -- position k in the input sentence. data SchemaCfg SchemaCfg :: [Int] -> [Int] -> [Int] -> [Int] -> [Int] -> Maybe (NeDict, [Int]) -> SchemaCfg -- | The orthS schema block orthC :: SchemaCfg -> [Int] -- | The lemmaS schema block lemmaC :: SchemaCfg -> [Int] -- | The shapeS schema block shapeC :: SchemaCfg -> [Int] -- | The shapePairS schema block shapePairC :: SchemaCfg -> [Int] -- | The suffixS schema block suffixC :: SchemaCfg -> [Int] -- | The searchS schema block dictC :: SchemaCfg -> Maybe (NeDict, [Int]) -- | Default configuration for Nerf observation schema. defaultCfg :: FilePath -> IO SchemaCfg -- | Build the schema based on the configuration. fromCfg :: SchemaCfg -> Schema () -- | A block is a chunk of the Ox computation performed within the context -- of the sentence and the list of absolute sentence positions. type Block a = Vector Word -> [Int] -> Ox a -- | Transform the block to the schema dependent on the list of relative -- sentence positions. fromBlock :: Block a -> [Int] -> Schema a -- | Orthographic observations determined with respect to the list of -- relative positions. orthS :: Block () -- | Lemma substitute determined with respect to the list of relative -- positions. lemmaS :: Block () -- | Shape and packed shape determined with respect to the list of relative -- positions. shapeS :: Block () -- | Shape pairs determined with respect to the list of relative positions. shapePairS :: Block () -- | Several suffixes determined with respect to the list of relative -- positions. 
suffixS :: Block () -- | Plain dictionary search determined with respect to the list of -- relative positions. searchS :: NeDict -> Block () instance Binary SchemaCfg -- | Main module of the Nerf tool. module NLP.Nerf -- | A Nerf consists of the observation schema configuration and the CRF -- model. data Nerf Nerf :: SchemaCfg -> CRF Ob Lb -> Nerf schemaCfg :: Nerf -> SchemaCfg crf :: Nerf -> CRF Ob Lb -- | Train Nerf on the input data using the SGD method. train :: SgdArgs -> SchemaCfg -> FilePath -> Maybe FilePath -> IO Nerf -- | Perform named entity recognition (NER) using the Nerf. ner :: Nerf -> [Word] -> NeForest NE Word -- | Show results of observation extraction on the input ENAMEX file. tryOx :: SchemaCfg -> FilePath -> IO () instance Binary Nerf