-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Nerf, a named entity recognition tool based on linear-chain CRFs -- -- Please see the README on GitHub at -- https://github.com/kawu/nerf#readme @package nerf @version 0.5.4 -- | Compare two NE-annotated datasets. module NLP.Nerf.Compare -- | Statistics. data Stats Stats :: !Int -> !Int -> !Int -> !Int -> Stats -- | false positive [fp] :: Stats -> !Int -- | true positive [tp] :: Stats -> !Int -- | false negative [fn] :: Stats -> !Int -- | true negative [tn] :: Stats -> !Int -- | Add stats. (.+.) :: Stats -> Stats -> Stats -- | Compare two NE-annotated datasets. The function assumes that forest -- pairs correspond to the same sentences. compare :: Ord a => [(NeForest a Text, NeForest a Text)] -> Map a Stats instance GHC.Classes.Ord a => GHC.Classes.Ord (NLP.Nerf.Compare.Node a) instance GHC.Classes.Eq a => GHC.Classes.Eq (NLP.Nerf.Compare.Node a) instance GHC.Show.Show a => GHC.Show.Show (NLP.Nerf.Compare.Node a) instance GHC.Classes.Ord NLP.Nerf.Compare.Stats instance GHC.Classes.Eq NLP.Nerf.Compare.Stats instance GHC.Show.Show NLP.Nerf.Compare.Stats -- | Basic types for dictionary handling. module NLP.Nerf.Dict.Base -- | A type of named entity. type NeType = Text -- | An orthographic form. type Form = Text -- | Is the form a multiword one? isMultiWord :: Form -> Bool -- | A Named Entity entry from the LMF dictionary. data Entry Entry :: !Form -> !NeType -> Entry -- | Orthographic form of the NE [neOrth] :: Entry -> !Form -- | Type of the NE [neType] :: Entry -> !NeType -- | Dictionary label. type Label = Text -- | A Dict is a map from forms to labels. Each form may be -- annotated with multiple labels. The map is represented using the -- directed acyclic word graph. type Dict = D.DAWG (S.Set Label) type DAWG = DAWG Trans Char () type Dict = DAWG (Set Label) -- | Construct dictionary from the list of form/label pairs. 
fromPairs :: [(Form, Label)] -> Dict -- | Construct dictionary from the list of entries. fromEntries :: [Entry] -> Dict -- | Remove dictionary entries which do not satisfy the predicate. siftDict :: (Form -> Set Label -> Bool) -> Dict -> Dict -- | Save the dictionary in the file. saveDict :: FilePath -> Dict -> IO () -- | Load the dictionary from the file. loadDict :: FilePath -> IO Dict -- | Merge dictionary resources. merge :: [Dict] -> Dict -- | Differentiate labels from separate dictionaries using -- dictionary-unique prefixes. diff :: [Dict] -> [Dict] instance GHC.Classes.Ord NLP.Nerf.Dict.Base.Entry instance GHC.Classes.Eq NLP.Nerf.Dict.Base.Entry instance GHC.Read.Read NLP.Nerf.Dict.Base.Entry instance GHC.Show.Show NLP.Nerf.Dict.Base.Entry -- | Handling the NELexicon dictionary. module NLP.Nerf.Dict.NELexicon -- | Parse the NELexicon into a list of entries. parseNELexicon :: Text -> [Entry] -- | Read the dictionary from the file. readNELexicon :: FilePath -> IO [Entry] -- | Parsing the Gazetteer for Polish Named Entities (used formerly within -- the SProUT platform) in the LMF format. module NLP.Nerf.Dict.PNEG -- | Parse the dictionary to the list of entries. parsePNEG :: Text -> [Entry] -- | Read the dictionary from the file. readPNEG :: FilePath -> IO [Entry] -- | Polish Named Entity Triggers http://zil.ipipan.waw.pl/PNET -- dictionary. module NLP.Nerf.Dict.PNET -- | Parse dictionary into a list of entries. parsePNET :: Text -> [Entry] -- | Read dictionary from the file. readPNET :: FilePath -> IO [Entry] -- | Trigger type. data Typ Internal :: Typ External :: Typ -- | Does the entry represent a trigger of the given type? hasTyp :: Typ -> Entry -> Bool -- | PNET entry. 
data Entry Entry :: Text -> Text -> Text -> Typ -> Text -> Text -> Entry [orth] :: Entry -> Text [base] :: Entry -> Text [tag] :: Entry -> Text [typ] :: Entry -> Typ [neTyp] :: Entry -> Text [example] :: Entry -> Text instance GHC.Classes.Ord NLP.Nerf.Dict.PNET.Typ instance GHC.Classes.Eq NLP.Nerf.Dict.PNET.Typ instance GHC.Show.Show NLP.Nerf.Dict.PNET.Typ -- | Handling Prolexbase dictionaries, both with the same storage format. module NLP.Nerf.Dict.Prolexbase -- | Parse dictionary into a list of entries. parseProlexbase :: Text -> [Entry] -- | Read the dictionary from the file. readProlexbase :: FilePath -> IO [Entry] -- | Extraction utilities for various dictionary resources. module NLP.Nerf.Dict -- | Extract NEs dictionary from PoliMorf. extractPoliMorf :: FilePath -> IO Dict -- | Extract NEs dictionary from PNEG. extractPNEG :: FilePath -> IO Dict -- | Extract NEs dictionary from NELexicon. extractNELexicon :: FilePath -> IO Dict -- | Extract NEs dictionary from Prolexbase. extractProlexbase :: FilePath -> IO Dict -- | Extract internal triggers from PNET dictionary. extractIntTriggers :: FilePath -> IO Dict -- | Extract external triggers from PNET dictionary. extractExtTriggers :: FilePath -> IO Dict -- | The module implements the tokenization used within Nerf and some other -- tokenization-related stuff. module NLP.Nerf.Tokenize -- | Tokenize sentence using the default tokenizer. tokenize :: String -> [String] -- | A class of objects which can be converted to String. class Word a word :: Word a => a -> String -- | Synchronize the list of NE trees with the new tokenization. sync :: (Word b, Word c) => NeForest a b -> [c] -> NeForest a c instance NLP.Nerf.Tokenize.Word GHC.Base.String instance NLP.Nerf.Tokenize.Word Data.Text.Internal.Text instance NLP.Nerf.Tokenize.Word Data.Text.Internal.Lazy.Text -- | Basic types. module NLP.Nerf.Types -- | A word. type Word = Text -- | A named entity. 
type NE = Text -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) -- | A label is created by encoding the named entity forest using the IOB -- method. type Lb = Label NE -- | Observation schema blocks for Nerf. module NLP.Nerf.Schema -- | The Ox monad specialized to word token type and text observations. type Ox a = Ox Word Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema a = Vector Word -> Int -> Ox a -- | A dummy schema block. void :: a -> Schema a -- | Sequence the list of schemas (or blocks) and discard individual -- values. sequenceS_ :: [Vector Word -> a -> Ox b] -> Vector Word -> a -> Ox () -- | Use the schema to extract observations from the sentence. schematize :: Schema a -> [Word] -> Sent Ob -- | Body of configuration entry. data Body a Body :: [Int] -> a -> Body a -- | Range argument for the schema block. [range] :: Body a -> [Int] -- | Additional arguments for the schema block. [args] :: Body a -> a -- | Maybe entry. type Entry a = Maybe (Body a) -- | Plain entry with no additional arguments. entry :: [Int] -> Entry () -- | Entry with additional arguments. entryWith :: a -> [Int] -> Entry a -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken into -- account. For example, the [-1, 0, 2] range means that -- observations of particular type will be extracted with respect to -- previous (k - 1), current (k) and after the next -- (k + 2) positions when identifying the observation set for -- position k in the input sentence. data SchemaConf SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry Int -> Entry () -> Entry () -> Entry () -> Entry () -> Entry [Dict] -> Entry Dict -> Entry Dict -> SchemaConf -- | The orthB schema block. 
[orthC] :: SchemaConf -> Entry () -- | The splitOrthB schema block. [splitOrthC] :: SchemaConf -> Entry () -- | The lowPrefixesB schema block. The first list of ints -- represents lengths of prefixes. [lowPrefixesC] :: SchemaConf -> Entry [Int] -- | The lowSuffixesB schema block. The first list of ints -- represents lengths of suffixes. [lowSuffixesC] :: SchemaConf -> Entry [Int] -- | The lemmaB schema block. [lemmaC] :: SchemaConf -> Entry Int -- | The shapeB schema block. [shapeC] :: SchemaConf -> Entry () -- | The packedB schema block. [packedC] :: SchemaConf -> Entry () -- | The shapePairB schema block. [shapePairC] :: SchemaConf -> Entry () -- | The packedPairB schema block. [packedPairC] :: SchemaConf -> Entry () -- | Dictionaries of NEs (dictB schema block). [dictC] :: SchemaConf -> Entry [Dict] -- | Dictionary of internal triggers. [intTrigsC] :: SchemaConf -> Entry Dict -- | Dictionary of external triggers. [extTrigsC] :: SchemaConf -> Entry Dict -- | Null configuration of the observation schema. nullConf :: SchemaConf -- | Default configuration of the observation schema. defaultConf :: [Dict] -> Maybe Dict -> Maybe Dict -> IO SchemaConf -- | Build the schema based on the configuration. fromConf :: SchemaConf -> Schema () -- | A block is a chunk of the Ox computation performed within the context -- of the sentence and the list of absolute sentence positions. type Block a = Vector Word -> [Int] -> Ox a -- | Transform the block to the schema depending on the list of relative -- sentence positions. fromBlock :: Block a -> [Int] -> Schema a -- | Orthographic form at the current position. orthB :: Block () -- | Orthographic form split into two observations: the lowercased form and -- the original form (only when different than the lowercased one). splitOrthB :: Block () -- | List of lowercased prefixes of given lengths. lowPrefixesB :: [Int] -> Block () -- | List of lowercased suffixes of given lengths. 
lowSuffixesB :: [Int] -> Block () -- | Lemma substitute parametrized by the number specifying the span over -- which lowercased prefixes and suffixes will be saved. For -- example, lemmaB 2 will take affixes of lengths 0, -1 -- and -2 into account. lemmaB :: Int -> Block () -- | Shape of the word. shapeB :: Block () -- | Packed shape of the word. packedB :: Block () -- | Combined shapes of two consecutive (at k-1 and k -- positions) words. shapePairB :: Block () -- | Combined packed shapes of two consecutive (at k-1 and -- k positions) words. packedPairB :: Block () -- | Plain dictionary search determined with respect to the list of -- relative positions. dictB :: Dict -> Block () instance GHC.Show.Show NLP.Nerf.Schema.SchemaConf instance GHC.Show.Show a => GHC.Show.Show (NLP.Nerf.Schema.Body a) instance Data.Binary.Class.Binary NLP.Nerf.Schema.SchemaConf instance Data.Binary.Class.Binary a => Data.Binary.Class.Binary (NLP.Nerf.Schema.Body a) -- | Main module of the Nerf tool. module NLP.Nerf -- | A Nerf consists of the observation schema configuration and the CRF -- model. data Nerf Nerf :: SchemaConf -> CRF Ob Lb -> Nerf [schemaConf] :: Nerf -> SchemaConf [crf] :: Nerf -> CRF Ob Lb -- | Train Nerf on the input data using the SGD method. train :: SgdArgs -> SchemaConf -> FilePath -> Maybe FilePath -> IO Nerf -- | Perform named entity recognition (NER) using the Nerf. ner :: Nerf -> String -> NeForest NE Word -- | Show results of observation extraction on the input ENAMEX file. tryOx :: SchemaConf -> FilePath -> IO () instance Data.Binary.Class.Binary NLP.Nerf.Nerf module NLP.Nerf.Server -- | Run a Nerf server on a given port. runNerfServer :: Nerf -> PortID -> IO () -- | Perform NER tagging on the input sentence. ner :: HostName -> PortID -> String -> IO (NeForest NE Word) -- | Support for the XCES format. module NLP.Nerf.XCES -- | Annotate XCES (in a form of a tag list) with NEs with respect to the -- given NER function. 
nerXCES :: Nerf.Nerf -> L.Text -> L.Text nerXCES :: (String -> NeForest NE Word) -> Text -> Text instance GHC.Show.Show NLP.Nerf.XCES.SentI instance GHC.Show.Show NLP.Nerf.XCES.SegT instance NLP.Nerf.Tokenize.Word NLP.Nerf.XCES.Tok