-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Morphosyntactic tagging tool based on constrained CRFs -- -- A morphosyntactic tagging tool based on constrained conditional random -- fields. @package concraft @version 0.4.0 -- | Types and functions related to the morphosyntax data layer. module NLP.Concraft.Morphosyntax -- | A sentence of Words. type Sent t = [Word t] -- | A word parametrized over a tag type. data Word t Word :: Text -> WMap t -> Bool -> Word t -- | Orthographic form. orth :: Word t -> Text -- | Set of word interpretations. To each interpretation a weight of -- correctness within the context is assigned. tagWMap :: Word t -> WMap t -- | Out-of-vocabulary (OOV) word, i.e. word unknown to the morphosyntactic -- analyser. oov :: Word t -> Bool -- | Map function over word tags. mapWord :: Ord b => (a -> b) -> Word a -> Word b -- | Map function over sentence tags. mapSent :: Ord b => (a -> b) -> Sent a -> Sent b -- | Interpretations of the word, as a set. interpsSet :: Word t -> Set t -- | Interpretations of the word, as a list. interps :: Word t -> [t] -- | A weighted collection of type a elements. data WMap a -- | Make a weighted collection. mkWMap :: Ord a => [(a, Double)] -> WMap a -- | Map function over weighted collection elements. mapWMap :: Ord b => (a -> b) -> WMap a -> WMap b instance Show a => Show (WMap a) instance Eq a => Eq (WMap a) instance Ord a => Ord (WMap a) instance Show t => Show (Word t) instance Eq t => Eq (Word t) instance Ord t => Ord (Word t) -- | The module provides several abstractions for representing external -- data formats. Concraft will be able to work with any format which -- implements those abstractions. module NLP.Concraft.Format -- | Textual representation of a morphosyntactic tag. type Tag = Text -- | Word handler. data Word w Word :: (w -> Word Tag) -> (WMap Tag -> w -> w) -> Word w -- | Extract information relevant for tagging. 
extract :: Word w -> w -> Word Tag -- | Select the set of morphosyntactic interpretations. select :: Word w -> WMap Tag -> w -> w -- | Sentence handler. data Sent s w Sent :: (s -> [w]) -> ([w] -> s -> s) -> Word w -> Sent s w -- | Split sentence into a list of words. parseSent :: Sent s w -> s -> [w] -- | Merge words with a sentence. mergeSent :: Sent s w -> [w] -> s -> s -- | Word handler. wordHandler :: Sent s w -> Word w -- | Document format. data Doc f s w Doc :: (Text -> f s) -> (f s -> Text) -> Sent s w -> Doc f s w -- | Parse textual interpretations into a functor with sentence elements. parseDoc :: Doc f s w -> Text -> f s -- | Show the textual representation of a document. showDoc :: Doc f s w -> f s -> Text -- | Sentence handler. sentHandler :: Doc f s w -> Sent s w -- | Observation schema blocks for Concraft. module NLP.Concraft.Schema -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) -- | The Ox monad specialized to word token type and text observations. type Ox t a = Ox (Word t) Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema t a = Vector (Word t) -> Int -> Ox t a -- | A dummy schema block. void :: a -> Schema t a -- | Sequence the list of schemas (or blocks) and discard individual -- values. sequenceS_ :: [Vector (Word t) -> a -> Ox t b] -> Vector (Word t) -> a -> Ox t () -- | Use the schema to extract observations from the sentence. schematize :: Schema t a -> Sent t -> [[Ob]] -- | Body of configuration entry. data Body a Body :: [Int] -> Bool -> a -> Body a -- | Range argument for the schema block. range :: Body a -> [Int] -- | When true, the entry is used only for oov words. oovOnly :: Body a -> Bool -- | Additional arguments for the schema block. args :: Body a -> a -- | Maybe entry. type Entry a = Maybe (Body a) -- | Plain entry with no additional arguments. 
entry :: [Int] -> Entry () -- | Entry with additional arguments. entryWith :: a -> [Int] -> Entry a -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken into -- account. For example, the [-1, 0, 2] range means that -- observations of a particular type will be extracted with respect to -- previous (k - 1), current (k) and after the next -- (k + 2) positions when identifying the observation set for -- position k in the input sentence. data SchemaConf SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry () -> Entry () -> Entry () -> Entry () -> SchemaConf -- | The orthB schema block. orthC :: SchemaConf -> Entry () -- | The lowOrthB schema block. lowOrthC :: SchemaConf -> Entry () -- | The lowPrefixesB schema block. The first list of ints -- represents lengths of prefixes. lowPrefixesC :: SchemaConf -> Entry [Int] -- | The lowSuffixesB schema block. The first list of ints -- represents lengths of suffixes. lowSuffixesC :: SchemaConf -> Entry [Int] -- | The knownB schema block. knownC :: SchemaConf -> Entry () -- | The shapeB schema block. shapeC :: SchemaConf -> Entry () -- | The packedB schema block. packedC :: SchemaConf -> Entry () -- | The begPackedB schema block. begPackedC :: SchemaConf -> Entry () -- | Null configuration of the observation schema. nullConf :: SchemaConf -- | Build the schema based on the configuration. fromConf :: SchemaConf -> Schema t () -- | Default configuration for the guessing observation schema. guessConfDefault :: SchemaConf -- | Default configuration for the disambiguation observation schema. disambConfDefault :: SchemaConf -- | A block is a chunk of the Ox computation performed within the context -- of the sentence and the list of absolute sentence positions. 
type Block t a = Vector (Word t) -> [Int] -> Ox t a -- | Transform a block to a schema depending on: * A list of relative -- sentence positions, * A boolean value; if true, the block computation -- will be performed only on positions where an OOV word resides. fromBlock :: Block t a -> [Int] -> Bool -> Schema t a -- | Orthographic form at the current position. orthB :: Block t () -- | Lowercased orthographic form at the current position. lowOrthB :: Block t () -- | List of lowercased prefixes of given lengths. lowPrefixesB :: [Int] -> Block t () -- | List of lowercased suffixes of given lengths. lowSuffixesB :: [Int] -> Block t () -- | Indicates whether the word is known (i.e. not out-of-vocabulary). knownB :: Block t () -- | Shape of the word. shapeB :: Block t () -- | Packed shape of the word. packedB :: Block t () -- | Packed shape of the word. begPackedB :: Block t () instance Show a => Show (Body a) instance Show SchemaConf instance Binary SchemaConf instance Binary a => Binary (Body a) module NLP.Concraft.Guess -- | A guessing model. data Guesser t Guesser :: SchemaConf -> CRF Ob t -> Guesser t schemaConf :: Guesser t -> SchemaConf crf :: Guesser t -> CRF Ob t -- | Determine the k most probable labels for each word in the -- sentence. guess :: Ord t => Int -> Guesser t -> Sent t -> [[t]] -- | Tag sentence in external format. Selected interpretations (tags -- correct within the context) will be preserved. guessSent :: Sent s w -> Int -> Guesser Tag -> s -> s -- | Tag file. guessDoc :: Functor f => Doc f s w -> Int -> Guesser Tag -> Text -> Text -- | Include guessing results into the sentence. include :: Ord t => Sent t -> [[t]] -> Sent t -- | Training configuration. data TrainConf TrainConf :: SchemaConf -> SgdArgs -> TrainConf schemaConfT :: TrainConf -> SchemaConf sgdArgsT :: TrainConf -> SgdArgs -- | Train guesser. train :: Foldable f => Doc f s w -> TrainConf -> FilePath -> Maybe FilePath -> IO (Guesser Tag) instance (Ord t, Binary t) => Binary (Guesser t) module NLP.Concraft.Disamb -- | A disambiguation model. 
data Disamb Disamb :: Tagset -> [Tier] -> SchemaConf -> CRF Ob Atom -> Disamb tagset :: Disamb -> Tagset tiers :: Disamb -> [Tier] schemaConf :: Disamb -> SchemaConf crf :: Disamb -> CRF Ob Atom -- | CRF model data. data CRF a b -- | A tier description. data Tier Tier :: Bool -> Set Attr -> Tier -- | Does it include the part of speech? withPos :: Tier -> Bool -- | Tier grammatical attributes. withAtts :: Tier -> Set Attr -- | An atomic part of morphosyntactic tag with optional POS. data Atom Atom :: Maybe POS -> Map Attr Text -> Atom pos :: Atom -> Maybe POS atts :: Atom -> Map Attr Text -- | Default tiered tagging configuration. tiersDefault :: [Tier] -- | Perform context-sensitive disambiguation. disamb :: Disamb -> Sent Tag -> [Tag] -- | Tag the sentence. disambSent :: Sent s w -> Disamb -> s -> s -- | Disambiguate document. disambDoc :: Functor f => Doc f s w -> Disamb -> Text -> Text -- | Training configuration. data TrainConf TrainConf :: Tagset -> [Tier] -> SchemaConf -> SgdArgs -> TrainConf tagsetT :: TrainConf -> Tagset tiersT :: TrainConf -> [Tier] schemaConfT :: TrainConf -> SchemaConf sgdArgsT :: TrainConf -> SgdArgs -- | Train disamb model. train :: Foldable f => Doc f s w -> TrainConf -> FilePath -> Maybe FilePath -> IO Disamb instance Binary Disamb -- | Simple format for morphosyntax representation which assumes that all -- tags have a textual representation with no spaces inside and that one -- of the tags indicates unknown words. module NLP.Concraft.Format.Plain -- | A token. data Token Token :: Text -> Space -> Bool -> Map Interp Bool -> Token orth :: Token -> Text space :: Token -> Space known :: Token -> Bool -- | Interpretations of the token, each interpretation annotated with a -- disamb Boolean value (if True, the interpretation is -- correct within the context). interps :: Token -> Map Interp Bool data Interp Interp :: Maybe Text -> Tag -> Interp base :: Interp -> Maybe Text tag :: Interp -> Tag -- | No space, space or newline. 
data Space None :: Space Space :: Space NewLine :: Space -- | Create document handler given value of the ignore tag. plainFormat :: Tag -> Doc [] [Token] Token -- | Parse the text in the plain format given the oov tag. parsePlain :: Tag -> Text -> [[Token]] -- | Parse the sentence in the plain format given the oov tag. parseSent :: Tag -> Text -> [Token] -- | Show the plain data. showPlain :: Tag -> [[Token]] -> Text -- | Show the sentence. showSent :: Tag -> [Token] -> Text instance Show Space instance Eq Space instance Ord Space instance Show Interp instance Eq Interp instance Ord Interp instance Show Token instance Eq Token instance Ord Token module NLP.Concraft -- | Concraft data. data Concraft Concraft :: Int -> Guesser Tag -> Disamb -> Concraft guessNum :: Concraft -> Int guesser :: Concraft -> Guesser Tag disamb :: Concraft -> Disamb -- | Perform disambiguation preceded by context-sensitive guessing. tag :: Concraft -> Sent Tag -> [Tag] -- | Tag the sentence. tagSent :: Sent s w -> Concraft -> s -> s -- | Tag document. tagDoc :: Functor f => Doc f s w -> Concraft -> Text -> Text -- | Train guessing and disambiguation models. train :: (Functor f, Foldable f) => Doc f s w -> Int -> TrainConf -> TrainConf -> FilePath -> Maybe FilePath -> IO Concraft instance Binary Concraft