-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Morphosyntactic tagging tool based on constrained CRFs -- -- A morphosyntactic tagging tool based on constrained conditional random -- fields. @package concraft @version 0.3.2 -- | The module provides functions for splitting positional tags. They can -- be used together with the layered disambiguation model. module NLP.Concraft.Disamb.Positional -- | A tier description. data Tier Tier :: Bool -> Set Attr -> Tier -- | Does it include the part of speech? withPos :: Tier -> Bool -- | Tier grammatical attributes. withAtts :: Tier -> Set Attr -- | An atomic part of morphosyntactic tag with optional POS. data Part Part :: Maybe POS -> Map Attr Text -> Part pos :: Part -> Maybe POS atts :: Part -> Map Attr Text -- | Select tier attributes. select :: Tier -> Tag -> Part -- | Split the positional tag. split :: [Tier] -> Tag -> [Part] -- | Default tiered tagging configuration. tierConfDefault :: [Tier] instance Show Part instance Eq Part instance Ord Part instance Binary Part instance Binary Tier -- | Types and functions related to the morphosyntax data layer. module NLP.Concraft.Morphosyntax -- | A sentence of Words. type Sent t = [Word t] -- | A word parametrized over a tag type. data Word t Word :: Text -> WMap t -> Bool -> Word t -- | Orthographic form. orth :: Word t -> Text -- | Set of word interpretations. To each interpretation a weight of -- correctness within the context is assigned. tagWMap :: Word t -> WMap t -- | Out-of-vocabulary (OOV) word, i.e. word unknown to the morphosyntactic -- analyser. oov :: Word t -> Bool -- | Map function over word tags. mapWord :: Ord b => (a -> b) -> Word a -> Word b -- | Map function over sentence tags. mapSent :: Ord b => (a -> b) -> Sent a -> Sent b -- | Interpretations of the word. interpsSet :: Word t -> Set t -- | Interpretations of the word. interps :: Word t -> [t] -- | A weighted collection of type a elements. 
data WMap a -- | Make a weighted collection. mkWMap :: Ord a => [(a, Double)] -> WMap a -- | Map function over weighted collection elements. mapWMap :: Ord b => (a -> b) -> WMap a -> WMap b instance Show a => Show (WMap a) instance Eq a => Eq (WMap a) instance Ord a => Ord (WMap a) instance Show t => Show (Word t) instance Eq t => Eq (Word t) instance Ord t => Ord (Word t) -- | The module provides several abstractions for representing external -- data formats. Concraft will be able to work with any format which -- implements those abstractions. module NLP.Concraft.Format -- | Textual representation of a morphosyntactic tag. type Tag = Text -- | Words handler. data Word w Word :: (w -> Word Tag) -> (WMap Tag -> w -> w) -> Word w -- | Extract information relevant for tagging. extract :: Word w -> w -> Word Tag -- | Select the set of morphosyntactic interpretations. select :: Word w -> WMap Tag -> w -> w -- | Sentence handler. data Sent s w Sent :: (s -> [w]) -> ([w] -> s -> s) -> Word w -> Sent s w -- | Split sentence into a list of words. parseSent :: Sent s w -> s -> [w] -- | Merge words with a sentence. mergeSent :: Sent s w -> [w] -> s -> s -- | Words handler. wordHandler :: Sent s w -> Word w -- | Document format. data Doc f s w Doc :: (Text -> f s) -> (f s -> Text) -> Sent s w -> Doc f s w -- | Parse textual interpretations into a functor with sentence elements. parseDoc :: Doc f s w -> Text -> f s -- | Show textual representation of a document. showDoc :: Doc f s w -> f s -> Text -- | Sentence handler. sentHandler :: Doc f s w -> Sent s w -- | Simple format for morphosyntax representation which assumes that all -- tags have a textual representation with no spaces inside and that one -- of the tags indicates unknown words. module NLP.Concraft.Format.Plain -- | Create document handler given value of the ignore tag. 
plainFormat :: Tag -> Doc [] [Token] Token instance Show Space instance Eq Space instance Ord Space instance Show Interp instance Eq Interp instance Ord Interp instance Show Token instance Eq Token instance Ord Token module NLP.Concraft.Schema -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema t a = Vector (Word t) -> Int -> Ox t a -- | The Ox monad specialized to word token type and text observations. type Ox t a = Ox (Word t) Text a -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) -- | Default guessing schema. guessSchemaDefault :: Schema t () -- | Default disambiguation schema. disambSchemaDefault :: Schema t () module NLP.Concraft.Guess -- | A guesser represented by the conditional random field. newtype Guesser t Guesser :: CRF Ob t -> Guesser t crf :: Guesser t -> CRF Ob t -- | Determine the k most probable labels for each word in the -- sentence. guess :: Ord t => Int -> Schema t a -> Guesser t -> Sent t -> [[t]] -- | Include guessing results into the sentence. include :: Ord t => Sent t -> [[t]] -> Sent t -- | Tag sentence in external format. Selected interpretations (tags -- correct within the context) will be preserved. guessSent :: Sent s w -> Int -> Schema Tag a -> Guesser Tag -> s -> s -- | Tag file. guessDoc :: Functor f => Doc f s w -> Int -> Schema Tag a -> Guesser Tag -> Text -> Text -- | Train guesser. trainOn :: Foldable f => Doc f s w -> Schema Tag a -> SgdArgs -> FilePath -> Maybe FilePath -> IO (Guesser Tag) instance (Ord t, Binary t) => Binary (Guesser t) module NLP.Concraft.Disamb -- | Split is just a function from an original tag form to a complex tag -- form. type Split r t = r -> t -- | CRF training function. type TrainCRF o t c = IO [SentL o t] -> Maybe (IO [SentL o t]) -> IO c -- | CRF tagging function. type TagCRF o t = Sent o t -> [t] -- | Perform context-sensitive disambiguation. 
disamb :: (Ord r, Ord t) => Schema t a -> Split r t -> TagCRF Ob t -> Sent r -> [r] -- | Tag the sentence. disambSent :: Ord t => Sent s w -> Schema t a -> Split Tag t -> TagCRF Ob t -> s -> s -- | Disambiguate document. disambDoc :: (Functor f, Ord t) => Doc f s w -> Schema t a -> Split Tag t -> TagCRF Ob t -> Text -> Text -- | Train disamb model. trainOn :: (Foldable f, Ord t) => Doc f s w -> Schema t a -> Split Tag t -> TrainCRF Ob t c -> FilePath -> Maybe FilePath -> IO c module NLP.Concraft.Disamb.Tiered -- | Observation. newtype Ob Ob :: Int -> Ob unOb :: Ob -> Int -- | newtype Lb Lb :: Int -> Lb unLb :: Lb -> Int -- | Feature. data Feat TFeat3 :: {-# UNPACK #-} !Lb -> {-# UNPACK #-} !Lb -> {-# UNPACK #-} !Lb -> {-# UNPACK #-} !Int -> Feat x1 :: Feat -> {-# UNPACK #-} !Lb x2 :: Feat -> {-# UNPACK #-} !Lb x3 :: Feat -> {-# UNPACK #-} !Lb ln :: Feat -> {-# UNPACK #-} !Int TFeat2 :: {-# UNPACK #-} !Lb -> {-# UNPACK #-} !Lb -> {-# UNPACK #-} !Int -> Feat x1 :: Feat -> {-# UNPACK #-} !Lb x2 :: Feat -> {-# UNPACK #-} !Lb ln :: Feat -> {-# UNPACK #-} !Int TFeat1 :: {-# UNPACK #-} !Lb -> {-# UNPACK #-} !Int -> Feat x1 :: Feat -> {-# UNPACK #-} !Lb ln :: Feat -> {-# UNPACK #-} !Int OFeat :: {-# UNPACK #-} !Ob -> {-# UNPACK #-} !Lb -> {-# UNPACK #-} !Int -> Feat ob :: Feat -> {-# UNPACK #-} !Ob x1 :: Feat -> {-# UNPACK #-} !Lb ln :: Feat -> {-# UNPACK #-} !Int -- | CRF model data. data CRF a b CRF :: Int -> CodecData a b -> Model FeatMap Ob [Lb] Feat -> CRF a b numOfLayers :: CRF a b -> Int codecData :: CRF a b -> CodecData a b model :: CRF a b -> Model FeatMap Ob [Lb] Feat -- | Train the CRF using the stochastic gradient descent method. Use the -- provided feature selection function to determine model features. train :: (Ord o, Ord t) => Int -> FeatSel Ob [Lb] Feat -> SgdArgs -> TrainCRF o [t] (CRF o t) -- | Find the most probable label sequence. tag :: (Ord o, Ord t) => CRF o t -> TagCRF o [t] -- | A feature selection function type. 
type FeatSel o t f = FeatGen o t f -> Xs o t -> Ys t -> [f] -- | The hiddenFeats adapted to fit feature selection specs. selectHidden :: FeatSel o t f -- | The presentFeats adapted to fit feature selection specs. selectPresent :: FeatSel o t f instance Show Ob instance Eq Ob instance Ord Ob instance Ix Ob instance Binary Ob instance Show Lb instance Eq Lb instance Ord Lb instance Ix Lb instance Binary Lb instance Show Feat instance Eq Feat instance Ord Feat instance (Ord a, Ord b, Binary a, Binary b) => Binary (CRF a b) instance FeatMap FeatMap Feat instance Binary (FeatMap Feat) instance Binary Feat module NLP.Concraft -- | Guessing configuration. data GuessConf r GuessConf :: Int -> Schema r () -> GuessConf r guessNum :: GuessConf r -> Int guessSchema :: GuessConf r -> Schema r () -- | Guessing configuration and model data. data GuessData r GuessData :: GuessConf r -> Guesser r -> GuessData r guessConf :: GuessData r -> GuessConf r guesser :: GuessData r -> Guesser r -- | Disambiguation configuration. data DisambConf r t DisambConf :: Split r t -> Schema t () -> DisambConf r t split :: DisambConf r t -> Split r t disambSchema :: DisambConf r t -> Schema t () -- | Disambiguation configuration with... data DisambWith r t a DisambWith :: DisambConf r t -> a -> DisambWith r t a disambConf :: DisambWith r t a -> DisambConf r t disambWith :: DisambWith r t a -> a -- | Tagging with disambiguation configuration. type DisambTag r t = DisambWith r t (TagCRF Ob t) -- | Training disambiguation model configuration. type DisambTrain r t c = DisambWith r t (TrainCRF Ob t c) -- | Perform disambiguation preceded by context-sensitive guessing. disamb :: (Ord r, Ord t) => GuessData r -> DisambTag r t -> Sent r -> [r] -- | Tag document. disambDoc :: (Functor f, Ord t) => Doc f s w -> GuessData Tag -> DisambTag Tag t -> Text -> Text -- | Train guessing and disambiguation models. 
trainOn :: (Functor f, Foldable f, Ord t) => Doc f s w -> GuessConf Tag -> SgdArgs -> DisambTrain Tag t c -> FilePath -> Maybe FilePath -> IO (Guesser Tag, c)