-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Morphosyntactic tagging tool based on constrained CRFs -- -- A morphosyntactic tagging tool based on constrained conditional random -- fields. @package concraft @version 0.2.0 module NLP.Concraft.Morphosyntax -- | A word parametrized over the tag type. data Word t Word :: Text -> Set t -> Word t -- | Orthographic form. orth :: Word t -> Text -- | Set of word interpretations. tags :: Word t -> Set t mapWord :: Ord b => (a -> b) -> Word a -> Word b -- | A sentence of Words. type Sent t = [Word t] -- | Interpretations chosen in the given context with corresponding -- positive weights. type Choice t = Map t (Positive Double) mapChoice :: Ord b => (a -> b) -> Choice a -> Choice b -- | Positive number. data Positive a (<+>) :: Num a => Positive a -> Positive a -> Positive a mkPositive :: (Num a, Ord a) => a -> Positive a -- | Retrieve the most probable interpretation. best :: Choice t -> t -- | A word is considered to be known when the set of possible -- interpretations is not empty. known :: Word t -> Bool instance Show t => Show (Word t) instance (Ord t, Read t) => Read (Word t) instance Eq t => Eq (Word t) instance Ord t => Ord (Word t) instance Show a => Show (Positive a) instance Eq a => Eq (Positive a) instance Ord a => Ord (Positive a) -- | Simple format for morphosyntax representation which assumes that all -- tags have a textual representation with no spaces inside and that one -- of the tags indicates unknown words. module NLP.Concraft.Plain -- | No space, space or newline. data Space None :: Space Space :: Space NewLine :: Space -- | A token. data Token Token :: Text -> Space -> Bool -> Map Interp Bool -> Token orth :: Token -> Text space :: Token -> Space known :: Token -> Bool -- | Interpretations with disambiguation info. 
interps :: Token -> Map Interp Bool data Interp Interp :: Text -> Text -> Interp base :: Interp -> Text tag :: Interp -> Text -- | Extract information relevant for tagging. fromTok :: Token -> (Word Text, Choice Text) -- | Mark all interpretations with tag component being a member of the -- given choice set with disamb annotations. choose :: Token -> Set Text -> Token -- | Add new interpretations with given disamb annotation. addInterps :: Bool -> Token -> [Interp] -> Token -- | Add new interpretations with None base and given disamb -- annotation. addNones :: Bool -> Token -> [Text] -> Token readPlain :: Text -> FilePath -> IO [[Token]] parsePlain :: Text -> Text -> [[Token]] parseSent :: Text -> Text -> [Token] writePlain :: Text -> FilePath -> [[Token]] -> IO () showPlain :: Text -> [[Token]] -> Text showSent :: Text -> [Token] -> Text showWord :: Text -> Token -> Text instance Show Space instance Eq Space instance Ord Space instance Show Interp instance Eq Interp instance Ord Interp instance Show Token instance Eq Token instance Ord Token module NLP.Concraft.Guess -- | The Ox monad specialized to word token type and text observations. -- TODO: Move to monad-ox package from here and from the nerf library. type Ox t a = Ox (Word t) Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema t a = Vector (Word t) -> Int -> Ox t a -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) schema :: Schema t () -- | Schematize the input sentence according to schema rules. schematize :: Ord t => Sent t -> Sent Ob t -- | A guesser represented by the conditional random field. data Guesser t Guesser :: CRF Ob t -> t -> Guesser t -- | The CRF model crf :: Guesser t -> CRF Ob t -- | The tag indicating unknown words ign :: Guesser t -> t -- | Determine the k most probable labels for each unknown word in -- the sentence. 
guess :: Ord t => Int -> Guesser t -> Sent t -> [[t]] -- | Tag the file. tagFile :: Int -> Guesser Text -> FilePath -> IO Text -- | TODO: Abstract over the format type. learn :: SgdArgs -> Text -> FilePath -> Maybe FilePath -> IO (Guesser Text) instance (Ord t, Binary t) => Binary (Guesser t) module NLP.Concraft.Disamb -- | A tier description. data Tier Tier :: Bool -> Set Attr -> Tier -- | Does it include the part of speech? withPos :: Tier -> Bool -- | Tier grammatical attributes. withAtts :: Tier -> Set Attr -- | A tag with optional POS. data Tag Tag :: Maybe POS -> Map Attr Text -> Tag pos :: Tag -> Maybe POS atts :: Tag -> Map Attr Text -- | Select tier attributes. select :: Tier -> Tag -> Tag -- | Split tags between two layers. TODO: Add support for multiple layers. splitWord :: TierConf -> Word Tag -> Word (Tag, Tag) splitSent :: TierConf -> Sent Tag -> Sent (Tag, Tag) -- | The Ox monad specialized to word token type and text observations. -- TODO: Move to monad-ox package from here and from the nerf library. type Ox t a = Ox (Word t) Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema t a = Vector (Word t) -> Int -> Ox t a -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) schema :: Schema t () -- | Schematize the input sentence according to schema rules. schematize :: Sent t -> Sent Ob t type TierConf = (Tier, Tier) tear :: TierConf -> Tag -> (Tag, Tag) -- | Unsplit the list of tag pairs. TODO: It can be done without the help -- of the original word. deTear :: TierConf -> Word Tag -> (Tag, Tag) -> Tag deTears :: TierConf -> Sent Tag -> [(Tag, Tag)] -> [Tag] -- | The disambiguation model. data Disamb -- | Determine the most probable label sequence. disamb :: Disamb -> Sent Tag -> [Tag] -- | Tag the file. tagFile :: Text -> Disamb -> FilePath -> IO Text -- | TODO: Abstract over the format type. 
learn :: SgdArgs -> FilePath -> Text -> TierConf -> FilePath -> Maybe FilePath -> IO Disamb instance Show Tag instance Eq Tag instance Ord Tag instance Binary Disamb instance Binary Tag instance Binary Tier