-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Morphosyntactic tagging tool based on constrained CRFs -- -- A morphosyntactic tagging tool based on constrained conditional random -- fields. @package concraft @version 0.1.0 module NLP.Concraft.Morphosyntax -- | A word parametrized over the tag type. data Word t Word :: Text -> Set t -> Word t -- | Orthographic form. orth :: Word t -> Text -- | Set of word interpretations. tags :: Word t -> Set t -- | A sentence of Words. type Sent t = [Word t] -- | Interpretations chosen in the given context with corresponding -- positive weights. type Choice t = Map t (Positive Double) -- | Positive number. data Positive a (<+>) :: Num a => Positive a -> Positive a -> Positive a mkPositive :: (Num a, Ord a) => a -> Positive a -- | Retrieve the most probable interpretation. best :: Choice t -> t -- | A word is considered to be known when the set of possible -- interpretations is not empty. known :: Word t -> Bool instance Show t => Show (Word t) instance (Ord t, Read t) => Read (Word t) instance Eq t => Eq (Word t) instance Ord t => Ord (Word t) instance Show a => Show (Positive a) instance Eq a => Eq (Positive a) instance Ord a => Ord (Positive a) -- | Simple format for morphosyntax representation which assumes that all -- tags have a textual representation with no spaces inside and that one -- of the tags indicates unknown words. module NLP.Concraft.Plain -- | No space, space or newline. data Space None :: Space Space :: Space NewLine :: Space -- | A token. data Token Token :: Text -> Space -> Bool -> Map Interp Bool -> Token orth :: Token -> Text space :: Token -> Space known :: Token -> Bool -- | Interpretations with disambiguation info. interps :: Token -> Map Interp Bool data Interp Interp :: Text -> Text -> Interp base :: Interp -> Text tag :: Interp -> Text -- | Extract information relevant for tagging. 
fromTok :: Token -> (Word Text, Choice Text) -- | Mark all interpretations with tag component being a member of the -- given choice set with disamb annotations. choose :: Token -> Set Text -> Token -- | Add new interpretations with given disamb annotation. addInterps :: Bool -> Token -> [Interp] -> Token -- | Add new interpretations with None base and given disamb -- annotation. addNones :: Bool -> Token -> [Text] -> Token readPlain :: Text -> FilePath -> IO [[Token]] parsePlain :: Text -> Text -> [[Token]] parseSent :: Text -> Text -> [Token] writePlain :: Text -> FilePath -> [[Token]] -> IO () showPlain :: Text -> [[Token]] -> Text showSent :: Text -> [Token] -> Text showWord :: Text -> Token -> Text instance Show Space instance Eq Space instance Ord Space instance Show Interp instance Eq Interp instance Ord Interp instance Show Token instance Eq Token instance Ord Token module NLP.Concraft.Guess -- | The Ox monad specialized to word token type and text observations. -- TODO: Move to monad-ox package from here and from the nerf library. type Ox t a = Ox (Word t) Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema t a = Vector (Word t) -> Int -> Ox t a -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) schema :: Schema t () -- | Schematize the input sentence according to the schema rules. schematize :: Ord t => Sent t -> Sent Ob t -- | A guesser represented by the conditional random field. data Guesser t Guesser :: CRF Ob t -> t -> Guesser t -- | The CRF model crf :: Guesser t -> CRF Ob t -- | The tag indicating unknown words ign :: Guesser t -> t -- | Determine the k most probable labels for each unknown word in -- the sentence. guess :: Ord t => Int -> Guesser t -> Sent t -> [[t]] -- | Tag the file. tagFile :: Int -> Guesser Text -> FilePath -> IO Text -- | TODO: Abstract over the format type. 
learn :: SgdArgs -> Text -> FilePath -> Maybe FilePath -> IO (Guesser Text) instance (Ord t, Binary t) => Binary (Guesser t)