-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Morphosyntactic tagging tool based on constrained CRFs
--
-- A morphosyntactic tagging tool based on constrained conditional random
-- fields.
@package concraft
@version 0.2.0
module NLP.Concraft.Morphosyntax
-- | A word parametrized over the tag type.
data Word t
-- | Constructor; its arguments correspond, in order, to the 'orth' and
-- 'tags' fields below.
Word :: Text -> Set t -> Word t
-- | Orthographic form.
orth :: Word t -> Text
-- | Set of word interpretations.
tags :: Word t -> Set t
-- | Change the tag type of a word from @a@ to @b@ with the supplied
-- function. The @Ord b@ constraint matches the 'Set' used to store tags.
-- NOTE(review): presumably the function is applied to each element of
-- 'tags' and 'orth' is kept as is -- the implementation is not visible
-- here, confirm.
mapWord :: Ord b => (a -> b) -> Word a -> Word b
-- | A sentence of Words.
type Sent t = [Word t]
-- | Interpretations chosen in the given context with corresponding
-- positive weights.
type Choice t = Map t (Positive Double)
-- | Change the tag type of a 'Choice' from @a@ to @b@ with the supplied
-- function. NOTE(review): how weights are combined when two keys map to
-- the same @b@ is not visible from the signature -- confirm.
mapChoice :: Ord b => (a -> b) -> Choice a -> Choice b
-- | Positive number. No data constructor is listed for this type in
-- this index; the functions below are the listed ways to obtain a value.
data Positive a
-- | Combine two positive numbers into a positive number.
-- NOTE(review): presumably addition -- confirm.
(<+>) :: Num a => Positive a -> Positive a -> Positive a
-- | Wrap a number as a 'Positive' value. NOTE(review): the behaviour for
-- a non-positive argument is not visible from the signature (the result
-- is not wrapped in 'Maybe', so it may raise an error) -- confirm.
mkPositive :: (Num a, Ord a) => a -> Positive a
-- | Retrieve the most probable interpretation.
-- NOTE(review): the result is a plain @t@, so the behaviour for an empty
-- 'Choice' is not visible from the signature -- confirm.
best :: Choice t -> t
-- | A word is considered to be known when the set of possible
-- interpretations is not empty.
known :: Word t -> Bool
instance Show t => Show (Word t)
instance (Ord t, Read t) => Read (Word t)
instance Eq t => Eq (Word t)
instance Ord t => Ord (Word t)
instance Show a => Show (Positive a)
instance Eq a => Eq (Positive a)
instance Ord a => Ord (Positive a)
-- | Simple format for morphosyntax representation which assumes that all
-- tags have a textual representation with no spaces inside and that one
-- of the tags indicates unknown words.
module NLP.Concraft.Plain
-- | No space, space or newline.
data Space
None :: Space
Space :: Space
NewLine :: Space
-- | A token.
data Token
-- | Constructor; its arguments correspond, in order, to the 'orth',
-- 'space', 'known' and 'interps' fields below.
Token :: Text -> Space -> Bool -> Map Interp Bool -> Token
-- | Textual form of the token. NOTE(review): presumably the orthographic
-- form -- confirm.
orth :: Token -> Text
-- | The 'Space' value attached to the token. NOTE(review): whether it
-- describes the whitespace before or after the token is not stated here
-- -- confirm.
space :: Token -> Space
-- | NOTE(review): presumably 'False' for words marked with the
-- unknown-word tag mentioned in the module description -- confirm.
known :: Token -> Bool
-- | Interpretations with disambiguation info.
interps :: Token -> Map Interp Bool
-- | An interpretation: a base form paired with a tag, both textual.
data Interp
-- | Constructor; its arguments correspond, in order, to the 'base' and
-- 'tag' fields below.
Interp :: Text -> Text -> Interp
-- | Base form of the interpretation. NOTE(review): presumably the lemma
-- -- confirm.
base :: Interp -> Text
-- | Tag of the interpretation.
tag :: Interp -> Text
-- | Extract information relevant for tagging.
fromTok :: Token -> (Word Text, Choice Text)
-- | Mark all interpretations with tag component being a member of the
-- given choice set with disamb annotations.
choose :: Token -> Set Text -> Token
-- | Add new interpretations with given disamb annotation.
addInterps :: Bool -> Token -> [Interp] -> Token
-- | Add new interpretations with None base and given disamb
-- annotation.
addNones :: Bool -> Token -> [Text] -> Token
-- | Read sentences, each a list of tokens, from the given file.
-- NOTE(review): the descriptions in this group are derived from names and
-- signatures only. The leading 'Text' argument here and in the functions
-- below is presumably the tag marking unknown words (see the module
-- description) -- confirm.
readPlain :: Text -> FilePath -> IO [[Token]]
-- | Pure counterpart of 'readPlain': parse sentences from text.
parsePlain :: Text -> Text -> [[Token]]
-- | Parse a single sentence from text.
parseSent :: Text -> Text -> [Token]
-- | Write sentences to the given file.
writePlain :: Text -> FilePath -> [[Token]] -> IO ()
-- | Pure counterpart of 'writePlain': render sentences as text.
showPlain :: Text -> [[Token]] -> Text
-- | Render a single sentence as text.
showSent :: Text -> [Token] -> Text
-- | Render a single token as text.
showWord :: Text -> Token -> Text
instance Show Space
instance Eq Space
instance Ord Space
instance Show Interp
instance Eq Interp
instance Ord Interp
instance Show Token
instance Eq Token
instance Ord Token
module NLP.Concraft.Guess
-- | The Ox monad specialized to word token type and text observations.
-- TODO: Move to monad-ox package from here and from the nerf library.
type Ox t a = Ox (Word t) Text a
-- | A schema is a block of the Ox computation performed within the context
-- of the sentence and the absolute sentence position.
type Schema t a = Vector (Word t) -> Int -> Ox t a
-- | An observation consists of an index (of list type) and an actual
-- observation value.
type Ob = ([Int], Text)
-- | A 'Schema' with unit result, polymorphic in the tag type.
-- NOTE(review): presumably the schema applied by 'schematize' -- its
-- definition is not part of this listing, confirm.
schema :: Schema t ()
-- | Schematize the input sentence according to schema rules.
schematize :: Ord t => Sent t -> Sent Ob t
-- | A guesser represented by the conditional random field.
data Guesser t
Guesser :: CRF Ob t -> t -> Guesser t
-- | The CRF model
crf :: Guesser t -> CRF Ob t
-- | The tag indicating unknown words
ign :: Guesser t -> t
-- | Determine the k most probable labels for each unknown word in
-- the sentence.
guess :: Ord t => Int -> Guesser t -> Sent t -> [[t]]
-- | Tag the file. NOTE(review): the 'Int' argument is presumably the
-- same @k@ as in 'guess' (number of labels per unknown word) -- confirm.
tagFile :: Int -> Guesser Text -> FilePath -> IO Text
-- | Train a 'Guesser' model.
-- NOTE(review): argument roles are not documented; presumably the 'Text'
-- is the unknown-word tag, the 'FilePath' the training data and the
-- optional 'FilePath' evaluation data -- confirm.
-- TODO: Abstract over the format type.
learn :: SgdArgs -> Text -> FilePath -> Maybe FilePath -> IO (Guesser Text)
instance (Ord t, Binary t) => Binary (Guesser t)
module NLP.Concraft.Disamb
-- | A tier description.
data Tier
Tier :: Bool -> Set Attr -> Tier
-- | Does it include the part of speech?
withPos :: Tier -> Bool
-- | Tier grammatical attributes.
withAtts :: Tier -> Set Attr
-- | A tag with optional POS.
data Tag
-- | Constructor; its arguments correspond, in order, to the 'pos' and
-- 'atts' fields below.
Tag :: Maybe POS -> Map Attr Text -> Tag
-- | Part of speech, if present.
pos :: Tag -> Maybe POS
-- | Attribute values of the tag, keyed by attribute.
atts :: Tag -> Map Attr Text
-- | Select tier attributes.
select :: Tier -> Tag -> Tag
-- | Split tags between two layers. TODO: Add support for multiple layers.
splitWord :: TierConf -> Word Tag -> Word (Tag, Tag)
-- | Sentence-level counterpart of 'splitWord'. NOTE(review): presumably
-- applies 'splitWord' to every word of the sentence -- confirm.
splitSent :: TierConf -> Sent Tag -> Sent (Tag, Tag)
-- | The Ox monad specialized to word token type and text observations.
-- TODO: Move to monad-ox package from here and from the nerf library.
type Ox t a = Ox (Word t) Text a
-- | A schema is a block of the Ox computation performed within the context
-- of the sentence and the absolute sentence position.
type Schema t a = Vector (Word t) -> Int -> Ox t a
-- | An observation consists of an index (of list type) and an actual
-- observation value.
type Ob = ([Int], Text)
-- | A 'Schema' with unit result, polymorphic in the tag type.
-- NOTE(review): presumably the schema applied by 'schematize' -- its
-- definition is not part of this listing, confirm.
schema :: Schema t ()
-- | Schematize the input sentence according to schema rules.
schematize :: Sent t -> Sent Ob t
-- | Tier configuration: a pair of tier descriptions.
type TierConf = (Tier, Tier)
-- | Split a tag into a pair of tags according to the tier configuration.
-- NOTE(review): presumably each component is obtained with 'select' for
-- the corresponding tier -- confirm.
tear :: TierConf -> Tag -> (Tag, Tag)
-- | Unsplit the list of tag pairs. TODO: It can be done without the help
-- of original word.
deTear :: TierConf -> Word Tag -> (Tag, Tag) -> Tag
-- | Sentence-level counterpart of 'deTear'. NOTE(review): presumably
-- pairs each word with the tag pair at the same position; handling of
-- lists of different lengths is not visible here -- confirm.
deTears :: TierConf -> Sent Tag -> [(Tag, Tag)] -> [Tag]
-- | The disambiguation model.
data Disamb
-- | Determine the most probable label sequence.
disamb :: Disamb -> Sent Tag -> [Tag]
-- | Tag the file. NOTE(review): the 'Text' argument is presumably the
-- unknown-word tag -- confirm.
tagFile :: Text -> Disamb -> FilePath -> IO Text
-- | Train a 'Disamb' model.
-- NOTE(review): argument roles are not documented in this listing --
-- confirm each against the implementation.
-- TODO: Abstract over the format type.
learn :: SgdArgs -> FilePath -> Text -> TierConf -> FilePath -> Maybe FilePath -> IO Disamb
instance Show Tag
instance Eq Tag
instance Ord Tag
instance Binary Disamb
instance Binary Tag
instance Binary Tier