-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Morphosyntactic tagging tool based on constrained CRFs
--   
--   A morphosyntactic tagging tool based on constrained conditional random
--   fields.
@package concraft
@version 0.2.0

module NLP.Concraft.Morphosyntax

-- | A word parametrized over the tag type.
data Word t

-- | Build a word from its orthographic form (<a>orth</a>) and its set of
--   interpretations (<a>tags</a>).
Word :: Text -> Set t -> Word t

-- | Orthographic form.
orth :: Word t -> Text

-- | Set of word interpretations.
tags :: Word t -> Set t

-- | Convert a word over tags of type <tt>a</tt> into a word over tags of
--   type <tt>b</tt> using the given function. NOTE(review): presumably
--   the function is applied to every element of <a>tags</a> while
--   <a>orth</a> is kept; the implementation is not visible in this index,
--   so confirm.
mapWord :: Ord b => (a -> b) -> Word a -> Word b

-- | A sentence of <a>Word</a>s.
type Sent t = [Word t]

-- | Interpretations chosen in the given context with corresponding
--   positive weights.
type Choice t = Map t (Positive Double)

-- | Re-key a <a>Choice</a> from tags of type <tt>a</tt> to tags of type
--   <tt>b</tt> using the given function. NOTE(review): how weights are
--   combined when two keys map to the same result is not visible in this
--   index; confirm against the implementation.
mapChoice :: Ord b => (a -> b) -> Choice a -> Choice b

-- | Positive number. No data constructor is listed in this index, so
--   values are presumably built with <a>mkPositive</a>.
data Positive a

-- | Combine two positive numbers into one. NOTE(review): presumably
--   addition, judging by the operator name and the <tt>Num</tt>
--   constraint; confirm.
(<+>) :: Num a => Positive a -> Positive a -> Positive a

-- | Wrap a number as a <a>Positive</a> value. NOTE(review): the
--   behaviour for zero or negative arguments is not visible in this
--   index; confirm before passing unchecked input.
mkPositive :: (Num a, Ord a) => a -> Positive a

-- | Retrieve the most probable interpretation. NOTE(review): the result
--   for an empty <a>Choice</a> is not documented here and the signature
--   offers no way to report absence; confirm before calling it on a
--   possibly empty map.
best :: Choice t -> t

-- | A word is considered to be known when the set of possible
--   interpretations is not empty.
known :: Word t -> Bool
instance Show t => Show (Word t)
instance (Ord t, Read t) => Read (Word t)
instance Eq t => Eq (Word t)
instance Ord t => Ord (Word t)
instance Show a => Show (Positive a)
instance Eq a => Eq (Positive a)
instance Ord a => Ord (Positive a)


-- | Simple format for morphosyntax representation which assumes that all
--   tags have a textual representation with no spaces inside and that one
--   of the tags indicates unknown words.
module NLP.Concraft.Plain

-- | No space, space or newline. This is the type of the <a>space</a>
--   field of a <a>Token</a>.
data Space
None :: Space
Space :: Space
NewLine :: Space

-- | A token.
data Token

-- | Build a token. The argument types line up with the fields
--   <a>orth</a>, <a>space</a>, <a>known</a> and <a>interps</a>.
Token :: Text -> Space -> Bool -> Map Interp Bool -> Token

-- | Orthographic form of the token.
orth :: Token -> Text

-- | Whitespace information attached to the token. NOTE(review): whether
--   it describes the whitespace before or after the token is not visible
--   in this index; confirm.
space :: Token -> Space

-- | Whether the token is a known word. NOTE(review): presumably false
--   for tokens carrying the unknown-word tag mentioned in the module
--   description; confirm.
known :: Token -> Bool

-- | Interpretations with disambiguation info.
interps :: Token -> Map Interp Bool

-- | An interpretation of a token, carrying two textual components:
--   <a>base</a> and <a>tag</a>.
data Interp
Interp :: Text -> Text -> Interp

-- | Base component of the interpretation. NOTE(review): presumably the
--   base form (lemma) of the word; confirm.
base :: Interp -> Text

-- | Tag component of the interpretation.
tag :: Interp -> Text

-- | Extract information relevant for tagging.
fromTok :: Token -> (Word Text, Choice Text)

-- | Mark all interpretations with tag component being a member of the
--   given choice set with disamb annotations.
choose :: Token -> Set Text -> Token

-- | Add new interpretations with given disamb annotation.
addInterps :: Bool -> Token -> [Interp] -> Token

-- | Add new interpretations with <a>None</a> base and given disamb
--   annotation. NOTE(review): <a>base</a> is of type <tt>Text</tt>, so
--   the <a>None</a> link (which resolves to the <a>Space</a> constructor)
--   presumably stands for a textual placeholder base; the list argument
--   presumably holds the tags of the new interpretations. Confirm.
addNones :: Bool -> Token -> [Text] -> Token
-- | Read sentences of tokens from a file. NOTE(review): the leading
--   <tt>Text</tt> argument, shared by every function in this group, is
--   presumably the tag that marks unknown words (see the module
--   description); confirm.
readPlain :: Text -> FilePath -> IO [[Token]]

-- | Parse text into sentences of tokens. NOTE(review): presumably the
--   pure counterpart of <a>readPlain</a>, with the second argument being
--   the input text; confirm.
parsePlain :: Text -> Text -> [[Token]]

-- | Parse text into the tokens of one sentence. NOTE(review): the role
--   of the first <tt>Text</tt> argument is not documented; see
--   <a>readPlain</a>.
parseSent :: Text -> Text -> [Token]

-- | Write sentences of tokens to a file. NOTE(review): presumably the
--   inverse of <a>readPlain</a>; confirm.
writePlain :: Text -> FilePath -> [[Token]] -> IO ()

-- | Render sentences of tokens as text. NOTE(review): presumably the
--   pure counterpart of <a>writePlain</a>; confirm.
showPlain :: Text -> [[Token]] -> Text

-- | Render the tokens of one sentence as text.
showSent :: Text -> [Token] -> Text

-- | Render a single token as text.
showWord :: Text -> Token -> Text
instance Show Space
instance Eq Space
instance Ord Space
instance Show Interp
instance Eq Interp
instance Ord Interp
instance Show Token
instance Eq Token
instance Ord Token

module NLP.Concraft.Guess

-- | The Ox monad specialized to word token type and text observations.
--   TODO: Move to monad-ox package from here and from the nerf library.
--   NOTE(review): module qualifiers are not shown in this index, so the
--   <tt>Ox</tt> on the right-hand side is presumably the type exported by
--   the monad-ox package rather than this synonym; confirm.
type Ox t a = Ox (Word t) Text a

-- | A schema is a block of the Ox computation performed within the context
--   of the sentence and the absolute sentence position.
type Schema t a = Vector (Word t) -> Int -> Ox t a

-- | An observation consists of an index (of list type) and an actual
--   observation value.
type Ob = ([Int], Text)

-- | The observation schema whose rules <a>schematize</a> applies.
schema :: Schema t ()

-- | Schematize the input sentence according to <a>schema</a> rules.
--   NOTE(review): the two-argument <tt>Sent Ob t</tt> in the result
--   cannot be the one-parameter <a>Sent</a> of the argument; it is
--   presumably a sentence type from the CRF library, shown here without
--   its module qualifier. Confirm.
schematize :: Ord t => Sent t -> Sent Ob t

-- | A guesser represented by the conditional random field.
data Guesser t

-- | Build a guesser from the CRF model (<a>crf</a>) and the tag
--   indicating unknown words (<a>ign</a>).
Guesser :: CRF Ob t -> t -> Guesser t

-- | The CRF model
crf :: Guesser t -> CRF Ob t

-- | The tag indicating unknown words
ign :: Guesser t -> t

-- | Determine the <tt>k</tt> most probable labels for each unknown word in
--   the sentence.
guess :: Ord t => Int -> Guesser t -> Sent t -> [[t]]

-- | Tag the file. NOTE(review): the <tt>Int</tt> argument is presumably
--   the number <tt>k</tt> of labels passed on to <a>guess</a>, and the
--   result the tagged file contents; confirm.
tagFile :: Int -> Guesser Text -> FilePath -> IO Text

-- | Train a <a>Guesser</a> over textual tags.
--   TODO: Abstract over the format type.
--   NOTE(review): the argument roles are not documented; presumably the
--   <tt>Text</tt> is the unknown-word tag, the <tt>FilePath</tt> the
--   training file and the <tt>Maybe FilePath</tt> an optional evaluation
--   file. Confirm against the implementation.
learn :: SgdArgs -> Text -> FilePath -> Maybe FilePath -> IO (Guesser Text)
instance (Ord t, Binary t) => Binary (Guesser t)

module NLP.Concraft.Disamb

-- | A tier description.
data Tier

-- | Build a tier description from the part-of-speech flag
--   (<a>withPos</a>) and the set of grammatical attributes
--   (<a>withAtts</a>).
Tier :: Bool -> Set Attr -> Tier

-- | Does it include the part of speech?
withPos :: Tier -> Bool

-- | Tier grammatical attributes.
withAtts :: Tier -> Set Attr

-- | A tag with optional POS.
data Tag

-- | Build a tag from an optional part of speech (<a>pos</a>) and a map
--   of attribute values (<a>atts</a>).
Tag :: Maybe POS -> Map Attr Text -> Tag

-- | Part of speech of the tag, if any.
pos :: Tag -> Maybe POS

-- | Textual values of the tag, keyed by attribute.
atts :: Tag -> Map Attr Text

-- | Select tier attributes.
select :: Tier -> Tag -> Tag

-- | Split tags between two layers. TODO: Add support for multiple layers.
splitWord :: TierConf -> Word Tag -> Word (Tag, Tag)

-- | Sentence-level variant of <a>splitWord</a>. NOTE(review): presumably
--   <a>splitWord</a> applied to every word of the sentence; confirm.
splitSent :: TierConf -> Sent Tag -> Sent (Tag, Tag)

-- | The Ox monad specialized to word token type and text observations.
--   TODO: Move to monad-ox package from here and from the nerf library.
--   NOTE(review): module qualifiers are not shown in this index, so the
--   <tt>Ox</tt> on the right-hand side is presumably the type exported by
--   the monad-ox package rather than this synonym; confirm.
type Ox t a = Ox (Word t) Text a

-- | A schema is a block of the Ox computation performed within the context
--   of the sentence and the absolute sentence position.
type Schema t a = Vector (Word t) -> Int -> Ox t a

-- | An observation consists of an index (of list type) and an actual
--   observation value.
type Ob = ([Int], Text)

-- | The observation schema whose rules <a>schematize</a> applies.
schema :: Schema t ()

-- | Schematize the input sentence according to <a>schema</a> rules.
--   NOTE(review): the two-argument <tt>Sent Ob t</tt> in the result
--   cannot be the one-parameter <a>Sent</a> of the argument; it is
--   presumably a sentence type from the CRF library, shown here without
--   its module qualifier. Confirm.
schematize :: Sent t -> Sent Ob t
-- | A tier configuration: a pair of <a>Tier</a> descriptions.
type TierConf = (Tier, Tier)

-- | Split a tag into a pair of tags according to the tier configuration.
--   NOTE(review): presumably each component is obtained with
--   <a>select</a> for the corresponding tier; confirm.
tear :: TierConf -> Tag -> (Tag, Tag)

-- | Unsplit the list of tag pairs. TODO: It can be done without the help
--   of original word.
deTear :: TierConf -> Word Tag -> (Tag, Tag) -> Tag

-- | Sentence-level variant of <a>deTear</a>. NOTE(review): presumably
--   pairs each word of the sentence with the tag pair at the same
--   position; behaviour for lists of different lengths is not visible
--   in this index. Confirm.
deTears :: TierConf -> Sent Tag -> [(Tag, Tag)] -> [Tag]

-- | The disambiguation model. No data constructor or field is listed in
--   this index; NOTE(review): values are presumably obtained from
--   <a>learn</a> or through the <tt>Binary</tt> instance. Confirm.
data Disamb

-- | Determine the most probable label sequence.
disamb :: Disamb -> Sent Tag -> [Tag]

-- | Tag the file. NOTE(review): the role of the leading <tt>Text</tt>
--   argument is not documented; presumably it is the tag that marks
--   unknown words in the plain format. Confirm.
tagFile :: Text -> Disamb -> FilePath -> IO Text

-- | Train a <a>Disamb</a> model.
--   TODO: Abstract over the format type.
--   NOTE(review): the argument roles are not documented; presumably the
--   first <tt>FilePath</tt> is a tagset definition, the <tt>Text</tt> the
--   unknown-word tag, the second <tt>FilePath</tt> the training file and
--   the <tt>Maybe FilePath</tt> an optional evaluation file. Confirm
--   against the implementation.
learn :: SgdArgs -> FilePath -> Text -> TierConf -> FilePath -> Maybe FilePath -> IO Disamb
instance Show Tag
instance Eq Tag
instance Ord Tag
instance Binary Disamb
instance Binary Tag
instance Binary Tier