-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Morphosyntactic tagging tool based on constrained CRFs
--
-- A morphosyntactic tagging tool based on constrained conditional random
-- fields.
@package concraft
@version 0.4.0
-- | Types and functions related to the morphosyntax data layer.
module NLP.Concraft.Morphosyntax
-- | A sentence of Words.
type Sent t = [Word t]
-- | A word parametrized over a tag type.
data Word t
Word :: Text -> WMap t -> Bool -> Word t
-- | Orthographic form.
orth :: Word t -> Text
-- | Set of word interpretations. To each interpretation a weight of
-- correctness within the context is assigned.
tagWMap :: Word t -> WMap t
-- | Out-of-vocabulary (OOV) word, i.e. word unknown to the morphosyntactic
-- analyser.
oov :: Word t -> Bool
-- | Map function over word tags.
mapWord :: Ord b => (a -> b) -> Word a -> Word b
-- | Map function over sentence tags.
mapSent :: Ord b => (a -> b) -> Sent a -> Sent b
-- | Interpretations of the word.
interpsSet :: Word t -> Set t
-- | Interpretations of the word.
interps :: Word t -> [t]
-- | A weighted collection of type a elements.
data WMap a
-- | Make a weighted collection.
mkWMap :: Ord a => [(a, Double)] -> WMap a
-- | Map function over weighted collection elements.
mapWMap :: Ord b => (a -> b) -> WMap a -> WMap b
instance Show a => Show (WMap a)
instance Eq a => Eq (WMap a)
instance Ord a => Ord (WMap a)
instance Show t => Show (Word t)
instance Eq t => Eq (Word t)
instance Ord t => Ord (Word t)
-- | The module provides several abstractions for representing external
-- data formats. Concraft will be able to work with any format which
-- implements those abstractions.
module NLP.Concraft.Format
-- | Textual representation of a morphosyntactic tag.
type Tag = Text
-- | Word handler.
data Word w
Word :: (w -> Word Tag) -> (WMap Tag -> w -> w) -> Word w
-- | Extract information relevant for tagging.
extract :: Word w -> w -> Word Tag
-- | Select the set of morphosyntactic interpretations.
select :: Word w -> WMap Tag -> w -> w
-- | Sentence handler.
data Sent s w
Sent :: (s -> [w]) -> ([w] -> s -> s) -> Word w -> Sent s w
-- | Split sentence into a list of words.
parseSent :: Sent s w -> s -> [w]
-- | Merge words with a sentence.
mergeSent :: Sent s w -> [w] -> s -> s
-- | Word handler.
wordHandler :: Sent s w -> Word w
-- | Document format.
data Doc f s w
Doc :: (Text -> f s) -> (f s -> Text) -> Sent s w -> Doc f s w
-- | Parse textual interpretations into a functor with sentence elements.
parseDoc :: Doc f s w -> Text -> f s
-- | Show textual representation of a document.
showDoc :: Doc f s w -> f s -> Text
-- | Sentence handler.
sentHandler :: Doc f s w -> Sent s w
-- | Observation schema blocks for Concraft.
module NLP.Concraft.Schema
-- | An observation consists of an index (of list type) and an actual
-- observation value.
type Ob = ([Int], Text)
-- | The Ox monad specialized to word token type and text observations.
type Ox t a = Ox (Word t) Text a
-- | A schema is a block of the Ox computation performed within the context
-- of the sentence and the absolute sentence position.
type Schema t a = Vector (Word t) -> Int -> Ox t a
-- | A dummy schema block.
void :: a -> Schema t a
-- | Sequence the list of schemas (or blocks) and discard individual
-- values.
sequenceS_ :: [Vector (Word t) -> a -> Ox t b] -> Vector (Word t) -> a -> Ox t ()
-- | Use the schema to extract observations from the sentence.
schematize :: Schema t a -> Sent t -> [[Ob]]
-- | Body of configuration entry.
data Body a
Body :: [Int] -> Bool -> a -> Body a
-- | Range argument for the schema block.
range :: Body a -> [Int]
-- | When true, the entry is used only for oov words.
oovOnly :: Body a -> Bool
-- | Additional arguments for the schema block.
args :: Body a -> a
-- | Maybe entry.
type Entry a = Maybe (Body a)
-- | Plain entry with no additional arguments.
entry :: [Int] -> Entry ()
-- | Entry with additional arguments.
entryWith :: a -> [Int] -> Entry a
-- | Configuration of the schema. All configuration elements specify the
-- range over which a particular observation type should be taken into
-- account. For example, the [-1, 0, 2] range means that
-- observations of particular type will be extracted with respect to
-- previous (k - 1), current (k) and after the next
-- (k + 2) positions when identifying the observation set for
-- position k in the input sentence.
data SchemaConf
SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry () -> Entry () -> Entry () -> Entry () -> SchemaConf
-- | The orthB schema block.
orthC :: SchemaConf -> Entry ()
-- | The lowOrthB schema block.
lowOrthC :: SchemaConf -> Entry ()
-- | The lowPrefixesB schema block. The first list of ints
-- represents lengths of prefixes.
lowPrefixesC :: SchemaConf -> Entry [Int]
-- | The lowSuffixesB schema block. The first list of ints
-- represents lengths of suffixes.
lowSuffixesC :: SchemaConf -> Entry [Int]
-- | The knownB schema block.
knownC :: SchemaConf -> Entry ()
-- | The shapeB schema block.
shapeC :: SchemaConf -> Entry ()
-- | The packedB schema block.
packedC :: SchemaConf -> Entry ()
-- | The begPackedB schema block.
begPackedC :: SchemaConf -> Entry ()
-- | Null configuration of the observation schema.
nullConf :: SchemaConf
-- | Build the schema based on the configuration.
fromConf :: SchemaConf -> Schema t ()
-- | Default configuration for the guessing observation schema.
guessConfDefault :: SchemaConf
-- | Default configuration for the disambiguation observation schema.
disambConfDefault :: SchemaConf
-- | A block is a chunk of the Ox computation performed within the context
-- of the sentence and the list of absolute sentence positions.
type Block t a = Vector (Word t) -> [Int] -> Ox t a
-- | Transform a block to a schema depending on: (1) a list of relative
-- sentence positions; (2) a boolean value which, if true, restricts the
-- block computation to positions where an OOV word resides.
fromBlock :: Block t a -> [Int] -> Bool -> Schema t a
-- | Orthographic form at the current position.
orthB :: Block t ()
-- | Lowercased orthographic form at the current position.
lowOrthB :: Block t ()
-- | List of lowercased prefixes of given lengths.
lowPrefixesB :: [Int] -> Block t ()
-- | List of lowercased suffixes of given lengths.
lowSuffixesB :: [Int] -> Block t ()
-- | Observation indicating whether the word is known (i.e. not OOV).
knownB :: Block t ()
-- | Shape of the word.
shapeB :: Block t ()
-- | Packed shape of the word.
packedB :: Block t ()
-- | Packed shape of the word.
begPackedB :: Block t ()
instance Show a => Show (Body a)
instance Show SchemaConf
instance Binary SchemaConf
instance Binary a => Binary (Body a)
module NLP.Concraft.Guess
-- | A guessing model.
data Guesser t
Guesser :: SchemaConf -> CRF Ob t -> Guesser t
schemaConf :: Guesser t -> SchemaConf
crf :: Guesser t -> CRF Ob t
-- | Determine the k most probable labels for each word in the
-- sentence.
guess :: Ord t => Int -> Guesser t -> Sent t -> [[t]]
-- | Tag sentence in external format. Selected interpretations (tags
-- correct within the context) will be preserved.
guessSent :: Sent s w -> Int -> Guesser Tag -> s -> s
-- | Tag file.
guessDoc :: Functor f => Doc f s w -> Int -> Guesser Tag -> Text -> Text
-- | Include guessing results into the sentence.
include :: Ord t => Sent t -> [[t]] -> Sent t
-- | Training configuration.
data TrainConf
TrainConf :: SchemaConf -> SgdArgs -> TrainConf
schemaConfT :: TrainConf -> SchemaConf
sgdArgsT :: TrainConf -> SgdArgs
-- | Train guesser.
train :: Foldable f => Doc f s w -> TrainConf -> FilePath -> Maybe FilePath -> IO (Guesser Tag)
instance (Ord t, Binary t) => Binary (Guesser t)
module NLP.Concraft.Disamb
-- | A disambiguation model.
data Disamb
Disamb :: Tagset -> [Tier] -> SchemaConf -> CRF Ob Atom -> Disamb
tagset :: Disamb -> Tagset
tiers :: Disamb -> [Tier]
schemaConf :: Disamb -> SchemaConf
crf :: Disamb -> CRF Ob Atom
-- | CRF model data.
data CRF a b
-- | A tier description.
data Tier
Tier :: Bool -> Set Attr -> Tier
-- | Does it include the part of speech?
withPos :: Tier -> Bool
-- | Tier grammatical attributes.
withAtts :: Tier -> Set Attr
-- | An atomic part of morphosyntactic tag with optional POS.
data Atom
Atom :: Maybe POS -> Map Attr Text -> Atom
pos :: Atom -> Maybe POS
atts :: Atom -> Map Attr Text
-- | Default tiered tagging configuration.
tiersDefault :: [Tier]
-- | Perform context-sensitive disambiguation.
disamb :: Disamb -> Sent Tag -> [Tag]
-- | Tag the sentence.
disambSent :: Sent s w -> Disamb -> s -> s
-- | Disambiguate document.
disambDoc :: Functor f => Doc f s w -> Disamb -> Text -> Text
-- | Training configuration.
data TrainConf
TrainConf :: Tagset -> [Tier] -> SchemaConf -> SgdArgs -> TrainConf
tagsetT :: TrainConf -> Tagset
tiersT :: TrainConf -> [Tier]
schemaConfT :: TrainConf -> SchemaConf
sgdArgsT :: TrainConf -> SgdArgs
-- | Train disamb model.
train :: Foldable f => Doc f s w -> TrainConf -> FilePath -> Maybe FilePath -> IO Disamb
instance Binary Disamb
-- | Simple format for morphosyntax representation which assumes that all
-- tags have a textual representation with no spaces inside and that one
-- of the tags indicates unknown words.
module NLP.Concraft.Format.Plain
-- | A token.
data Token
Token :: Text -> Space -> Bool -> Map Interp Bool -> Token
orth :: Token -> Text
space :: Token -> Space
known :: Token -> Bool
-- | Interpretations of the token, each interpretation annotated with a
-- disamb Boolean value (if True, the interpretation is
-- correct within the context).
interps :: Token -> Map Interp Bool
data Interp
Interp :: Maybe Text -> Tag -> Interp
base :: Interp -> Maybe Text
tag :: Interp -> Tag
-- | No space, space or newline.
data Space
None :: Space
Space :: Space
NewLine :: Space
-- | Create document handler given value of the ignore tag.
plainFormat :: Tag -> Doc [] [Token] Token
-- | Parse the text in the plain format given the oov tag.
parsePlain :: Tag -> Text -> [[Token]]
-- | Parse the sentence in the plain format given the oov tag.
parseSent :: Tag -> Text -> [Token]
-- | Show the plain data.
showPlain :: Tag -> [[Token]] -> Text
-- | Show the sentence.
showSent :: Tag -> [Token] -> Text
instance Show Space
instance Eq Space
instance Ord Space
instance Show Interp
instance Eq Interp
instance Ord Interp
instance Show Token
instance Eq Token
instance Ord Token
module NLP.Concraft
-- | Concraft data.
data Concraft
Concraft :: Int -> Guesser Tag -> Disamb -> Concraft
guessNum :: Concraft -> Int
guesser :: Concraft -> Guesser Tag
disamb :: Concraft -> Disamb
-- | Perform disambiguation preceded by context-sensitive guessing.
tag :: Concraft -> Sent Tag -> [Tag]
-- | Tag the sentence.
tagSent :: Sent s w -> Concraft -> s -> s
-- | Tag document.
tagDoc :: Functor f => Doc f s w -> Concraft -> Text -> Text
-- | Train guessing and disambiguation models.
train :: (Functor f, Foldable f) => Doc f s w -> Int -> TrainConf -> TrainConf -> FilePath -> Maybe FilePath -> IO Concraft
instance Binary Concraft