-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Morphological disambiguation based on constrained CRFs
--
-- A morphological disambiguation library based on constrained
-- conditional random fields.
@package concraft
@version 0.14.2
-- | Segmentation-level ambiguities. TODO: consider moving the module
-- contents to DAG.
module NLP.Concraft.DAG.Morphosyntax.Ambiguous
-- | Identify ambiguous segments (roughly, segments which can be by-passed)
-- in the given DAG. Such ambiguous edges are marked in the resulting DAG
-- with True values.
identifyAmbiguousSegments :: DAG a b -> DAG a Bool
module NLP.Concraft.Morphosyntax.WMap
-- | A set with a non-negative weight assigned to each of its elements.
data WMap a
-- | Create WMap from a map.
fromMap :: Map a Double -> WMap a
-- | Map function over weighted collection elements.
mapWMap :: Ord b => (a -> b) -> WMap a -> WMap b
-- | Make a weighted collection. Negative elements will be ignored.
mkWMap :: Ord a => [(a, Double)] -> WMap a
-- | Trim down the set of potential labels to k most probable
-- ones.
trim :: Ord a => Int -> WMap a -> WMap a
instance Data.Binary.Class.Binary a => Data.Binary.Class.Binary (NLP.Concraft.Morphosyntax.WMap.WMap a)
instance GHC.Classes.Ord a => GHC.Classes.Ord (NLP.Concraft.Morphosyntax.WMap.WMap a)
instance GHC.Classes.Eq a => GHC.Classes.Eq (NLP.Concraft.Morphosyntax.WMap.WMap a)
instance GHC.Show.Show a => GHC.Show.Show (NLP.Concraft.Morphosyntax.WMap.WMap a)
-- | Types and functions related to the morphosyntax data layer.
module NLP.Concraft.Morphosyntax
-- | A segment parametrized over a word type and a tag type.
data Seg w t
Seg :: w -> WMap t -> Seg w t
-- | A word represented by the segment. Typically it will be an instance of
-- the Word class.
[word] :: Seg w t -> w
-- | A set of interpretations. To each interpretation a weight of
-- appropriateness within the context is assigned.
[tags] :: Seg w t -> WMap t
-- | Map function over segment tags.
mapSeg :: Ord b => (a -> b) -> Seg w a -> Seg w b
-- | Interpretations of the segment.
interpsSet :: Seg w t -> Set t
-- | Interpretations of the segment.
interps :: Seg w t -> [t]
class Word a
-- | Orthographic form.
orth :: Word a => a -> Text
-- | Out-of-vocabulary (OOV) word.
oov :: Word a => a -> Bool
-- | A sentence.
type Sent w t = [Seg w t]
-- | Map function over sentence tags.
mapSent :: Ord b => (a -> b) -> Sent w a -> Sent w b
-- | A sentence with original, textual representation.
data SentO w t
SentO :: Sent w t -> Text -> SentO w t
[segs] :: SentO w t -> Sent w t
[orig] :: SentO w t -> Text
-- | Map function over sentence tags.
mapSentO :: Ord b => (a -> b) -> SentO w a -> SentO w b
instance (GHC.Show.Show w, GHC.Show.Show t) => GHC.Show.Show (NLP.Concraft.Morphosyntax.SentO w t)
instance (GHC.Show.Show w, GHC.Show.Show t) => GHC.Show.Show (NLP.Concraft.Morphosyntax.Seg w t)
instance NLP.Concraft.Morphosyntax.Word w => NLP.Concraft.Morphosyntax.Word (NLP.Concraft.Morphosyntax.Seg w t)
instance Data.Aeson.Types.ToJSON.ToJSON w => Data.Aeson.Types.ToJSON.ToJSON (NLP.Concraft.Morphosyntax.Seg w Data.Text.Internal.Text)
instance Data.Aeson.Types.FromJSON.FromJSON w => Data.Aeson.Types.FromJSON.FromJSON (NLP.Concraft.Morphosyntax.Seg w Data.Text.Internal.Text)
-- | Accuracy statistics.
module NLP.Concraft.Morphosyntax.Accuracy
-- | Statistics.
data Stats
Stats :: Int -> Int -> Stats
-- | Number of correct tags
[good] :: Stats -> Int
-- | Number of segments in gold corpus
[gold] :: Stats -> Int
-- | Accuracy given stats.
accuracy :: Stats -> Double
-- | Accuracy weak lower bound.
weakLB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats
-- | Accuracy weak upper bound.
weakUB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats
-- | Accuracy strong lower bound.
strongLB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats
-- | Accuracy strong upper bound.
strongUB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats
-- | Morphosyntactic analysis utilities.
--
-- See reAnaSent function for a description of how reanalysis is
-- performed. At some point it would be nice to change the entire process
-- so that sentence-level segmentation is also taken from the reanalysed
-- data.
module NLP.Concraft.Analysis
-- | An analyser performs word-level segmentation and morphological
-- analysis.
type Analyse w t = Text -> IO (Sent w t)
-- | Reanalyse sentence.
--
-- From the reference sentence the function takes:
--
--
-- - Word-level segmentation
-- - Chosen interpretations (tags)
--
--
-- From the reanalysed sentence the function takes:
--
--
-- - Potential interpretations
--
reAnaSent :: Word w => Tagset -> Analyse w Tag -> SentO w Tag -> IO (Sent w Tag)
-- | Reanalyse paragraph.
reAnaPar :: Word w => Tagset -> Analyse w Tag -> [SentO w Tag] -> IO [Sent w Tag]
-- | Types and functions related to the morphosyntax data layer.
module NLP.Concraft.DAG.Morphosyntax
-- | A segment parametrized over a word type and a tag type.
data Seg w t
Seg :: w -> WMap t -> Seg w t
-- | A word represented by the segment. Typically it will be an instance of
-- the Word class.
[word] :: Seg w t -> w
-- | A set of interpretations. To each interpretation a weight of
-- appropriateness within the context is assigned.
[tags] :: Seg w t -> WMap t
-- | Map function over segment tags.
mapSeg :: Ord b => (a -> b) -> Seg w a -> Seg w b
-- | Interpretations of the segment.
interpsSet :: Seg w t -> Set t
-- | Interpretations of the segment.
interps :: Seg w t -> [t]
class Word a
-- | Orthographic form.
orth :: Word a => a -> Text
-- | Out-of-vocabulary (OOV) word.
oov :: Word a => a -> Bool
-- | A sentence: a DAG whose edges carry segments (cf. the list-based
-- definition, type Sent w t = [Seg w t]).
type Sent w t = DAG () (Seg w t)
-- | Map function over sentence tags.
mapSent :: Ord b => (a -> b) -> Sent w a -> Sent w b
-- | A sentence with original, textual representation.
data SentO w t
SentO :: Sent w t -> Text -> SentO w t
[segs] :: SentO w t -> Sent w t
[orig] :: SentO w t -> Text
-- | Map function over sentence tags.
mapSentO :: Ord b => (a -> b) -> SentO w a -> SentO w b
instance (GHC.Show.Show w, GHC.Show.Show t) => GHC.Show.Show (NLP.Concraft.DAG.Morphosyntax.Seg w t)
instance NLP.Concraft.DAG.Morphosyntax.Word w => NLP.Concraft.DAG.Morphosyntax.Word (NLP.Concraft.DAG.Morphosyntax.Seg w t)
instance Data.Aeson.Types.ToJSON.ToJSON w => Data.Aeson.Types.ToJSON.ToJSON (NLP.Concraft.DAG.Morphosyntax.Seg w Data.Text.Internal.Text)
instance Data.Aeson.Types.FromJSON.FromJSON w => Data.Aeson.Types.FromJSON.FromJSON (NLP.Concraft.DAG.Morphosyntax.Seg w Data.Text.Internal.Text)
-- | Baseline word-segmentation functions.
module NLP.Concraft.DAG.Segmentation
-- | Which path type to search: shortest (Min), longest (Max), or
-- frequency-based (Freq)
data PathTyp
Min :: PathTyp
Max :: PathTyp
Freq :: FreqConf -> PathTyp
-- | Select the shortest-path (or longest, depending on PathTyp) in
-- the given DAG and remove all the edges which are not on this path.
pickPath :: Word b => PathTyp -> DAG a b -> DAG a b
-- | Retrieve the edges which belong to the shortest/longest (depending on
-- the argument function: minimum or maximum) path in the
-- given DAG.
findPath :: Word b => PathTyp -> DAG a b -> Set EdgeID
-- | Compute chosen/not-chosen counts of the individual orthographic forms
-- in the DAGs. Only the ambiguous segments are taken into account.
computeFreqs :: Word w => [Sent w t] -> Map Text (Int, Int)
-- | Configuration related to frequency-based path picking.
data FreqConf
FreqConf :: Map Text (Int, Int) -> Double -> FreqConf
-- | A map which assigns (chosen, not chosen) counts to the individual
-- orthographic forms (see computeFreqs).
[pickFreqMap] :: FreqConf -> Map Text (Int, Int)
-- | A naive smoothing related parameter, which should be added to each
-- count in pickFreqMap. , orth :: DAG.EdgeID -> T.Text -- ^
-- Orthographic form of a given edge
[smoothingParam] :: FreqConf -> Double
-- | Compute: * the number of tokens participating in ambiguities * the
-- total number of tokens
computeAmbiStats :: Word w => AmbiCfg -> [Sent w t] -> AmbiStats
-- | Configuration of the ambiguity statistics computation (see
-- computeAmbiStats).
data AmbiCfg
AmbiCfg :: Bool -> AmbiCfg
-- | Only take the chosen tokens into account
[onlyChosen] :: AmbiCfg -> Bool
-- | Numbers of tokens.
data AmbiStats
AmbiStats :: !Int -> !Int -> AmbiStats
-- | Ambiguous tokens
[ambi] :: AmbiStats -> !Int
-- | All tokens
[total] :: AmbiStats -> !Int
instance GHC.Classes.Ord NLP.Concraft.DAG.Segmentation.AmbiStats
instance GHC.Classes.Eq NLP.Concraft.DAG.Segmentation.AmbiStats
instance GHC.Show.Show NLP.Concraft.DAG.Segmentation.AmbiStats
instance GHC.Classes.Ord NLP.Concraft.DAG.Segmentation.AmbiCfg
instance GHC.Classes.Eq NLP.Concraft.DAG.Segmentation.AmbiCfg
instance GHC.Show.Show NLP.Concraft.DAG.Segmentation.AmbiCfg
-- | Observation schema blocks for Concraft.
module NLP.Concraft.DAG.Schema
-- | An observation consists of an index (of list type) and an actual
-- observation value.
type Ob = ([Int], Text)
-- | The Ox monad specialized to word token type and text observations.
type Ox a = Ox Text a
-- | A schema is a block of the Ox computation performed within the context
-- of the sentence and the absolute sentence position.
type Schema w t a = Sent w t -> EdgeID -> Ox a
-- | A dummy schema block. TODO: is it a monad, an applicative?
void :: a -> Schema w t a
-- | Sequence the list of schemas (or blocks) and discard individual
-- values.
sequenceS_ :: [Sent w t -> a -> Ox b] -> Sent w t -> a -> Ox ()
-- | Use the schema to extract observations from the sentence.
schematize :: Schema w t a -> Sent w t -> DAG () [Ob]
-- | Body of configuration entry.
data Body a
Body :: [Int] -> Bool -> a -> Body a
-- | Range argument for the schema block.
[range] :: Body a -> [Int]
-- | When true, the entry is used only for oov words.
[oovOnly] :: Body a -> Bool
-- | Additional arguments for the schema block.
[args] :: Body a -> a
-- | Maybe entry.
type Entry a = Maybe (Body a)
-- | Plain entry with no additional arguments.
entry :: [Int] -> Entry ()
-- | Entry with additional arguments.
entryWith :: a -> [Int] -> Entry a
-- | Configuration of the schema. All configuration elements specify the
-- range over which a particular observation type should be taken into
-- account. For example, the [-1, 0, 2] range means that
-- observations of particular type will be extracted with respect to
-- previous (k - 1), current (k) and after the next
-- (k + 2) positions when identifying the observation set for
-- position k in the input sentence.
data SchemaConf
SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry () -> Entry () -> Entry () -> Entry () -> SchemaConf
-- | The orthB schema block.
[orthC] :: SchemaConf -> Entry ()
-- | The lowOrthB schema block.
[lowOrthC] :: SchemaConf -> Entry ()
-- | The lowPrefixesB schema block. The first list of ints
-- represents lengths of prefixes.
[lowPrefixesC] :: SchemaConf -> Entry [Int]
-- | The lowSuffixesB schema block. The first list of ints
-- represents lengths of suffixes.
[lowSuffixesC] :: SchemaConf -> Entry [Int]
-- | The knownB schema block.
[knownC] :: SchemaConf -> Entry ()
-- | The shapeB schema block.
[shapeC] :: SchemaConf -> Entry ()
-- | The packedB schema block.
[packedC] :: SchemaConf -> Entry ()
-- | The begPackedB schema block.
[begPackedC] :: SchemaConf -> Entry ()
-- | Null configuration of the observation schema.
nullConf :: SchemaConf
-- | Build the schema based on the configuration.
fromConf :: Word w => SchemaConf -> Schema w t ()
-- | A block is a chunk of the Ox computation performed within the context
-- of the sentence and the list of absolute sentence positions.
type Block w t a = Sent w t -> [EdgeID] -> Ox a
-- | Transform a block to a schema depending on * A list of relative
-- sentence positions, * A boolean value; if true, the block computation
-- will be performed only on positions where an OOV word resides.
fromBlock :: Word w => Block w t a -> [Int] -> Bool -> Schema w t a
-- | Orthographic form at the current position.
orthB :: Word w => Block w t ()
-- | Lowercased orthographic form at the current position.
lowOrthB :: Word w => Block w t ()
-- | List of lowercased prefixes of given lengths.
lowPrefixesB :: Word w => [Int] -> Block w t ()
-- | List of lowercased suffixes of given lengths.
lowSuffixesB :: Word w => [Int] -> Block w t ()
-- | Whether the word is known (presumably: not OOV).
knownB :: Word w => Block w t ()
-- | Shape of the word.
shapeB :: Word w => Block w t ()
-- | Packed shape of the word.
packedB :: Word w => Block w t ()
-- | Packed shape of the word.
begPackedB :: Word w => Block w t ()
instance GHC.Show.Show NLP.Concraft.DAG.Schema.SchemaConf
instance GHC.Show.Show a => GHC.Show.Show (NLP.Concraft.DAG.Schema.Body a)
instance Data.Binary.Class.Binary NLP.Concraft.DAG.Schema.SchemaConf
instance Data.Binary.Class.Binary a => Data.Binary.Class.Binary (NLP.Concraft.DAG.Schema.Body a)
-- | Accuracy statistics.
module NLP.Concraft.DAG.Morphosyntax.Accuracy
-- | True positives, false positives, etc.
data Stats
Stats :: !Int -> !Int -> !Int -> !Int -> !Int -> Stats
-- | True positive
[tp] :: Stats -> !Int
-- | False positive
[fp] :: Stats -> !Int
-- | True negative
[tn] :: Stats -> !Int
-- | False negative
[fn] :: Stats -> !Int
-- | Consistency error (number of edges for which both fp and
-- fn hold)
[ce] :: Stats -> !Int
-- | Configuration of accuracy computation.
data AccCfg x
AccCfg :: Bool -> Bool -> Set x -> Tagset -> Bool -> Bool -> Bool -> Bool -> AccCfg x
-- | Limit calculations to OOV words
[onlyOov] :: AccCfg x -> Bool
-- | Limit calculations to segmentation-ambiguous words
[onlyAmb] :: AccCfg x -> Bool
-- | Limit calculations to segments marked with one of the given labels; if
-- empty, the option has no effect
[onlyMarkedWith] :: AccCfg x -> Set x
-- | The underlying tagset
[accTagset] :: AccCfg x -> Tagset
-- | Should the tags be expanded?
[expandTag] :: AccCfg x -> Bool
-- | Compute segmentation-level accuracy. The actually chosen tags are
-- ignored, only information about the chosen DAG edges is relevant.
[ignoreTag] :: AccCfg x -> Bool
-- | If weak, there has to be an overlap in the tags assigned to a given
-- segment in both datasets. Otherwise, the two sets of tags have to be
-- identical. , discardProb0 :: Bool -- ^ Whether sentences with near 0
-- probability should be discarded from -- evaluation.
[weakAcc] :: AccCfg x -> Bool
-- | Print information about compared elements
[verbose] :: AccCfg x -> Bool
-- | Compute the accuracy of the model with respect to the labeled dataset.
-- To each Tag an additional information x can be
-- assigned, which will be taken into account when computing statistics.
collect :: (Word w, Ord x, Show x) => AccCfg x -> [Sent w (Tag, x)] -> [Sent w (Tag, x)] -> Stats
precision :: Stats -> Double
recall :: Stats -> Double
accuracy :: Stats -> Double
instance GHC.Classes.Ord NLP.Concraft.DAG.Morphosyntax.Accuracy.Stats
instance GHC.Classes.Eq NLP.Concraft.DAG.Morphosyntax.Accuracy.Stats
instance GHC.Show.Show NLP.Concraft.DAG.Morphosyntax.Accuracy.Stats
module NLP.Concraft.DAG.Guess
-- | A guessing model.
data Guesser t s
Guesser :: SchemaConf -> CRF Ob s -> s -> Set t -> (t -> s) -> (s -> t) -> Guesser t s
[schemaConf] :: Guesser t s -> SchemaConf
[crf] :: Guesser t s -> CRF Ob s
[zeroProbLab] :: Guesser t s -> s
-- | The tagset considered for the unknown words (TODO: a solution parallel
-- and not 100% consistent with what is implemented in the CRF library)
-- TODO: with complexify, unkTagSet is not needed anymore!
[unkTagSet] :: Guesser t s -> Set t
-- | A tag simplification function
[simplify] :: Guesser t s -> t -> s
-- | NEW: instead of an unkTagSet, a function which makes a complex
-- tag out of a simple tag.
--
-- WARNING: we assume, that this function does not conflate simplified
-- tags, i.e., two tags of type s cannot lead to one and the
-- same complex tag of type t.
[complexify] :: Guesser t s -> s -> t
-- | Store the entire guessing model apart from the simplification
-- function.
putGuesser :: (Binary t, Binary s, Ord s) => Guesser t s -> Put
-- | Get the guessing model, provided the simplification and
-- complexification functions.
-- getGuesser :: (M.Map t T.Tag) -> Get (Guesser t)
getGuesser :: (Binary t, Binary s, Ord s, Ord t) => (t -> s) -> (s -> t) -> Get (Guesser t s)
-- | Determine the marginal probabilities of the individual labels in the
-- sentence.
marginals :: (Word w, Ord t, Ord s) => Config s -> Guesser t s -> Sent w t -> DAG () (WMap t)
-- | Replace the probabilities of the sentence labels with the marginal
-- probabilities stemming from the model.
marginalsSent :: (Word w, Ord t, Ord s) => Config s -> Guesser t s -> Sent w t -> Sent w t
-- | Training configuration.
data TrainConf t s
TrainConf :: SchemaConf -> SgdArgs -> Bool -> R0T -> t -> (t -> s) -> (s -> t) -> (t -> t) -> Bool -> TrainConf t s
[schemaConfT] :: TrainConf t s -> SchemaConf
-- | SGD parameters.
[sgdArgsT] :: TrainConf t s -> SgdArgs
-- | Store SGD dataset on disk
[onDiskT] :: TrainConf t s -> Bool
-- | R0 construction method
[r0T] :: TrainConf t s -> R0T
-- | Zero probability label
[zeroProbLabel] :: TrainConf t s -> t
-- | Label simplification function
[simplifyLabel] :: TrainConf t s -> t -> s
-- | Label complexification function
[complexifyLabel] :: TrainConf t s -> s -> t
-- | Strip the label from irrelevant information. Used to determine the set
-- of possible tags for unknown words. TODO: we don't need this with
-- complexify anymore!?
[stripLabel] :: TrainConf t s -> t -> t
-- | Guess only visible features
[onlyVisible] :: TrainConf t s -> Bool
-- | Method of constructing the default set of labels (R0).
data R0T
-- | See anyInterps
AnyInterps :: R0T
-- | See anyChosen
AnyChosen :: R0T
-- | See oovChosen
OovChosen :: R0T
-- | Train guesser.
train :: (Word w, Ord t, Ord s) => TrainConf t s -> IO [Sent w t] -> IO [Sent w t] -> IO (Guesser t s)
-- | Schematized dataset.
schemed :: (Word w, Ord t, Ord s) => (t -> s) -> Schema w s a -> [Sent w t] -> [SentL Ob s]
instance Data.Data.Data NLP.Concraft.DAG.Guess.R0T
instance GHC.Enum.Enum NLP.Concraft.DAG.Guess.R0T
instance GHC.Classes.Ord NLP.Concraft.DAG.Guess.R0T
instance GHC.Classes.Eq NLP.Concraft.DAG.Guess.R0T
instance GHC.Show.Show NLP.Concraft.DAG.Guess.R0T
module NLP.Concraft.DAG.Disamb
-- | A disambiguation model.
data Disamb t
Disamb :: [Tier] -> SchemaConf -> CRF Ob Atom -> (t -> Tag) -> Disamb t
[tiers] :: Disamb t -> [Tier]
[schemaConf] :: Disamb t -> SchemaConf
[crf] :: Disamb t -> CRF Ob Atom
-- | A map which simplifies the tags of generic type t to
-- simplified positional tags. The motivation behind this is that tags
-- can have a richer structure.
--
-- NOTE: it can happen in real situations that a tag is encountered which
-- is not known by the model. It would be nice to be able to treat it as
-- the closest tag that can be handled. Then, one has to define the
-- notion of the similarity between tags, though... But probably it
-- should be done at a different level (where more information about the
-- structure of t is known)
[simplify] :: Disamb t -> t -> Tag
-- | Store the entire disambiguation model apart from the simplification
-- function.
putDisamb :: Disamb t -> Put
-- | Get the disambiguation model, provided the simplification function.
-- getDisamb :: (M.Map t T.Tag) -> Get (Disamb t)
getDisamb :: (t -> Tag) -> Get (Disamb t)
-- | A tier description.
data Tier
Tier :: Bool -> Bool -> Set Attr -> Tier
-- | Does it include the part of speech?
[withPos] :: Tier -> Bool
-- | End-of-sentence marker.
[withEos] :: Tier -> Bool
-- | Tier grammatical attributes.
[withAtts] :: Tier -> Set Attr
-- | An atomic part of morphosyntactic tag with optional POS.
data Atom
Atom :: Maybe POS -> Map Attr Text -> Maybe Bool -> Atom
[pos] :: Atom -> Maybe POS
[atts] :: Atom -> Map Attr Text
-- | NOTE: could be simplified to Bool, but this way it's more readable
[eos] :: Atom -> Maybe Bool
-- | Type of resulting probabilities.
data ProbType
-- | Marginal probabilities
Marginals :: ProbType
-- | TODO
MaxProbs :: ProbType
-- | Determine the marginal probabilities of the individual labels in the
-- sentence. marginalsSent :: (X.Word w, Ord t) => Disamb t ->
-- X.Sent w t -> DAG () (X.WMap [P.Atom])
probsSent :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> Sent w t
-- | Determine the marginal probabilities of the individual labels in the
-- sentence.
probs :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> DAG () (WMap t)
-- | Training configuration.
data TrainConf t
TrainConf :: [Tier] -> SchemaConf -> SgdArgs -> Bool -> (t -> Tag) -> TrainConf t
[tiersT] :: TrainConf t -> [Tier]
[schemaConfT] :: TrainConf t -> SchemaConf
[sgdArgsT] :: TrainConf t -> SgdArgs
[onDiskT] :: TrainConf t -> Bool
-- | Label simplification function
[simplifyLabel] :: TrainConf t -> t -> Tag
-- | Train disambiguation module.
train :: (Word w, Ord t) => TrainConf t -> IO [Sent w t] -> IO [Sent w t] -> IO (Disamb t)
-- | Prune disamb model: discard model features with absolute values (in
-- log-domain) lower than the given threshold.
prune :: Double -> Disamb t -> Disamb t
-- | Schematize the input sentence according to schema rules.
schematize :: Schema w [t] a -> Sent w [t] -> Sent Ob t
-- | A version of the disambiguation model adapted to perform sentence
-- segmentation as well.
module NLP.Concraft.DAG.DisambSeg
-- | The internal tag type.
data Tag
Tag :: Tag -> Bool -> Tag
-- | Positional tag
[posiTag] :: Tag -> Tag
-- | End-of-sentence marker
[hasEos] :: Tag -> Bool
-- | A disambiguation model.
data Disamb t
Disamb :: [Tier] -> SchemaConf -> CRF Ob Atom -> (t -> Tag) -> Disamb t
[tiers] :: Disamb t -> [Tier]
[schemaConf] :: Disamb t -> SchemaConf
[crf] :: Disamb t -> CRF Ob Atom
-- | A function which simplifies the tags of the generic type t to
-- (i) the corresponding positional tags and (ii) information if the
-- segment represents sentence end.
--
-- NOTE: it can happen in real situations that a tag is encountered which
-- is not known by the model. It would be nice to be able to treat it as
-- the closest tag that can be handled. Then, one has to define the
-- notion of the similarity between tags, though... But probably it
-- should be done at a different level (where more information about the
-- structure of t is known)
[simplify] :: Disamb t -> t -> Tag
-- | Store the entire disambiguation model apart from the simplification
-- function.
putDisamb :: Disamb t -> Put
-- | Get the disambiguation model, provided the simplification function.
-- getDisamb :: (M.Map t T.Tag) -> Get (Disamb t)
getDisamb :: (t -> Tag) -> Get (Disamb t)
-- | A tier description.
data Tier
Tier :: Bool -> Bool -> Set Attr -> Tier
-- | Does it include the part of speech?
[withPos] :: Tier -> Bool
-- | End-of-sentence marker.
[withEos] :: Tier -> Bool
-- | Tier grammatical attributes.
[withAtts] :: Tier -> Set Attr
-- | An atomic part of morphosyntactic tag with optional POS.
data Atom
Atom :: Maybe POS -> Map Attr Text -> Maybe Bool -> Atom
[pos] :: Atom -> Maybe POS
[atts] :: Atom -> Map Attr Text
-- | NOTE: could be simplified to Bool, but this way it's more readable
[eos] :: Atom -> Maybe Bool
-- | Perform disambiguation.
disamb :: (Word w, Ord t) => Disamb t -> Sent w t -> DAG () (Map t Bool)
-- | Type of resulting probabilities.
data ProbType
-- | Marginal probabilities
Marginals :: ProbType
-- | TODO
MaxProbs :: ProbType
-- | Determine the marginal probabilities of the individual labels in the
-- sentence.
probsSent :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> Sent w t
-- | Determine the marginal probabilities of the individual labels in the
-- sentence.
probs :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> DAG () (WMap t)
-- | Training configuration.
data TrainConf t
TrainConf :: [Tier] -> SchemaConf -> SgdArgs -> Bool -> (t -> Tag) -> TrainConf t
[tiersT] :: TrainConf t -> [Tier]
[schemaConfT] :: TrainConf t -> SchemaConf
[sgdArgsT] :: TrainConf t -> SgdArgs
[onDiskT] :: TrainConf t -> Bool
-- | Label simplification function
[simplifyLabel] :: TrainConf t -> t -> Tag
-- | Train disambiguation module.
train :: (Word w, Ord t) => TrainConf t -> IO [Sent w t] -> IO [Sent w t] -> IO (Disamb t)
-- | Prune disamb model: discard model features with absolute values (in
-- log-domain) lower than the given threshold.
prune :: Double -> Disamb t -> Disamb t
instance GHC.Classes.Ord NLP.Concraft.DAG.DisambSeg.Tag
instance GHC.Classes.Eq NLP.Concraft.DAG.DisambSeg.Tag
instance GHC.Show.Show NLP.Concraft.DAG.DisambSeg.Tag
-- | Top-level module adapted to DAGs, guessing and disambiguation.
module NLP.Concraft.DAGSeg
-- | Concraft data.
data Concraft t
Concraft :: Tagset -> Int -> Guesser t Tag -> Disamb t -> Disamb t -> Concraft t
[tagset] :: Concraft t -> Tagset
[guessNum] :: Concraft t -> Int
[guesser] :: Concraft t -> Guesser t Tag
[segmenter] :: Concraft t -> Disamb t
[disamber] :: Concraft t -> Disamb t
-- | Save model in a file. Data is compressed using the gzip format.
saveModel :: (Ord t, Binary t) => FilePath -> Concraft t -> IO ()
-- | Load model from a file.
loadModel :: (Ord t, Binary t) => (Tagset -> t -> Tag) -> (Tagset -> Tag -> t) -> (Tagset -> t -> Tag) -> FilePath -> IO (Concraft t)
-- | DAG annotation, assigns b values to a labels for
-- each edge in the graph.
type Anno a b = DAG () (Map a b)
-- | Determine max probabilities corresponding to individual tags w.r.t.
-- the disambiguation model.
disamb :: (Word w, Ord t) => Disamb t -> Sent w t -> Anno t Bool
-- | Find all optimal paths in the given annotation. Optimal paths are
-- those which go through tags with the assigned probability 1. For a
-- given chosen edge, all the tags with probability 1 are selected.
findOptimalPaths :: Ord t => Anno t Double -> [[(EdgeID, Set t)]]
-- | Mark the given path with disamb markers in the given annotation and
-- produce a new disamb annotation.
disambPath :: Ord t => [(EdgeID, Set t)] -> Anno t Double -> Anno t Bool
-- | Determine marginal probabilities corresponding to individual tags
-- w.r.t. the guessing model.
guessMarginals :: (Word w, Ord t) => Config Tag -> Guesser t Tag -> Sent w t -> Anno t Double
-- | Determine marginal probabilities corresponding to individual tags
-- w.r.t. the disambiguation model.
disambMarginals :: (Word w, Ord t) => Disamb t -> Sent w t -> Anno t Double
-- | Determine max probabilities corresponding to individual tags w.r.t.
-- the disambiguation model.
disambProbs :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> Anno t Double
-- | Extend the OOV words with new, guessed interpretations.
--
-- Determine marginal probabilities corresponding to individual tags
-- w.r.t. the guessing model and, afterwards, trim the sentence to keep
-- only the k most probable labels for each OOV edge. Note that,
-- for OOV words, the entire set of default tags is considered.
guessSent :: (Word w, Ord t) => Int -> Config Tag -> Guesser t Tag -> Sent w t -> Sent w t
-- | Perform guessing, trimming, and finally determine marginal
-- probabilities corresponding to individual tags w.r.t. the guessing
-- model.
guess :: (Word w, Ord t) => Int -> Config Tag -> Guesser t Tag -> Sent w t -> Anno t Double
-- | Perform guessing, trimming, and finally determine marginal
-- probabilities corresponding to individual tags w.r.t. the
-- disambiguation model.
tag :: (Word w, Ord t) => Int -> Config Tag -> Concraft t -> Sent w t -> Anno t Double
-- | Prune the disambiguation model: discard model features with absolute
-- values (in log-domain) lower than the given threshold.
prune :: Double -> Concraft t -> Concraft t
-- | Observation schema blocks for Concraft.
module NLP.Concraft.Schema
-- | An observation consists of an index (of list type) and an actual
-- observation value.
type Ob = ([Int], Text)
-- | The Ox monad specialized to word token type and text observations.
type Ox a = Ox Text a
-- | A schema is a block of the Ox computation performed within the context
-- of the sentence and the absolute sentence position.
type Schema w t a = Vector (Seg w t) -> Int -> Ox a
-- | A dummy schema block.
void :: a -> Schema w t a
-- | Sequence the list of schemas (or blocks) and discard individual
-- values.
sequenceS_ :: [Vector (Seg w t) -> a -> Ox b] -> Vector (Seg w t) -> a -> Ox ()
-- | Use the schema to extract observations from the sentence.
schematize :: Schema w t a -> Sent w t -> [[Ob]]
-- | Body of configuration entry.
data Body a
Body :: [Int] -> Bool -> a -> Body a
-- | Range argument for the schema block.
[range] :: Body a -> [Int]
-- | When true, the entry is used only for oov words.
[oovOnly] :: Body a -> Bool
-- | Additional arguments for the schema block.
[args] :: Body a -> a
-- | Maybe entry.
type Entry a = Maybe (Body a)
-- | Plain entry with no additional arguments.
entry :: [Int] -> Entry ()
-- | Entry with additional arguments.
entryWith :: a -> [Int] -> Entry a
-- | Configuration of the schema. All configuration elements specify the
-- range over which a particular observation type should be taken into
-- account. For example, the [-1, 0, 2] range means that
-- observations of particular type will be extracted with respect to
-- previous (k - 1), current (k) and after the next
-- (k + 2) positions when identifying the observation set for
-- position k in the input sentence.
data SchemaConf
SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry () -> Entry () -> Entry () -> Entry () -> SchemaConf
-- | The orthB schema block.
[orthC] :: SchemaConf -> Entry ()
-- | The lowOrthB schema block.
[lowOrthC] :: SchemaConf -> Entry ()
-- | The lowPrefixesB schema block. The first list of ints
-- represents lengths of prefixes.
[lowPrefixesC] :: SchemaConf -> Entry [Int]
-- | The lowSuffixesB schema block. The first list of ints
-- represents lengths of suffixes.
[lowSuffixesC] :: SchemaConf -> Entry [Int]
-- | The knownB schema block.
[knownC] :: SchemaConf -> Entry ()
-- | The shapeB schema block.
[shapeC] :: SchemaConf -> Entry ()
-- | The packedB schema block.
[packedC] :: SchemaConf -> Entry ()
-- | The begPackedB schema block.
[begPackedC] :: SchemaConf -> Entry ()
-- | Null configuration of the observation schema.
nullConf :: SchemaConf
-- | Build the schema based on the configuration.
fromConf :: Word w => SchemaConf -> Schema w t ()
-- | A block is a chunk of the Ox computation performed within the context
-- of the sentence and the list of absolute sentence positions.
type Block w t a = Vector (Seg w t) -> [Int] -> Ox a
-- | Transform a block to a schema depending on * A list of relative
-- sentence positions, * A boolean value; if true, the block computation
-- will be performed only on positions where an OOV word resides.
fromBlock :: Word w => Block w t a -> [Int] -> Bool -> Schema w t a
-- | Orthographic form at the current position.
orthB :: Word w => Block w t ()
-- | Lowercased orthographic form at the current position.
lowOrthB :: Word w => Block w t ()
-- | List of lowercased prefixes of given lengths.
lowPrefixesB :: Word w => [Int] -> Block w t ()
-- | List of lowercased suffixes of given lengths.
lowSuffixesB :: Word w => [Int] -> Block w t ()
-- | Whether the word is known (presumably: not OOV).
knownB :: Word w => Block w t ()
-- | Shape of the word.
shapeB :: Word w => Block w t ()
-- | Packed shape of the word.
packedB :: Word w => Block w t ()
-- | Packed shape of the word.
begPackedB :: Word w => Block w t ()
instance GHC.Show.Show NLP.Concraft.Schema.SchemaConf
instance GHC.Show.Show a => GHC.Show.Show (NLP.Concraft.Schema.Body a)
instance Data.Binary.Class.Binary NLP.Concraft.Schema.SchemaConf
instance Data.Binary.Class.Binary a => Data.Binary.Class.Binary (NLP.Concraft.Schema.Body a)
module NLP.Concraft.Guess
-- | A guessing model, parametrized over the tag type. It consists of an
-- observation schema configuration and a CRF.
data Guesser t
Guesser :: SchemaConf -> CRF Ob t -> Guesser t
-- | Configuration of the observation schema stored with the model.
[schemaConf] :: Guesser t -> SchemaConf
-- | The CRF component of the model.
[crf] :: Guesser t -> CRF Ob t
-- | Determine the k most probable labels for each word in the
-- sentence. The first argument is k; the result contains one list of
-- labels per word. TODO: Perhaps it would be better to use sets instead
-- of lists as output?
guess :: (Word w, Ord t) => Int -> Guesser t -> Sent w t -> [[t]]
-- | Insert guessing results into the sentence. Only interpretations of OOV
-- words will be extended. The first argument holds the guessing results
-- in the form returned by guess (one list of labels per word).
include :: (Word w, Ord t) => [[t]] -> Sent w t -> Sent w t
-- | Combine guess with include: determine the k most probable labels for
-- each word of the sentence (k being the first argument) and insert them
-- into the sentence.
guessSent :: (Word w, Ord t) => Int -> Guesser t -> Sent w t -> Sent w t
-- | Training configuration of the guessing model.
data TrainConf
TrainConf :: SchemaConf -> SgdArgs -> Bool -> R0T -> TrainConf
-- | Configuration of the observation schema.
[schemaConfT] :: TrainConf -> SchemaConf
-- | SGD parameters.
[sgdArgsT] :: TrainConf -> SgdArgs
-- | Store SGD dataset on disk
[onDiskT] :: TrainConf -> Bool
-- | R0 construction method
[r0T] :: TrainConf -> R0T
-- | Method of constructing the default set of labels (R0). Each
-- constructor refers to the function of the corresponding name.
data R0T
-- | See anyInterps
AnyInterps :: R0T
-- | See anyChosen
AnyChosen :: R0T
-- | See oovChosen
OovChosen :: R0T
-- | Train guesser according to the given training configuration.
--
-- NOTE(review): the two IO actions presumably supply the training data
-- and the evaluation data, in this order -- confirm against the
-- implementation.
train :: (Word w, Ord t) => TrainConf -> IO [Sent w t] -> IO [Sent w t] -> IO (Guesser t)
instance Data.Data.Data NLP.Concraft.Guess.R0T
instance GHC.Enum.Enum NLP.Concraft.Guess.R0T
instance GHC.Classes.Ord NLP.Concraft.Guess.R0T
instance GHC.Classes.Eq NLP.Concraft.Guess.R0T
instance GHC.Show.Show NLP.Concraft.Guess.R0T
instance (GHC.Classes.Ord t, Data.Binary.Class.Binary t) => Data.Binary.Class.Binary (NLP.Concraft.Guess.Guesser t)
module NLP.Concraft.Disamb
-- | A disambiguation model. It consists of a list of tier descriptions,
-- an observation schema configuration and a CRF.
data Disamb
Disamb :: [Tier] -> SchemaConf -> CRF Ob Atom -> Disamb
-- | Tier descriptions.
[tiers] :: Disamb -> [Tier]
-- | Configuration of the observation schema stored with the model.
[schemaConf] :: Disamb -> SchemaConf
-- | The CRF component of the model.
[crf] :: Disamb -> CRF Ob Atom
-- | A tier description.
data Tier
Tier :: Bool -> Bool -> Set Attr -> Tier
-- | Does it include the part of speech?
[withPos] :: Tier -> Bool
-- | End-of-sentence marker.
--
-- NOTE(review): presumably, by analogy with withPos, this tells whether
-- the tier includes the end-of-sentence marker -- confirm.
[withEos] :: Tier -> Bool
-- | Tier grammatical attributes.
[withAtts] :: Tier -> Set Attr
-- | An atomic part of morphosyntactic tag with optional POS.
data Atom
Atom :: Maybe POS -> Map Attr Text -> Maybe Bool -> Atom
-- | Optional part of speech.
[pos] :: Atom -> Maybe POS
-- | Attribute values, keyed by attribute.
[atts] :: Atom -> Map Attr Text
-- | Optional end-of-sentence flag.
--
-- NOTE: could be simplified to Bool, but this way it's more readable
[eos] :: Atom -> Maybe Bool
-- | Tag labels with marginal probabilities. The result is a list of
-- weighted maps of tags (presumably one per segment of the input
-- sentence -- confirm).
marginals :: Word w => Disamb -> Sent w Tag -> [WMap Tag]
-- | Perform context-sensitive disambiguation. The result is a list of
-- tags (presumably one chosen tag per segment of the input sentence --
-- confirm).
disamb :: Word w => Disamb -> Sent w Tag -> [Tag]
-- | Insert disambiguation results into the sentence. The first argument
-- is the function which computes the list of tags for the sentence (for
-- instance disamb applied to a model).
include :: (Sent w Tag -> [Tag]) -> Sent w Tag -> Sent w Tag
-- | Combine disamb with include: disambiguate the sentence with the given
-- model and insert the results into the sentence.
disambSent :: Word w => Disamb -> Sent w Tag -> Sent w Tag
-- | Training configuration of the disambiguation model. There are two
-- variants: TrainConf specifies tiers and an observation schema
-- configuration, while ReTrainConf specifies an existing model instead.
data TrainConf
TrainConf :: [Tier] -> SchemaConf -> SgdArgs -> Bool -> TrainConf
-- | Tier descriptions.
[tiersT] :: TrainConf -> [Tier]
-- | Configuration of the observation schema.
[schemaConfT] :: TrainConf -> SchemaConf
-- | SGD arguments.
[sgdArgsT] :: TrainConf -> SgdArgs
-- | NOTE(review): presumably whether to store the SGD dataset on disk --
-- confirm.
[onDiskT] :: TrainConf -> Bool
ReTrainConf :: Disamb -> SgdArgs -> Bool -> TrainConf
-- | NOTE(review): presumably the initial disambiguation model from which
-- re-training starts -- confirm.
[initDmb] :: TrainConf -> Disamb
-- | SGD arguments.
[sgdArgsT] :: TrainConf -> SgdArgs
-- | NOTE(review): presumably whether to store the SGD dataset on disk --
-- confirm.
[onDiskT] :: TrainConf -> Bool
-- | Train disamb model according to the given training configuration.
--
-- NOTE(review): the two IO actions presumably supply the training data
-- and the evaluation data, in this order -- confirm against the
-- implementation.
train :: Word w => TrainConf -> IO [Sent w Tag] -> IO [Sent w Tag] -> IO Disamb
-- | Prune disamb model: discard model features with absolute values (in
-- log-domain) lower than the given threshold. The threshold is the first
-- argument.
prune :: Double -> Disamb -> Disamb
instance Data.Binary.Class.Binary NLP.Concraft.Disamb.Disamb
module NLP.Concraft
-- | Concraft data: a tagset, an integer parameter, a guessing model and a
-- disambiguation model.
data Concraft
Concraft :: Tagset -> Int -> Guesser Tag -> Disamb -> Concraft
-- | The tagset.
[tagset] :: Concraft -> Tagset
-- | NOTE(review): presumably the number of labels requested from the
-- guessing model for each word -- confirm.
[guessNum] :: Concraft -> Int
-- | The guessing model.
[guesser] :: Concraft -> Guesser Tag
-- | The disambiguation model.
[disamb] :: Concraft -> Disamb
-- | Save model in a file located at the given path. Data is compressed
-- using the gzip format.
saveModel :: FilePath -> Concraft -> IO ()
-- | Load model from a file located at the given path.
--
-- NOTE(review): presumably the file is expected to be in the
-- gzip-compressed format written by saveModel -- confirm.
loadModel :: FilePath -> IO Concraft
-- | Tag sentence using the model. In your code you should probably use
-- your analysis function, translate results into a container of
-- Sentences, evaluate tag on each sentence and embed the
-- tagging results into the morphosyntactic structure of your own.
--
-- The function returns a list of pairs: guessing results (sets of tags)
-- are the first (fst) elements of the output pairs and disambiguation
-- results (single tags) are the second (snd) elements of the
-- corresponding pairs.
tag :: Word w => Concraft -> Sent w Tag -> [(Set Tag, Tag)]
-- | Determine marginal probabilities corresponding to individual tags
-- w.r.t. the disambiguation model. Since the guessing model is used
-- first, the resulting weighted maps corresponding to OOV words may
-- contain tags not present in the input sentence.
--
-- The result is a list of weighted maps of tags.
marginals :: Word w => Concraft -> Sent w Tag -> [WMap Tag]
-- | Train the Concraft model. No reanalysis of the input data will
-- be performed.
--
-- The FromJSON and ToJSON instances are used to store
-- processed input data in temporary files on a disk.
--
-- NOTE(review): in this flattened listing the two TrainConf arguments are
-- unqualified; presumably the first one is the training configuration of
-- the guesser (NLP.Concraft.Guess) and the second one that of the
-- disambiguation model (NLP.Concraft.Disamb). The Int argument is
-- presumably the number of guessed labels, and the two IO actions
-- presumably supply the training and the evaluation data, in this
-- order -- confirm against the implementation.
train :: (Word w, FromJSON w, ToJSON w) => Tagset -> Int -> TrainConf -> TrainConf -> IO [Sent w Tag] -> IO [Sent w Tag] -> IO Concraft
-- | Train the Concraft model after dataset reanalysis. The reanalysis
-- uses the analysis function given as the second argument.
--
-- The FromJSON and ToJSON instances are used to store
-- processed input data in temporary files on a disk.
--
-- NOTE(review): in this flattened listing the two TrainConf arguments are
-- unqualified; presumably the first one is the training configuration of
-- the guesser (NLP.Concraft.Guess) and the second one that of the
-- disambiguation model (NLP.Concraft.Disamb). The Int argument is
-- presumably the number of guessed labels, and the two IO actions
-- presumably supply the training and the evaluation data, in this
-- order -- confirm against the implementation.
reAnaTrain :: (Word w, FromJSON w, ToJSON w) => Tagset -> Analyse w Tag -> Int -> TrainConf -> TrainConf -> IO [SentO w Tag] -> IO [SentO w Tag] -> IO Concraft
-- | Prune disambiguation model of the given Concraft model: discard model
-- features with absolute values (in log-domain) lower than the given
-- threshold. The threshold is the first argument.
prune :: Double -> Concraft -> Concraft
instance Data.Binary.Class.Binary NLP.Concraft.Concraft