-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Morphological disambiguation based on constrained CRFs -- -- A morphological disambiguation library based on constrained -- conditional random fields. @package concraft @version 0.13.0 -- | Segmentation-level ambiguities. TODO: consider moving the module -- contents to DAG. module NLP.Concraft.DAG.Morphosyntax.Ambiguous -- | Identify ambiguous segments (roughly, segments which can be by-passed) -- in the given DAG. Such ambiguous edges are marked in the resulting DAG -- with True values. identifyAmbiguousSegments :: DAG a b -> DAG a Bool module NLP.Concraft.Morphosyntax.WMap -- | A set with a non-negative weight assigned to each of its elements. data WMap a -- | Create WMap from a map. fromMap :: Map a Double -> WMap a -- | Map function over weighted collection elements. mapWMap :: Ord b => (a -> b) -> WMap a -> WMap b -- | Make a weighted collection. Negative elements will be ignored. mkWMap :: Ord a => [(a, Double)] -> WMap a -- | Trim down the set of potential labels to k most probable -- ones. trim :: (Ord a) => Int -> WMap a -> WMap a instance Data.Binary.Class.Binary a => Data.Binary.Class.Binary (NLP.Concraft.Morphosyntax.WMap.WMap a) instance GHC.Classes.Ord a => GHC.Classes.Ord (NLP.Concraft.Morphosyntax.WMap.WMap a) instance GHC.Classes.Eq a => GHC.Classes.Eq (NLP.Concraft.Morphosyntax.WMap.WMap a) instance GHC.Show.Show a => GHC.Show.Show (NLP.Concraft.Morphosyntax.WMap.WMap a) -- | Types and functions related to the morphosyntax data layer. module NLP.Concraft.Morphosyntax -- | A segment parametrized over a word type and a tag type. data Seg w t Seg :: w -> WMap t -> Seg w t -- | A word represented by the segment. Typically it will be an instance of -- the Word class. [word] :: Seg w t -> w -- | A set of interpretations. To each interpretation a weight of -- appropriateness within the context is assigned.
[tags] :: Seg w t -> WMap t -- | Map function over segment tags. mapSeg :: Ord b => (a -> b) -> Seg w a -> Seg w b -- | Interpretations of the segment. interpsSet :: Seg w t -> Set t -- | Interpretations of the segment. interps :: Seg w t -> [t] class Word a -- | Orthographic form. orth :: Word a => a -> Text -- | Out-of-vocabulary (OOV) word. oov :: Word a => a -> Bool -- | A sentence. type Sent w t = [Seg w t] -- | Map function over sentence tags. mapSent :: Ord b => (a -> b) -> Sent w a -> Sent w b -- | A sentence with original, textual representation. data SentO w t SentO :: Sent w t -> Text -> SentO w t [segs] :: SentO w t -> Sent w t [orig] :: SentO w t -> Text -- | Map function over sentence tags. mapSentO :: Ord b => (a -> b) -> SentO w a -> SentO w b instance (GHC.Show.Show w, GHC.Show.Show t) => GHC.Show.Show (NLP.Concraft.Morphosyntax.SentO w t) instance (GHC.Show.Show w, GHC.Show.Show t) => GHC.Show.Show (NLP.Concraft.Morphosyntax.Seg w t) instance NLP.Concraft.Morphosyntax.Word w => NLP.Concraft.Morphosyntax.Word (NLP.Concraft.Morphosyntax.Seg w t) instance Data.Aeson.Types.ToJSON.ToJSON w => Data.Aeson.Types.ToJSON.ToJSON (NLP.Concraft.Morphosyntax.Seg w Data.Text.Internal.Text) instance Data.Aeson.Types.FromJSON.FromJSON w => Data.Aeson.Types.FromJSON.FromJSON (NLP.Concraft.Morphosyntax.Seg w Data.Text.Internal.Text) -- | Accuracy statistics. module NLP.Concraft.Morphosyntax.Accuracy -- | Statistics. data Stats -- | Number of segments in gold corpus Stats :: Int -> Int -> Stats -- | Number of correct tags [good] :: Stats -> Int [gold] :: Stats -> Int -- | Accuracy given stats. accuracy :: Stats -> Double -- | Accuracy weak lower bound. weakLB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats -- | Accuracy weak upper bound. weakUB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats -- | Accuracy strong lower bound. strongLB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats -- | Accuracy strong upper bound. 
strongUB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats -- | Morphosyntactic analysis utilities. -- -- See reAnaSent function for a description of how reanalysis is -- performed. At some point it would be nice to change the entire process -- so that sentence-level segmentation is also taken from the reanalysed -- data. module NLP.Concraft.Analysis -- | An analyser performs word-level segmentation and morphological -- analysis. type Analyse w t = Text -> IO (Sent w t) -- | Reanalyse sentence. -- -- From the reference sentence the function takes: -- -- -- -- From the reanalysed sentence the function takes: -- -- reAnaSent :: Word w => Tagset -> Analyse w Tag -> SentO w Tag -> IO (Sent w Tag) -- | Reanalyse paragraph. reAnaPar :: Word w => Tagset -> Analyse w Tag -> [SentO w Tag] -> IO [Sent w Tag] -- | Types and functions related to the morphosyntax data layer. module NLP.Concraft.DAG.Morphosyntax -- | A segment parametrized over a word type and a tag type. data Seg w t Seg :: w -> WMap t -> Seg w t -- | A word represented by the segment. Typically it will be an instance of -- the Word class. [word] :: Seg w t -> w -- | A set of interpretations. To each interpretation a weight of -- appropriateness within the context is assigned. [tags] :: Seg w t -> WMap t -- | Map function over segment tags. mapSeg :: Ord b => (a -> b) -> Seg w a -> Seg w b -- | Interpretations of the segment. interpsSet :: Seg w t -> Set t -- | Interpretations of the segment. interps :: Seg w t -> [t] class Word a -- | Orthographic form. orth :: Word a => a -> Text -- | Out-of-vocabulary (OOV) word. oov :: Word a => a -> Bool -- | A sentence. type Sent w t = [Seg w t] type Sent w t = DAG () (Seg w t) -- | Map function over sentence tags. mapSent :: Ord b => (a -> b) -> Sent w a -> Sent w b -- | A sentence with original, textual representation.
data SentO w t SentO :: Sent w t -> Text -> SentO w t [segs] :: SentO w t -> Sent w t [orig] :: SentO w t -> Text -- | Map function over sentence tags. mapSentO :: Ord b => (a -> b) -> SentO w a -> SentO w b instance (GHC.Show.Show w, GHC.Show.Show t) => GHC.Show.Show (NLP.Concraft.DAG.Morphosyntax.Seg w t) instance NLP.Concraft.DAG.Morphosyntax.Word w => NLP.Concraft.DAG.Morphosyntax.Word (NLP.Concraft.DAG.Morphosyntax.Seg w t) instance Data.Aeson.Types.ToJSON.ToJSON w => Data.Aeson.Types.ToJSON.ToJSON (NLP.Concraft.DAG.Morphosyntax.Seg w Data.Text.Internal.Text) instance Data.Aeson.Types.FromJSON.FromJSON w => Data.Aeson.Types.FromJSON.FromJSON (NLP.Concraft.DAG.Morphosyntax.Seg w Data.Text.Internal.Text) -- | Baseline word-segmentation functions. module NLP.Concraft.DAG.Segmentation -- | Which path type to search: shortest (Min) or longest -- (Max) data PathTyp Min :: PathTyp Max :: PathTyp Freq :: FreqConf -> PathTyp -- | Select the shortest-path (or longest, depending on PathTyp) in -- the given DAG and remove all the edges which are not on this path. pickPath :: (Word b) => PathTyp -> DAG a b -> DAG a b -- | Retrieve the edges which belong to the shortest/longest (depending on -- the argument function: minimum or maximum) path in the -- given DAG. findPath :: (Word b) => PathTyp -> DAG a b -> Set EdgeID -- | Compute chosen/not-chosen counts of the individual orthographic forms -- in the DAGs. Only the ambiguous segments are taken into account. computeFreqs :: (Word w) => [Sent w t] -> Map Text (Int, Int) -- | Configuration related to frequency-based path picking. data FreqConf FreqConf :: Map Text (Int, Int) -> Double -> FreqConf -- | A map which assigns (chosen, not chosen) counts to the individual -- orthographic forms (see computeFreqs). [pickFreqMap] :: FreqConf -> Map Text (Int, Int) -- | A naive smoothing related parameter, which should be added to each -- count in pickFreqMap.
, orth :: DAG.EdgeID -> T.Text -- ^ -- Orthographic form of a given edge [smoothingParam] :: FreqConf -> Double -- | Compute: * the number of tokens participating in ambiguities * the -- total number of tokens computeAmbiStats :: (Word w) => AmbiCfg -> [Sent w t] -> AmbiStats -- | Numbers of tokens. data AmbiCfg AmbiCfg :: Bool -> AmbiCfg -- | Only take the chosen tokens into account [onlyChosen] :: AmbiCfg -> Bool -- | Numbers of tokens. data AmbiStats AmbiStats :: !Int -> !Int -> AmbiStats -- | Ambiguous tokens [ambi] :: AmbiStats -> !Int -- | All tokens [total] :: AmbiStats -> !Int instance GHC.Classes.Ord NLP.Concraft.DAG.Segmentation.AmbiStats instance GHC.Classes.Eq NLP.Concraft.DAG.Segmentation.AmbiStats instance GHC.Show.Show NLP.Concraft.DAG.Segmentation.AmbiStats instance GHC.Classes.Ord NLP.Concraft.DAG.Segmentation.AmbiCfg instance GHC.Classes.Eq NLP.Concraft.DAG.Segmentation.AmbiCfg instance GHC.Show.Show NLP.Concraft.DAG.Segmentation.AmbiCfg -- | Observation schema blocks for Concraft. module NLP.Concraft.DAG.Schema -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) -- | The Ox monad specialized to word token type and text observations. type Ox a = Ox Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema w t a = Sent w t -> EdgeID -> Ox a -- | A dummy schema block. TODO: is it a monad, an applicative? void :: a -> Schema w t a -- | Sequence the list of schemas (or blocks) and discard individual -- values. sequenceS_ :: [Sent w t -> a -> Ox b] -> Sent w t -> a -> Ox () -- | Use the schema to extract observations from the sentence. schematize :: Schema w t a -> Sent w t -> DAG () [Ob] -- | Body of configuration entry. data Body a Body :: [Int] -> Bool -> a -> Body a -- | Range argument for the schema block. [range] :: Body a -> [Int] -- | When true, the entry is used only for oov words.
[oovOnly] :: Body a -> Bool -- | Additional arguments for the schema block. [args] :: Body a -> a -- | Maybe entry. type Entry a = Maybe (Body a) -- | Plain entry with no additional arguments. entry :: [Int] -> Entry () -- | Entry with additional arguments. entryWith :: a -> [Int] -> Entry a -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken into -- account. For example, the [-1, 0, 2] range means that -- observations of particular type will be extracted with respect to -- previous (k - 1), current (k) and after the next -- (k + 2) positions when identifying the observation set for -- position k in the input sentence. data SchemaConf SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry () -> Entry () -> Entry () -> Entry () -> SchemaConf -- | The orthB schema block. [orthC] :: SchemaConf -> Entry () -- | The lowOrthB schema block. [lowOrthC] :: SchemaConf -> Entry () -- | The lowPrefixesB schema block. The first list of ints -- represents lengths of prefixes. [lowPrefixesC] :: SchemaConf -> Entry [Int] -- | The lowSuffixesB schema block. The first list of ints -- represents lengths of suffixes. [lowSuffixesC] :: SchemaConf -> Entry [Int] -- | The knownB schema block. [knownC] :: SchemaConf -> Entry () -- | The shapeB schema block. [shapeC] :: SchemaConf -> Entry () -- | The packedB schema block. [packedC] :: SchemaConf -> Entry () -- | The begPackedB schema block. [begPackedC] :: SchemaConf -> Entry () -- | Null configuration of the observation schema. nullConf :: SchemaConf -- | Build the schema based on the configuration. fromConf :: Word w => SchemaConf -> Schema w t () -- | A block is a chunk of the Ox computation performed within the context -- of the sentence and the list of absolute sentence positions.
type Block w t a = Sent w t -> [EdgeID] -> Ox a -- | Transform a block to a schema depending on * A list of relative -- sentence positions, * A boolean value; if true, the block computation -- will be performed only on positions where an OOV word resides. fromBlock :: Word w => Block w t a -> [Int] -> Bool -> Schema w t a -- | Orthographic form at the current position. orthB :: Word w => Block w t () -- | Orthographic form at the current position. lowOrthB :: Word w => Block w t () -- | List of lowercased prefixes of given lengths. lowPrefixesB :: Word w => [Int] -> Block w t () -- | List of lowercased suffixes of given lengths. lowSuffixesB :: Word w => [Int] -> Block w t () -- | Shape of the word. knownB :: Word w => Block w t () -- | Shape of the word. shapeB :: Word w => Block w t () -- | Packed shape of the word. packedB :: Word w => Block w t () -- | Packed shape of the word. begPackedB :: Word w => Block w t () instance GHC.Show.Show NLP.Concraft.DAG.Schema.SchemaConf instance GHC.Show.Show a => GHC.Show.Show (NLP.Concraft.DAG.Schema.Body a) instance Data.Binary.Class.Binary NLP.Concraft.DAG.Schema.SchemaConf instance Data.Binary.Class.Binary a => Data.Binary.Class.Binary (NLP.Concraft.DAG.Schema.Body a) -- | Accuracy statistics. module NLP.Concraft.DAG.Morphosyntax.Accuracy -- | True positives, false positives, etc. data Stats Stats :: !Int -> !Int -> !Int -> !Int -> !Int -> Stats -- | True positive [tp] :: Stats -> !Int -- | False positive [fp] :: Stats -> !Int -- | True negative [tn] :: Stats -> !Int -- | False negative [fn] :: Stats -> !Int -- | Consistency error (number of edges for which both fp and -- fn hold) [ce] :: Stats -> !Int -- | Configuration of accuracy computation. 
data AccCfg x AccCfg :: Bool -> Bool -> Set x -> Tagset -> Bool -> Bool -> Bool -> Bool -> Bool -> AccCfg x -- | Limit calculations to OOV words [onlyOov] :: AccCfg x -> Bool -- | Limit calculations to segmentation-ambiguous words [onlyAmb] :: AccCfg x -> Bool -- | Limit calculations to segments marked with one of the given labels; if -- empty, the option has no effect [onlyMarkedWith] :: AccCfg x -> Set x -- | The underlying tagset [accTagset] :: AccCfg x -> Tagset -- | Should the tags be expanded? [expandTag] :: AccCfg x -> Bool -- | Compute segmentation-level accuracy. The actually chosen tags are -- ignored, only information about the chosen DAG edges is relevant. [ignoreTag] :: AccCfg x -> Bool -- | If weak, there has to be an overlap in the tags assigned to a given -- segment in both datasets. Otherwise, the two sets of tags have to be -- identical. [weakAcc] :: AccCfg x -> Bool -- | Whether sentences with near 0 probability should be discarded from -- evaluation. [discardProb0] :: AccCfg x -> Bool -- | Print information about compared elements [verbose] :: AccCfg x -> Bool -- | Compute the accuracy of the model with respect to the labeled dataset. -- To each Tag an additional information x can be -- assigned, which will be taken into account when computing statistics. collect :: (Word w, Ord x, Show x) => AccCfg x -> [Sent w (Tag, x)] -> [Sent w (Tag, x)] -> Stats precision :: Stats -> Double recall :: Stats -> Double accuracy :: Stats -> Double instance GHC.Classes.Ord NLP.Concraft.DAG.Morphosyntax.Accuracy.Stats instance GHC.Classes.Eq NLP.Concraft.DAG.Morphosyntax.Accuracy.Stats instance GHC.Show.Show NLP.Concraft.DAG.Morphosyntax.Accuracy.Stats module NLP.Concraft.DAG.Guess -- | A guessing model.
data Guesser t s Guesser :: SchemaConf -> CRF Ob s -> s -> Set t -> t -> s -> s -> t -> Guesser t s [schemaConf] :: Guesser t s -> SchemaConf [crf] :: Guesser t s -> CRF Ob s [zeroProbLab] :: Guesser t s -> s -- | The tagset considered for the unknown words (TODO: a solution parallel -- and not 100% consistent with what is implemented in the CRF library) -- TODO: with complexify, unkTagSet is not needed anymore! [unkTagSet] :: Guesser t s -> Set t -- | A tag simplification function [simplify] :: Guesser t s -> t -> s -- | NEW: instead of an unkTagSet, a function which makes a complex -- tag out of a simple tag. -- -- WARNING: we assume, that this function does not conflate simplified -- tags, i.e., two tags of type s cannot lead to one and the -- same complex tag of type t. [complexify] :: Guesser t s -> s -> t -- | Store the entire guessing model apart from the simplification -- function. putGuesser :: (Binary t, Binary s, Ord s) => Guesser t s -> Put -- | Get the disambiguation model, provided the simplification function. -- getGuesser :: (M.Map t T.Tag) -> Get (Guesser t) getGuesser :: (Binary t, Binary s, Ord s, Ord t) => (t -> s) -> (s -> t) -> Get (Guesser t s) -- | Determine the marginal probabilities of the individual labels in the -- sentence. marginals :: (Word w, Ord t, Ord s) => Config s -> Guesser t s -> Sent w t -> DAG () (WMap t) -- | Replace the probabilities of the sentence labels with the marginal -- probabilities stemming from the model. marginalsSent :: (Word w, Ord t, Ord s) => Config s -> Guesser t s -> Sent w t -> Sent w t -- | Training configuration. data TrainConf t s TrainConf :: SchemaConf -> SgdArgs -> Bool -> R0T -> t -> t -> s -> s -> t -> t -> t -> Bool -> TrainConf t s [schemaConfT] :: TrainConf t s -> SchemaConf -- | SGD parameters.
[sgdArgsT] :: TrainConf t s -> SgdArgs -- | Store SGD dataset on disk [onDiskT] :: TrainConf t s -> Bool -- | R0 construction method [r0T] :: TrainConf t s -> R0T -- | Zero probability label [zeroProbLabel] :: TrainConf t s -> t -- | Label simplification function [simplifyLabel] :: TrainConf t s -> t -> s -- | Label complexification function [complexifyLabel] :: TrainConf t s -> s -> t -- | Strip the label from irrelevant information. Used to determine the set -- of possible tags for unknown words. TODO: we don't need this with -- complexify anymore!? [stripLabel] :: TrainConf t s -> t -> t -- | Guess only visible features [onlyVisible] :: TrainConf t s -> Bool -- | Method of constructing the default set of labels (R0). data R0T -- | See anyInterps AnyInterps :: R0T -- | See anyChosen AnyChosen :: R0T -- | See oovChosen OovChosen :: R0T -- | Train guesser. train :: (Word w, Ord t, Ord s) => TrainConf t s -> IO [Sent w t] -> IO [Sent w t] -> IO (Guesser t s) -- | Schematized dataset. schemed :: (Word w, Ord t, Ord s) => (t -> s) -> Schema w s a -> [Sent w t] -> [SentL Ob s] instance Data.Data.Data NLP.Concraft.DAG.Guess.R0T instance GHC.Enum.Enum NLP.Concraft.DAG.Guess.R0T instance GHC.Classes.Ord NLP.Concraft.DAG.Guess.R0T instance GHC.Classes.Eq NLP.Concraft.DAG.Guess.R0T instance GHC.Show.Show NLP.Concraft.DAG.Guess.R0T module NLP.Concraft.DAG.Disamb -- | A disambiguation model. data Disamb t Disamb :: [Tier] -> SchemaConf -> CRF Ob Atom -> t -> Tag -> Disamb t [tiers] :: Disamb t -> [Tier] [schemaConf] :: Disamb t -> SchemaConf [crf] :: Disamb t -> CRF Ob Atom -- | A map which simplifies the tags of generic type t to -- simplified positional tags. The motivation behind this is that tags -- can have a richer structure. -- -- NOTE: it can happen in real situations that a tag is encountered which -- is not known by the model. It would be nice to be able to treat it as -- the closest tag that can be handled. 
Then, one has to define the -- notion of the similarity between tags, though... But probably it -- should be done at a different level (where more information about the -- structure of t is known) [simplify] :: Disamb t -> t -> Tag -- | Store the entire disambiguation model apart from the simplification -- function. putDisamb :: Disamb t -> Put -- | Get the disambiguation model, provided the simplification function. -- getDisamb :: (M.Map t T.Tag) -> Get (Disamb t) getDisamb :: (t -> Tag) -> Get (Disamb t) -- | A tier description. data Tier Tier :: Bool -> Bool -> Set Attr -> Tier -- | Does it include the part of speech? [withPos] :: Tier -> Bool -- | End-of-sentence marker. [withEos] :: Tier -> Bool -- | Tier grammatical attributes. [withAtts] :: Tier -> Set Attr -- | An atomic part of morphosyntactic tag with optional POS. data Atom Atom :: Maybe POS -> Map Attr Text -> Maybe Bool -> Atom [pos] :: Atom -> Maybe POS [atts] :: Atom -> Map Attr Text -- | NOTE: could be simplified to Bool, but this way it's more readable [eos] :: Atom -> Maybe Bool -- | Type of resulting probabilities. data ProbType -- | Marginal probabilities Marginals :: ProbType -- | TODO MaxProbs :: ProbType -- | Determine the marginal probabilities of the individual labels in the -- sentence. marginalsSent :: (X.Word w, Ord t) => Disamb t -> -- X.Sent w t -> DAG () (X.WMap [P.Atom]) probsSent :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> Sent w t -- | Determine the marginal probabilities of the individual labels in the -- sentence. probs :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> DAG () (WMap t) -- | Training configuration.
data TrainConf t TrainConf :: [Tier] -> SchemaConf -> SgdArgs -> Bool -> t -> Tag -> TrainConf t [tiersT] :: TrainConf t -> [Tier] [schemaConfT] :: TrainConf t -> SchemaConf [sgdArgsT] :: TrainConf t -> SgdArgs [onDiskT] :: TrainConf t -> Bool -- | Label simplification function [simplifyLabel] :: TrainConf t -> t -> Tag -- | Train disambiguation module. train :: (Word w, Ord t) => TrainConf t -> IO [Sent w t] -> IO [Sent w t] -> IO (Disamb t) -- | Prune disamb model: discard model features with absolute values (in -- log-domain) lower than the given threshold. prune :: Double -> Disamb t -> Disamb t -- | Schematize the input sentence according to schema rules. schematize :: Schema w [t] a -> Sent w [t] -> Sent Ob t -- | A version of the disambiguation model adapted to perform sentence -- segmentation as well. module NLP.Concraft.DAG.DisambSeg -- | The internal tag type. data Tag Tag :: Tag -> Bool -> Tag -- | Positional tag [posiTag] :: Tag -> Tag -- | End-of-sentence marker [hasEos] :: Tag -> Bool -- | A disambiguation model. data Disamb t Disamb :: [Tier] -> SchemaConf -> CRF Ob Atom -> t -> Tag -> Disamb t [tiers] :: Disamb t -> [Tier] [schemaConf] :: Disamb t -> SchemaConf [crf] :: Disamb t -> CRF Ob Atom -- | A function which simplifies the tags of the generic type t to -- (i) the corresponding positional tags and (ii) information if the -- segment represents sentence end. -- -- NOTE: it can happen in real situations that a tag is encountered which -- is not known by the model. It would be nice to be able to treat it as -- the closest tag that can be handled. Then, one has to define the -- notion of the similarity between tags, though... But probably it -- should be done at a different level (where more information about the -- structure of t is known) [simplify] :: Disamb t -> t -> Tag -- | Store the entire disambiguation model apart from the simplification -- function.
putDisamb :: Disamb t -> Put -- | Get the disambiguation model, provided the simplification function. -- getDisamb :: (M.Map t T.Tag) -> Get (Disamb t) getDisamb :: (t -> Tag) -> Get (Disamb t) -- | A tier description. data Tier Tier :: Bool -> Bool -> Set Attr -> Tier -- | Does it include the part of speech? [withPos] :: Tier -> Bool -- | End-of-sentence marker. [withEos] :: Tier -> Bool -- | Tier grammatical attributes. [withAtts] :: Tier -> Set Attr -- | An atomic part of morphosyntactic tag with optional POS. data Atom Atom :: Maybe POS -> Map Attr Text -> Maybe Bool -> Atom [pos] :: Atom -> Maybe POS [atts] :: Atom -> Map Attr Text -- | NOTE: could be simplified to Bool, but this way it's more readable [eos] :: Atom -> Maybe Bool -- | Type of resulting probabilities. data ProbType -- | Marginal probabilities Marginals :: ProbType -- | TODO MaxProbs :: ProbType -- | Determine the marginal probabilities of the individual labels in the -- sentence. marginalsSent :: (X.Word w, Ord t) => Disamb t -> -- X.Sent w t -> DAG () (X.WMap [P.Atom]) probsSent :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> Sent w t -- | Determine the marginal probabilities of the individual labels in the -- sentence. probs :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> DAG () (WMap t) -- | Training configuration. data TrainConf t TrainConf :: [Tier] -> SchemaConf -> SgdArgs -> Bool -> t -> Tag -> TrainConf t [tiersT] :: TrainConf t -> [Tier] [schemaConfT] :: TrainConf t -> SchemaConf [sgdArgsT] :: TrainConf t -> SgdArgs [onDiskT] :: TrainConf t -> Bool -- | Label simplification function [simplifyLabel] :: TrainConf t -> t -> Tag -- | Train disambiguation module. train :: (Word w, Ord t) => TrainConf t -> IO [Sent w t] -> IO [Sent w t] -> IO (Disamb t) -- | Prune disamb model: discard model features with absolute values (in -- log-domain) lower than the given threshold.
prune :: Double -> Disamb t -> Disamb t instance GHC.Classes.Ord NLP.Concraft.DAG.DisambSeg.Tag instance GHC.Classes.Eq NLP.Concraft.DAG.DisambSeg.Tag instance GHC.Show.Show NLP.Concraft.DAG.DisambSeg.Tag -- | Top-level module adapted to DAGs, guessing and disambiguation. module NLP.Concraft.DAGSeg -- | Concraft data. data Concraft t Concraft :: Tagset -> Int -> Guesser t Tag -> Disamb t -> Disamb t -> Concraft t [tagset] :: Concraft t -> Tagset [guessNum] :: Concraft t -> Int [guesser] :: Concraft t -> Guesser t Tag [segmenter] :: Concraft t -> Disamb t [disamb] :: Concraft t -> Disamb t -- | Save model in a file. Data is compressed using the gzip format. saveModel :: (Ord t, Binary t) => FilePath -> Concraft t -> IO () -- | Load model from a file. loadModel :: (Ord t, Binary t) => (Tagset -> t -> Tag) -> (Tagset -> Tag -> t) -> (Tagset -> t -> Tag) -> FilePath -> IO (Concraft t) -- | DAG annotation, assigns b values to a labels for -- each edge in the graph. type Anno a b = DAG () (Map a b) -- | Find all optimal paths in the given annotation. Optimal paths are -- those which go through tags with the assigned probability 1. For a -- given chosen edge, all the tags with probability 1 are selected. findOptimalPaths :: Ord t => Anno t Double -> [[(EdgeID, Set t)]] -- | Mark the given path with disamb markers in the given annotation and -- produce a new disamb annotation. disambPath :: (Ord t) => [(EdgeID, Set t)] -> Anno t Double -> Anno t Bool -- | Determine marginal probabilities corresponding to individual tags -- w.r.t. the guessing model. guessMarginals :: (Word w, Ord t) => Config Tag -> Guesser t Tag -> Sent w t -> Anno t Double -- | Determine marginal probabilities corresponding to individual tags -- w.r.t. the disambiguation model. disambMarginals :: (Word w, Ord t) => Disamb t -> Sent w t -> Anno t Double -- | Determine probabilities corresponding to individual tags w.r.t. the -- disambiguation model.
disambProbs :: (Word w, Ord t) => ProbType -> Disamb t -> Sent w t -> Anno t Double -- | Extend the OOV words with new, guessed interpretations. -- -- Determine marginal probabilities corresponding to individual tags -- w.r.t. the guessing model and, afterwards, trim the sentence to keep -- only the k most probable labels for each OOV edge. Note that, -- for OOV words, the entire set of default tags is considered. guessSent :: (Word w, Ord t) => Int -> Config Tag -> Guesser t Tag -> Sent w t -> Sent w t -- | Perform guessing, trimming, and finally determine marginal -- probabilities corresponding to individual tags w.r.t. the guessing -- model. guess :: (Word w, Ord t) => Int -> Config Tag -> Guesser t Tag -> Sent w t -> Anno t Double -- | Perform guessing, trimming, and finally determine marginal -- probabilities corresponding to individual tags w.r.t. the -- disambiguation model. tag :: (Word w, Ord t) => Int -> Config Tag -> Concraft t -> Sent w t -> Anno t Double -- | Prune the disambiguation model: discard model features with absolute -- values (in log-domain) lower than the given threshold. prune :: Double -> Concraft t -> Concraft t -- | Observation schema blocks for Concraft. module NLP.Concraft.Schema -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) -- | The Ox monad specialized to word token type and text observations. type Ox a = Ox Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema w t a = Vector (Seg w t) -> Int -> Ox a -- | A dummy schema block. void :: a -> Schema w t a -- | Sequence the list of schemas (or blocks) and discard individual -- values. sequenceS_ :: [Vector (Seg w t) -> a -> Ox b] -> Vector (Seg w t) -> a -> Ox () -- | Use the schema to extract observations from the sentence. schematize :: Schema w t a -> Sent w t -> [[Ob]] -- | Body of configuration entry.
data Body a Body :: [Int] -> Bool -> a -> Body a -- | Range argument for the schema block. [range] :: Body a -> [Int] -- | When true, the entry is used only for oov words. [oovOnly] :: Body a -> Bool -- | Additional arguments for the schema block. [args] :: Body a -> a -- | Maybe entry. type Entry a = Maybe (Body a) -- | Plain entry with no additional arguments. entry :: [Int] -> Entry () -- | Entry with additional arguments. entryWith :: a -> [Int] -> Entry a -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken into -- account. For example, the [-1, 0, 2] range means that -- observations of particular type will be extracted with respect to -- previous (k - 1), current (k) and after the next -- (k + 2) positions when identifying the observation set for -- position k in the input sentence. data SchemaConf SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry () -> Entry () -> Entry () -> Entry () -> SchemaConf -- | The orthB schema block. [orthC] :: SchemaConf -> Entry () -- | The lowOrthB schema block. [lowOrthC] :: SchemaConf -> Entry () -- | The lowPrefixesB schema block. The first list of ints -- represents lengths of prefixes. [lowPrefixesC] :: SchemaConf -> Entry [Int] -- | The lowSuffixesB schema block. The first list of ints -- represents lengths of suffixes. [lowSuffixesC] :: SchemaConf -> Entry [Int] -- | The knownB schema block. [knownC] :: SchemaConf -> Entry () -- | The shapeB schema block. [shapeC] :: SchemaConf -> Entry () -- | The packedB schema block. [packedC] :: SchemaConf -> Entry () -- | The begPackedB schema block. [begPackedC] :: SchemaConf -> Entry () -- | Null configuration of the observation schema. nullConf :: SchemaConf -- | Build the schema based on the configuration.
fromConf :: Word w => SchemaConf -> Schema w t () -- | A block is a chunk of the Ox computation performed within the context -- of the sentence and the list of absolute sentence positions. type Block w t a = Vector (Seg w t) -> [Int] -> Ox a -- | Transform a block to a schema depending on * A list of relative -- sentence positions, * A boolean value; if true, the block computation -- will be performed only on positions where an OOV word resides. fromBlock :: Word w => Block w t a -> [Int] -> Bool -> Schema w t a -- | Orthographic form at the current position. orthB :: Word w => Block w t () -- | Orthographic form at the current position. lowOrthB :: Word w => Block w t () -- | List of lowercased prefixes of given lengths. lowPrefixesB :: Word w => [Int] -> Block w t () -- | List of lowercased suffixes of given lengths. lowSuffixesB :: Word w => [Int] -> Block w t () -- | Shape of the word. knownB :: Word w => Block w t () -- | Shape of the word. shapeB :: Word w => Block w t () -- | Packed shape of the word. packedB :: Word w => Block w t () -- | Packed shape of the word. begPackedB :: Word w => Block w t () instance GHC.Show.Show NLP.Concraft.Schema.SchemaConf instance GHC.Show.Show a => GHC.Show.Show (NLP.Concraft.Schema.Body a) instance Data.Binary.Class.Binary NLP.Concraft.Schema.SchemaConf instance Data.Binary.Class.Binary a => Data.Binary.Class.Binary (NLP.Concraft.Schema.Body a) module NLP.Concraft.Guess -- | A guessing model. data Guesser t Guesser :: SchemaConf -> CRF Ob t -> Guesser t [schemaConf] :: Guesser t -> SchemaConf [crf] :: Guesser t -> CRF Ob t -- | Determine the k most probable labels for each word in the -- sentence. TODO: Perhaps it would be better to use sets instead of -- lists as output? guess :: (Word w, Ord t) => Int -> Guesser t -> Sent w t -> [[t]] -- | Insert guessing results into the sentence. Only interpretations of OOV -- words will be extended. 
include :: (Word w, Ord t) => [[t]] -> Sent w t -> Sent w t -- | Combine guess with include. guessSent :: (Word w, Ord t) => Int -> Guesser t -> Sent w t -> Sent w t -- | Training configuration. data TrainConf TrainConf :: SchemaConf -> SgdArgs -> Bool -> R0T -> TrainConf [schemaConfT] :: TrainConf -> SchemaConf -- | SGD parameters. [sgdArgsT] :: TrainConf -> SgdArgs -- | Store SGD dataset on disk [onDiskT] :: TrainConf -> Bool -- | R0 construction method [r0T] :: TrainConf -> R0T -- | Method of constructing the default set of labels (R0). data R0T -- | See anyInterps AnyInterps :: R0T -- | See anyChosen AnyChosen :: R0T -- | See oovChosen OovChosen :: R0T -- | Train guesser. train :: (Word w, Ord t) => TrainConf -> IO [Sent w t] -> IO [Sent w t] -> IO (Guesser t) instance Data.Data.Data NLP.Concraft.Guess.R0T instance GHC.Enum.Enum NLP.Concraft.Guess.R0T instance GHC.Classes.Ord NLP.Concraft.Guess.R0T instance GHC.Classes.Eq NLP.Concraft.Guess.R0T instance GHC.Show.Show NLP.Concraft.Guess.R0T instance (GHC.Classes.Ord t, Data.Binary.Class.Binary t) => Data.Binary.Class.Binary (NLP.Concraft.Guess.Guesser t) module NLP.Concraft.Disamb -- | A disambiguation model. data Disamb Disamb :: [Tier] -> SchemaConf -> CRF Ob Atom -> Disamb [tiers] :: Disamb -> [Tier] [schemaConf] :: Disamb -> SchemaConf [crf] :: Disamb -> CRF Ob Atom -- | A tier description. data Tier Tier :: Bool -> Bool -> Set Attr -> Tier -- | Does it include the part of speech? [withPos] :: Tier -> Bool -- | End-of-sentence marker. [withEos] :: Tier -> Bool -- | Tier grammatical attributes. [withAtts] :: Tier -> Set Attr -- | An atomic part of morphosyntactic tag with optional POS. data Atom Atom :: Maybe POS -> Map Attr Text -> Maybe Bool -> Atom [pos] :: Atom -> Maybe POS [atts] :: Atom -> Map Attr Text -- | NOTE: could be simplified to Bool, but this way it's more readable [eos] :: Atom -> Maybe Bool -- | Tag labels with marginal probabilities. 
marginals :: Word w => Disamb -> Sent w Tag -> [WMap Tag] -- | Perform context-sensitive disambiguation. disamb :: Word w => Disamb -> Sent w Tag -> [Tag] -- | Insert disambiguation results into the sentence. include :: (Sent w Tag -> [Tag]) -> Sent w Tag -> Sent w Tag -- | Combine disamb with include. disambSent :: Word w => Disamb -> Sent w Tag -> Sent w Tag -- | Training configuration. data TrainConf TrainConf :: [Tier] -> SchemaConf -> SgdArgs -> Bool -> TrainConf [tiersT] :: TrainConf -> [Tier] [schemaConfT] :: TrainConf -> SchemaConf [sgdArgsT] :: TrainConf -> SgdArgs [onDiskT] :: TrainConf -> Bool ReTrainConf :: Disamb -> SgdArgs -> Bool -> TrainConf [initDmb] :: TrainConf -> Disamb [sgdArgsT] :: TrainConf -> SgdArgs [onDiskT] :: TrainConf -> Bool -- | Train the disambiguation model. train :: Word w => TrainConf -> IO [Sent w Tag] -> IO [Sent w Tag] -> IO Disamb -- | Prune the disambiguation model: discard model features with absolute -- values (in log-domain) lower than the given threshold. prune :: Double -> Disamb -> Disamb instance Data.Binary.Class.Binary NLP.Concraft.Disamb.Disamb module NLP.Concraft -- | Concraft data. data Concraft Concraft :: Tagset -> Int -> Guesser Tag -> Disamb -> Concraft [tagset] :: Concraft -> Tagset [guessNum] :: Concraft -> Int [guesser] :: Concraft -> Guesser Tag [disamb] :: Concraft -> Disamb -- | Save the model to a file. Data is compressed using the gzip format. saveModel :: FilePath -> Concraft -> IO () -- | Load the model from a file. loadModel :: FilePath -> IO Concraft -- | Tag a sentence using the model. In your code you should probably use -- your own analysis function, translate its results into a container of -- Sentences, evaluate tag on each sentence and embed the -- tagging results into your own morphosyntactic structure. -- -- The function returns guessing results as fst elements of the -- output pairs and disambiguation results as snd elements of the -- corresponding pairs.
tag :: Word w => Concraft -> Sent w Tag -> [(Set Tag, Tag)] -- | Determine marginal probabilities corresponding to individual tags -- w.r.t. the disambiguation model. Since the guessing model is used -- first, the resulting weighted maps corresponding to OOV words may -- contain tags not present in the input sentence. marginals :: Word w => Concraft -> Sent w Tag -> [WMap Tag] -- | Train the Concraft model. No reanalysis of the input data will -- be performed. -- -- The FromJSON and ToJSON instances are used to store -- processed input data in temporary files on disk. -- -- NOTE(review): the two TrainConf arguments presumably configure the -- guesser and the disambiguation model, respectively; confirm against -- the implementation. train :: (Word w, FromJSON w, ToJSON w) => Tagset -> Int -> TrainConf -> TrainConf -> IO [Sent w Tag] -> IO [Sent w Tag] -> IO Concraft -- | Train the Concraft model after dataset reanalysis. -- -- The FromJSON and ToJSON instances are used to store -- processed input data in temporary files on disk. -- -- NOTE(review): the two TrainConf arguments presumably configure the -- guesser and the disambiguation model, respectively; confirm against -- the implementation. reAnaTrain :: (Word w, FromJSON w, ToJSON w) => Tagset -> Analyse w Tag -> Int -> TrainConf -> TrainConf -> IO [SentO w Tag] -> IO [SentO w Tag] -> IO Concraft -- | Prune the disambiguation model: discard model features with absolute -- values (in log-domain) lower than the given threshold. prune :: Double -> Concraft -> Concraft instance Data.Binary.Class.Binary NLP.Concraft.Concraft