-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Morphological disambiguation based on constrained CRFs -- -- A morphological disambiguation library based on constrained -- conditional random fields. @package concraft @version 0.9.3 -- | Types and functions related to the morphosyntax data layer. module NLP.Concraft.Morphosyntax -- | A segment parametrized over a word type and a tag type. data Seg w t Seg :: w -> WMap t -> Seg w t -- | A word represented by the segment. Typically it will be an instance of -- the Word class. word :: Seg w t -> w -- | A set of interpretations. To each interpretation a weight of -- appropriateness within the context is assigned. tags :: Seg w t -> WMap t -- | Map function over segment tags. mapSeg :: Ord b => (a -> b) -> Seg w a -> Seg w b -- | Interpretations of the segment. interpsSet :: Seg w t -> Set t -- | Interpretations of the segment. interps :: Seg w t -> [t] class Word a orth :: Word a => a -> Text oov :: Word a => a -> Bool -- | A sentence. type Sent w t = [Seg w t] -- | Map function over sentence tags. mapSent :: Ord b => (a -> b) -> Sent w a -> Sent w b -- | A sentence with original, textual representation. data SentO w t SentO :: Sent w t -> Text -> SentO w t segs :: SentO w t -> Sent w t orig :: SentO w t -> Text -- | Map function over sentence tags. mapSentO :: Ord b => (a -> b) -> SentO w a -> SentO w b -- | A set with a non-negative weight assigned to each of its elements. data WMap a -- | Map function over weighted collection elements. mapWMap :: Ord b => (a -> b) -> WMap a -> WMap b -- | Make a weighted collection. Negative elements will be ignored. 
mkWMap :: Ord a => [(a, Double)] -> WMap a instance Show a => Show (WMap a) instance Eq a => Eq (WMap a) instance Ord a => Ord (WMap a) instance (Ord a, Binary a) => Binary (WMap a) instance (Show w, Show t) => Show (Seg w t) instance (Show w, Show t) => Show (SentO w t) instance Word w => Word (Seg w t) instance FromJSON w => FromJSON (Seg w Text) instance ToJSON w => ToJSON (Seg w Text) -- | Morphosyntactic analysis utilities. -- -- See the reAnaSent function for a description of how reanalysis is -- performed. At some point it would be nice to change the entire process -- so that sentence-level segmentation is also taken from the reanalysed -- data. module NLP.Concraft.Analysis -- | An analyser performs word-level segmentation and morphological -- analysis. type Analyse w t = Text -> IO (Sent w t) -- | Reanalyse sentence. -- -- From the reference sentence the function takes: -- -- -- -- From the reanalysed sentence the function takes: -- -- reAnaSent :: Word w => Tagset -> Analyse w Tag -> SentO w Tag -> IO (Sent w Tag) -- | Reanalyse paragraph. reAnaPar :: Word w => Tagset -> Analyse w Tag -> [SentO w Tag] -> IO [Sent w Tag] -- | Observation schema blocks for Concraft. module NLP.Concraft.Schema -- | An observation consists of an index (of list type) and an actual -- observation value. type Ob = ([Int], Text) -- | The Ox monad specialized to word token type and text observations. type Ox a = Ox Text a -- | A schema is a block of the Ox computation performed within the context -- of the sentence and the absolute sentence position. type Schema w t a = Vector (Seg w t) -> Int -> Ox a -- | A dummy schema block. void :: a -> Schema w t a -- | Sequence the list of schemas (or blocks) and discard individual -- values. sequenceS_ :: [Vector (Seg w t) -> a -> Ox b] -> Vector (Seg w t) -> a -> Ox () -- | Use the schema to extract observations from the sentence. schematize :: Schema w t a -> Sent w t -> [[Ob]] -- | Body of configuration entry. 
data Body a Body :: [Int] -> Bool -> a -> Body a -- | Range argument for the schema block. range :: Body a -> [Int] -- | When true, the entry is used only for oov words. oovOnly :: Body a -> Bool -- | Additional arguments for the schema block. args :: Body a -> a -- | Maybe entry. type Entry a = Maybe (Body a) -- | Plain entry with no additional arguments. entry :: [Int] -> Entry () -- | Entry with additional arguments. entryWith :: a -> [Int] -> Entry a -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken into -- account. For example, the [-1, 0, 2] range means that -- observations of particular type will be extracted with respect to -- previous (k - 1), current (k) and after the next -- (k + 2) positions when identifying the observation set for -- position k in the input sentence. data SchemaConf SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry () -> Entry () -> Entry () -> Entry () -> SchemaConf -- | The orthB schema block. orthC :: SchemaConf -> Entry () -- | The lowOrthB schema block. lowOrthC :: SchemaConf -> Entry () -- | The lowPrefixesB schema block. The first list of ints -- represents lengths of prefixes. lowPrefixesC :: SchemaConf -> Entry [Int] -- | The lowSuffixesB schema block. The first list of ints -- represents lengths of suffixes. lowSuffixesC :: SchemaConf -> Entry [Int] -- | The knownB schema block. knownC :: SchemaConf -> Entry () -- | The shapeB schema block. shapeC :: SchemaConf -> Entry () -- | The packedB schema block. packedC :: SchemaConf -> Entry () -- | The begPackedB schema block. begPackedC :: SchemaConf -> Entry () -- | Null configuration of the observation schema. nullConf :: SchemaConf -- | Build the schema based on the configuration. fromConf :: Word w => SchemaConf -> Schema w t () -- | A block is a chunk of the Ox computation performed within the context -- of the sentence and the list of absolute sentence positions. 
type Block w t a = Vector (Seg w t) -> [Int] -> Ox a -- | Transform a block to a schema depending on * A list of relative -- sentence positions, * A boolean value; if true, the block computation -- will be performed only on positions where an OOV word resides. fromBlock :: Word w => Block w t a -> [Int] -> Bool -> Schema w t a -- | Orthographic form at the current position. orthB :: Word w => Block w t () -- | Lowercased orthographic form at the current position. lowOrthB :: Word w => Block w t () -- | List of lowercased prefixes of given lengths. lowPrefixesB :: Word w => [Int] -> Block w t () -- | List of lowercased suffixes of given lengths. lowSuffixesB :: Word w => [Int] -> Block w t () -- | Shape of the word. knownB :: Word w => Block w t () -- | Shape of the word. shapeB :: Word w => Block w t () -- | Packed shape of the word. packedB :: Word w => Block w t () -- | Packed shape of the word. begPackedB :: Word w => Block w t () instance Show a => Show (Body a) instance Show SchemaConf instance Binary SchemaConf instance Binary a => Binary (Body a) module NLP.Concraft.Guess -- | A guessing model. data Guesser t Guesser :: SchemaConf -> CRF Ob t -> Guesser t schemaConf :: Guesser t -> SchemaConf crf :: Guesser t -> CRF Ob t -- | Determine the k most probable labels for each word in the -- sentence. TODO: Perhaps it would be better to use sets instead of -- lists as output? guess :: (Word w, Ord t) => Int -> Guesser t -> Sent w t -> [[t]] -- | Insert guessing results into the sentence. Only interpretations of OOV -- words will be extended. include :: (Word w, Ord t) => [[t]] -> Sent w t -> Sent w t -- | Combine guess with include. guessSent :: (Word w, Ord t) => Int -> Guesser t -> Sent w t -> Sent w t -- | Training configuration. data TrainConf TrainConf :: SchemaConf -> SgdArgs -> Bool -> R0T -> TrainConf schemaConfT :: TrainConf -> SchemaConf -- | SGD parameters. 
sgdArgsT :: TrainConf -> SgdArgs -- | Store SGD dataset on disk onDiskT :: TrainConf -> Bool -- | R0 construction method r0T :: TrainConf -> R0T -- | Method of constructing the default set of labels (R0). data R0T -- | See anyInterps AnyInterps :: R0T -- | See anyChosen AnyChosen :: R0T -- | See oovChosen OovChosen :: R0T -- | Train guesser. train :: (Word w, Ord t) => TrainConf -> IO [Sent w t] -> IO [Sent w t] -> IO (Guesser t) instance Typeable R0T instance Show R0T instance Eq R0T instance Ord R0T instance Enum R0T instance Data R0T instance (Ord t, Binary t) => Binary (Guesser t) module NLP.Concraft.Disamb -- | A disambiguation model. data Disamb Disamb :: [Tier] -> SchemaConf -> CRF Ob Atom -> Disamb tiers :: Disamb -> [Tier] schemaConf :: Disamb -> SchemaConf crf :: Disamb -> CRF Ob Atom -- | A tier description. data Tier Tier :: Bool -> Set Attr -> Tier -- | Does it include the part of speech? withPos :: Tier -> Bool -- | Tier grammatical attributes. withAtts :: Tier -> Set Attr -- | An atomic part of morphosyntactic tag with optional POS. data Atom Atom :: Maybe POS -> Map Attr Text -> Atom pos :: Atom -> Maybe POS atts :: Atom -> Map Attr Text -- | Tag labels with marginal probabilities. marginals :: Word w => Disamb -> Sent w Tag -> [WMap Tag] -- | Perform context-sensitive disambiguation. disamb :: Word w => Disamb -> Sent w Tag -> [Tag] -- | Insert disambiguation results into the sentence. include :: (Sent w Tag -> [Tag]) -> Sent w Tag -> Sent w Tag -- | Combine disamb with include. disambSent :: Word w => Disamb -> Sent w Tag -> Sent w Tag -- | Training configuration. data TrainConf TrainConf :: [Tier] -> SchemaConf -> SgdArgs -> Bool -> TrainConf tiersT :: TrainConf -> [Tier] schemaConfT :: TrainConf -> SchemaConf sgdArgsT :: TrainConf -> SgdArgs onDiskT :: TrainConf -> Bool ReTrainConf :: Disamb -> SgdArgs -> Bool -> TrainConf initDmb :: TrainConf -> Disamb sgdArgsT :: TrainConf -> SgdArgs onDiskT :: TrainConf -> Bool -- | Train disamb model. 
train :: Word w => TrainConf -> IO [Sent w Tag] -> IO [Sent w Tag] -> IO Disamb -- | Prune disamb model: discard model features with absolute values (in -- log-domain) lower than the given threshold. prune :: Double -> Disamb -> Disamb instance Binary Disamb -- | Accuracy statistics. module NLP.Concraft.Morphosyntax.Accuracy -- | Statistics. data Stats -- | Number of segments in gold corpus Stats :: Int -> Int -> Stats -- | Number of correct tags good :: Stats -> Int gold :: Stats -> Int -- | Accuracy given stats. accuracy :: Stats -> Double -- | Accuracy weak lower bound. weakLB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats -- | Accuracy weak upper bound. weakUB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats -- | Accuracy strong lower bound. strongLB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats -- | Accuracy strong upper bound. strongUB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats module NLP.Concraft -- | Concraft data. data Concraft Concraft :: Tagset -> Int -> Guesser Tag -> Disamb -> Concraft tagset :: Concraft -> Tagset guessNum :: Concraft -> Int guesser :: Concraft -> Guesser Tag disamb :: Concraft -> Disamb -- | Save model in a file. Data is compressed using the gzip format. saveModel :: FilePath -> Concraft -> IO () -- | Load model from a file. loadModel :: FilePath -> IO Concraft -- | Tag sentence using the model. In your code you should probably use -- your analysis function, translate results into a container of -- Sentences, evaluate tag on each sentence and embed the -- tagging results into the morphosyntactic structure of your own. -- -- The function returns guessing results as fst elements of the -- output pairs and disambiguation results as snd elements of the -- corresponding pairs. tag :: Word w => Concraft -> Sent w Tag -> [(Set Tag, Tag)] -- | Determine marginal probabilities corresponding to individual tags -- w.r.t. the disambiguation model. 
Since the guessing model is used -- first, the resulting weighted maps corresponding to OOV words may -- contain tags not present in the input sentence. marginals :: Word w => Concraft -> Sent w Tag -> [WMap Tag] -- | Train the Concraft model. No reanalysis of the input data will -- be performed. -- -- The FromJSON and ToJSON instances are used to store -- processed input data in temporary files on a disk. train :: (Word w, FromJSON w, ToJSON w) => Tagset -> Int -> TrainConf -> TrainConf -> IO [Sent w Tag] -> IO [Sent w Tag] -> IO Concraft -- | Train the Concraft model after dataset reanalysis. -- -- The FromJSON and ToJSON instances are used to store -- processed input data in temporary files on a disk. reAnaTrain :: (Word w, FromJSON w, ToJSON w) => Tagset -> Analyse w Tag -> Int -> TrainConf -> TrainConf -> IO [SentO w Tag] -> IO [SentO w Tag] -> IO Concraft -- | Prune disambiguation model: discard model features with absolute -- values (in log-domain) lower than the given threshold. prune :: Double -> Concraft -> Concraft instance Binary Concraft