-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Morphological disambiguation based on constrained CRFs
--
-- A morphological disambiguation library based on constrained
-- conditional random fields.
@package concraft
@version 0.8.2
-- | Types and functions related to the morphosyntax data layer.
module NLP.Concraft.Morphosyntax
-- | A segment parametrized over a word type and a tag type.
data Seg w t
-- | Build a segment from a word and a weighted set of its
-- interpretations.
Seg :: w -> WMap t -> Seg w t
-- | The word represented by the segment. Typically it will be an instance
-- of the Word class.
word :: Seg w t -> w
-- | The set of interpretations of the segment. Each interpretation is
-- assigned a weight of appropriateness within the context.
tags :: Seg w t -> WMap t
-- | Map a function over the tags of a segment.
mapSeg :: Ord b => (a -> b) -> Seg w a -> Seg w b
-- | Interpretations of the segment, as a set.
interpsSet :: Seg w t -> Set t
-- | Interpretations of the segment, as a list.
interps :: Seg w t -> [t]
-- | Class of word types. A word exposes a textual form and a boolean
-- flag.
class Word a
-- | Textual form of the word. NOTE(review): presumably the orthographic
-- form, judging by the name and the orthB schema block; confirm.
orth :: Word a => a -> Text
-- | NOTE(review): presumably tells whether the word is out-of-vocabulary
-- (OOV), as referred to by the schema configuration; confirm.
oov :: Word a => a -> Bool
-- | A sentence, i.e. a list of segments.
type Sent w t = [Seg w t]
-- | Map a function over the tags of a sentence.
mapSent :: Ord b => (a -> b) -> Sent w a -> Sent w b
-- | A sentence together with its original, textual representation.
data SentO w t
-- | Pair a sentence with its original text.
SentO :: Sent w t -> Text -> SentO w t
-- | The segments of the sentence.
segs :: SentO w t -> Sent w t
-- | The original, textual representation of the sentence.
orig :: SentO w t -> Text
-- | Map a function over the tags of a sentence with original text.
mapSentO :: Ord b => (a -> b) -> SentO w a -> SentO w b
-- | A set with a non-negative weight assigned to each of its elements.
data WMap a
-- | Map a function over the elements of a weighted collection.
mapWMap :: Ord b => (a -> b) -> WMap a -> WMap b
-- | Make a weighted collection from (element, weight) pairs. Elements
-- with negative weights will be ignored.
mkWMap :: Ord a => [(a, Double)] -> WMap a
instance Show a => Show (WMap a)
instance Eq a => Eq (WMap a)
instance Ord a => Ord (WMap a)
instance (Show w, Show t) => Show (Seg w t)
instance (Show w, Show t) => Show (SentO w t)
instance Word w => Word (Seg w t)
instance FromJSON w => FromJSON (Seg w Text)
instance ToJSON w => ToJSON (Seg w Text)
-- | Morphosyntactic analysis utilities.
--
-- See reAnaSent function for a description of how reanalysis is
-- performed. At some point it would be nice to change the entire process
-- so that sentence-level segmentation is also taken from the reanalysed
-- data.
module NLP.Concraft.Analysis
-- | An analyser performs word-level segmentation and morphological
-- analysis.
type Analyse w t = Text -> IO (Sent w t)
-- | Reanalyse sentence.
--
-- From the reference sentence the function takes:
--
-- - Word-level segmentation
-- - Chosen interpretations (tags)
--
-- From the reanalysed sentence the function takes:
--
-- - Potential interpretations
reAnaSent :: Word w => Tagset -> Analyse w Tag -> SentO w Tag -> IO (Sent w Tag)
-- | Reanalyse paragraph, given as a list of sentences.
reAnaPar :: Word w => Tagset -> Analyse w Tag -> [SentO w Tag] -> IO [Sent w Tag]
-- | Observation schema blocks for Concraft.
module NLP.Concraft.Schema
-- | An observation consists of an index (of list type) and an actual
-- observation value.
type Ob = ([Int], Text)
-- | The Ox monad specialized to word token type and text observations.
type Ox a = Ox Text a
-- | A schema is a block of the Ox computation performed within the context
-- of the sentence and the absolute sentence position.
type Schema w t a = Vector (Seg w t) -> Int -> Ox a
-- | A dummy schema block.
void :: a -> Schema w t a
-- | Sequence the list of schemas (or blocks) and discard individual
-- values.
sequenceS_ :: [Vector (Seg w t) -> a -> Ox b] -> Vector (Seg w t) -> a -> Ox ()
-- | Use the schema to extract observations from the sentence.
schematize :: Schema w t a -> Sent w t -> [[Ob]]
-- | Body of a configuration entry.
data Body a
Body :: [Int] -> Bool -> a -> Body a
-- | Range argument for the schema block.
range :: Body a -> [Int]
-- | When true, the entry is used only for oov words.
oovOnly :: Body a -> Bool
-- | Additional arguments for the schema block.
args :: Body a -> a
-- | Maybe entry.
type Entry a = Maybe (Body a)
-- | Plain entry with no additional arguments.
entry :: [Int] -> Entry ()
-- | Entry with additional arguments.
entryWith :: a -> [Int] -> Entry a
-- | Configuration of the schema. All configuration elements specify the
-- range over which a particular observation type should be taken into
-- account. For example, the [-1, 0, 2] range means that
-- observations of particular type will be extracted with respect to
-- previous (k - 1), current (k) and after the next
-- (k + 2) positions when identifying the observation set for
-- position k in the input sentence.
data SchemaConf
SchemaConf :: Entry () -> Entry () -> Entry [Int] -> Entry [Int] -> Entry () -> Entry () -> Entry () -> Entry () -> SchemaConf
-- | Entry for the orthB schema block.
orthC :: SchemaConf -> Entry ()
-- | Entry for the lowOrthB schema block.
lowOrthC :: SchemaConf -> Entry ()
-- | Entry for the lowPrefixesB schema block. The additional list of
-- ints (the arguments of the entry) represents lengths of prefixes.
lowPrefixesC :: SchemaConf -> Entry [Int]
-- | Entry for the lowSuffixesB schema block. The additional list of
-- ints (the arguments of the entry) represents lengths of suffixes.
lowSuffixesC :: SchemaConf -> Entry [Int]
-- | Entry for the knownB schema block.
knownC :: SchemaConf -> Entry ()
-- | Entry for the shapeB schema block.
shapeC :: SchemaConf -> Entry ()
-- | Entry for the packedB schema block.
packedC :: SchemaConf -> Entry ()
-- | Entry for the begPackedB schema block.
begPackedC :: SchemaConf -> Entry ()
-- | Null configuration of the observation schema.
nullConf :: SchemaConf
-- | Build the schema based on the given configuration.
fromConf :: Word w => SchemaConf -> Schema w t ()
-- | A block is a chunk of the Ox computation performed within the context
-- of the sentence and the list of absolute sentence positions.
type Block w t a = Vector (Seg w t) -> [Int] -> Ox a
-- | Transform a block to a schema depending on:
--
-- - A list of relative sentence positions,
-- - A boolean value; if true, the block computation will be performed
-- only on positions where an OOV word resides.
fromBlock :: Word w => Block w t a -> [Int] -> Bool -> Schema w t a
-- | Orthographic form at the current position.
orthB :: Word w => Block w t ()
-- | Lowercased orthographic form at the current position.
-- NOTE(review): the previous description duplicated that of orthB;
-- "lowercased" is inferred from the name and the lowPrefixesB docs --
-- confirm.
lowOrthB :: Word w => Block w t ()
-- | List of lowercased prefixes of given lengths.
lowPrefixesB :: Word w => [Int] -> Block w t ()
-- | List of lowercased suffixes of given lengths.
lowSuffixesB :: Word w => [Int] -> Block w t ()
-- | NOTE(review): the previous description ("Shape of the word")
-- duplicated that of shapeB. Judging by the name, this block presumably
-- records whether the word is known (i.e. not OOV); confirm against the
-- implementation.
knownB :: Word w => Block w t ()
-- | Shape of the word.
shapeB :: Word w => Block w t ()
-- | Packed shape of the word.
packedB :: Word w => Block w t ()
-- | NOTE(review): the previous description ("Packed shape of the word")
-- duplicated that of packedB. Judging by the name, this block presumably
-- combines the packed shape with beginning-of-sentence information;
-- confirm against the implementation.
begPackedB :: Word w => Block w t ()
instance Show a => Show (Body a)
instance Show SchemaConf
instance Binary SchemaConf
instance Binary a => Binary (Body a)
module NLP.Concraft.Guess
-- | A guessing model.
data Guesser t
Guesser :: SchemaConf -> CRF Ob t -> Guesser t
-- | Observation schema configuration of the guesser.
schemaConf :: Guesser t -> SchemaConf
-- | The CRF component of the guesser.
crf :: Guesser t -> CRF Ob t
-- | Determine k most probable labels for each word in the
-- sentence.
guess :: (Word w, Ord t) => Int -> Guesser t -> Sent w t -> [[t]]
-- | Insert guessing results into the sentence.
include :: (Word w, Ord t) => (Sent w t -> [[t]]) -> Sent w t -> Sent w t
-- | Combine guess with include.
guessSent :: (Word w, Ord t) => Int -> Guesser t -> Sent w t -> Sent w t
-- | Training configuration of the guesser.
data TrainConf
TrainConf :: SchemaConf -> SgdArgs -> Bool -> R0T -> TrainConf
-- | Observation schema configuration.
schemaConfT :: TrainConf -> SchemaConf
-- | SGD parameters.
sgdArgsT :: TrainConf -> SgdArgs
-- | Store SGD dataset on disk
onDiskT :: TrainConf -> Bool
-- | R0 construction method
r0T :: TrainConf -> R0T
-- | Method of constructing the default set of labels (R0). The
-- individual methods are described by the functions referenced below.
data R0T
-- | See anyInterps
AnyInterps :: R0T
-- | See anyChosen
AnyChosen :: R0T
-- | See oovChosen
OovChosen :: R0T
-- | Train guesser. NOTE(review): the two dataset arguments are
-- presumably the training data and the evaluation data, in that order;
-- confirm.
train :: (Word w, Ord t) => TrainConf -> IO [Sent w t] -> IO [Sent w t] -> IO (Guesser t)
instance Typeable R0T
instance Show R0T
instance Eq R0T
instance Ord R0T
instance Enum R0T
instance Data R0T
instance (Ord t, Binary t) => Binary (Guesser t)
module NLP.Concraft.Disamb
-- | A disambiguation model.
data Disamb
Disamb :: [Tier] -> SchemaConf -> CRF Ob Atom -> Disamb
-- | Tier descriptions of the model.
tiers :: Disamb -> [Tier]
-- | Observation schema configuration of the model.
schemaConf :: Disamb -> SchemaConf
-- | The CRF component of the model.
crf :: Disamb -> CRF Ob Atom
-- | A tier description.
data Tier
Tier :: Bool -> Set Attr -> Tier
-- | Does the tier include the part of speech?
withPos :: Tier -> Bool
-- | Grammatical attributes of the tier.
withAtts :: Tier -> Set Attr
-- | An atomic part of morphosyntactic tag with optional POS.
data Atom
Atom :: Maybe POS -> Map Attr Text -> Atom
-- | The optional part of speech.
pos :: Atom -> Maybe POS
-- | Map from attributes to textual values.
atts :: Atom -> Map Attr Text
-- | Perform context-sensitive disambiguation of a sentence.
disamb :: Word w => Disamb -> Sent w Tag -> [Tag]
-- | Insert disambiguation results into the sentence.
include :: (Sent w Tag -> [Tag]) -> Sent w Tag -> Sent w Tag
-- | Combine disamb with include.
disambSent :: Word w => Disamb -> Sent w Tag -> Sent w Tag
-- | Training configuration of the disambiguation model.
data TrainConf
TrainConf :: [Tier] -> SchemaConf -> SgdArgs -> Bool -> Maybe Double -> TrainConf
-- | Tier descriptions.
tiersT :: TrainConf -> [Tier]
-- | Observation schema configuration.
schemaConfT :: TrainConf -> SchemaConf
-- | SGD parameters.
sgdArgsT :: TrainConf -> SgdArgs
-- | NOTE(review): presumably, as in the guesser training configuration,
-- whether to store the SGD dataset on disk; confirm.
onDiskT :: TrainConf -> Bool
-- | NOTE(review): optional numeric parameter, presumably related to
-- model pruning; its exact meaning is not documented here -- confirm.
pruneT :: TrainConf -> Maybe Double
-- | Train disamb model. NOTE(review): the two dataset arguments are
-- presumably the training data and the evaluation data, in that order;
-- confirm.
train :: Word w => TrainConf -> IO [Sent w Tag] -> IO [Sent w Tag] -> IO Disamb
instance Binary Disamb
-- | Accuracy statistics.
module NLP.Concraft.Morphosyntax.Accuracy
-- | Statistics.
data Stats
Stats :: Int -> Int -> Stats
-- | Number of correct tags
good :: Stats -> Int
-- | Number of segments in gold corpus
gold :: Stats -> Int
-- | Accuracy given stats.
accuracy :: Stats -> Double
-- | Accuracy weak lower bound, computed from two lists of segments.
weakLB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats
-- | Accuracy weak upper bound, computed from two lists of segments.
weakUB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats
-- | Accuracy strong lower bound, computed from two lists of segments.
strongLB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats
-- | Accuracy strong upper bound, computed from two lists of segments.
strongUB :: Word w => Tagset -> [Seg w Tag] -> [Seg w Tag] -> Stats
module NLP.Concraft
-- | Concraft data: a tagset, a number, a guessing model and a
-- disambiguation model.
data Concraft
Concraft :: Tagset -> Int -> Guesser Tag -> Disamb -> Concraft
-- | The tagset of the model.
tagset :: Concraft -> Tagset
-- | NOTE(review): presumably the number of labels guessed for each
-- word (the k argument of the guesser); confirm.
guessNum :: Concraft -> Int
-- | The guessing model.
guesser :: Concraft -> Guesser Tag
-- | The disambiguation model.
disamb :: Concraft -> Disamb
-- | Save model in a file. Data is compressed using the gzip format.
saveModel :: FilePath -> Concraft -> IO ()
-- | Load model from a file.
loadModel :: FilePath -> IO Concraft
-- | Tag sentence using the model. In your code you should probably use
-- your analysis function, translate results into a container of
-- Sentences, evaluate tag on each sentence and embed
-- the tagging results into morphosyntactic structure of your own.
tag :: Word w => Concraft -> Sent w Tag -> [Tag]
-- | Train guessing and disambiguation models. No reanalysis will be
-- performed. NOTE(review): the two TrainConf arguments are presumably
-- the guesser and the disambiguation training configurations, in that
-- order; confirm.
train :: (Word w, FromJSON w, ToJSON w) => Tagset -> Int -> TrainConf -> TrainConf -> IO [Sent w Tag] -> IO [Sent w Tag] -> IO Concraft
-- | Train guessing and disambiguation models after dataset reanalysis.
reAnaTrain :: (Word w, FromJSON w, ToJSON w) => Tagset -> Analyse w Tag -> Int -> TrainConf -> TrainConf -> IO [SentO w Tag] -> IO [SentO w Tag] -> IO Concraft
instance Binary Concraft