-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Alphabet and word representations -- -- Provides different encodings for characters and words in natural -- language processing. A character will often be encoded as a Unicode -- text string as we deal with multi-symbol characters. -- -- The internal encodings of IMMC symbols are 0-based integers, which allows -- for the use of unboxed containers. -- -- A very simple unigram-based scoring scheme and a DSL to write such -- schemes are also provided. -- -- -- https://github.com/choener/NaturalLanguageAlphabets/blob/master/README.md @package NaturalLanguageAlphabets @version 0.0.2.0 -- | An alphabet, where each character is a short piece of Text. module NLP.Alphabet.MultiChar -- | Interns a MultiChar character. internMultiChar :: MultiChar -> MultiChar -- | Wraps a short piece of Text. The Read and Show instances behave like those for -- normal strings. newtype MultiChar MultiChar :: Text -> MultiChar [getMultiChar] :: MultiChar -> Text -- | Interned MultiChar. -- -- TODO Check the Ord instance. We compare on -- uninternMultiChar. 
data InternedMultiChar InternedMultiChar :: {-# UNPACK #-} !Id -> {-# UNPACK #-} !MultiChar -> InternedMultiChar [internedMultiCharId] :: InternedMultiChar -> {-# UNPACK #-} !Id [uninternMultiChar] :: InternedMultiChar -> {-# UNPACK #-} !MultiChar imcCache :: Cache InternedMultiChar instance GHC.Generics.Selector NLP.Alphabet.MultiChar.S1_0_1InternedMultiChar instance GHC.Generics.Selector NLP.Alphabet.MultiChar.S1_0_0InternedMultiChar instance GHC.Generics.Constructor NLP.Alphabet.MultiChar.C1_0InternedMultiChar instance GHC.Generics.Datatype NLP.Alphabet.MultiChar.D1InternedMultiChar instance GHC.Generics.Selector NLP.Alphabet.MultiChar.S1_0_0MultiChar instance GHC.Generics.Constructor NLP.Alphabet.MultiChar.C1_0MultiChar instance GHC.Generics.Datatype NLP.Alphabet.MultiChar.D1MultiChar instance Data.Hashable.Class.Hashable (Data.Interned.Internal.Description NLP.Alphabet.MultiChar.InternedMultiChar) instance GHC.Classes.Eq (Data.Interned.Internal.Description NLP.Alphabet.MultiChar.InternedMultiChar) instance Data.Data.Data NLP.Alphabet.MultiChar.InternedMultiChar instance GHC.Generics.Generic NLP.Alphabet.MultiChar.InternedMultiChar instance Data.Data.Data NLP.Alphabet.MultiChar.MultiChar instance GHC.Generics.Generic NLP.Alphabet.MultiChar.MultiChar instance GHC.Classes.Ord NLP.Alphabet.MultiChar.MultiChar instance GHC.Classes.Eq NLP.Alphabet.MultiChar.MultiChar instance GHC.Show.Show NLP.Alphabet.MultiChar.MultiChar instance GHC.Read.Read NLP.Alphabet.MultiChar.MultiChar instance Data.Hashable.Class.Hashable NLP.Alphabet.MultiChar.MultiChar instance Data.String.IsString NLP.Alphabet.MultiChar.MultiChar instance Data.Stringable.Stringable NLP.Alphabet.MultiChar.MultiChar instance Control.DeepSeq.NFData NLP.Alphabet.MultiChar.MultiChar instance Data.String.IsString NLP.Alphabet.MultiChar.InternedMultiChar instance GHC.Classes.Eq NLP.Alphabet.MultiChar.InternedMultiChar instance GHC.Classes.Ord NLP.Alphabet.MultiChar.InternedMultiChar instance GHC.Read.Read 
NLP.Alphabet.MultiChar.InternedMultiChar instance GHC.Show.Show NLP.Alphabet.MultiChar.InternedMultiChar instance Data.Hashable.Class.Hashable NLP.Alphabet.MultiChar.InternedMultiChar instance Data.Interned.Internal.Interned NLP.Alphabet.MultiChar.InternedMultiChar instance Data.Stringable.Stringable NLP.Alphabet.MultiChar.InternedMultiChar instance Control.DeepSeq.NFData NLP.Alphabet.MultiChar.InternedMultiChar -- | This module keeps a persistent bimap between -- InternedMultiChars and Ints -- -- TODO make this a bimap Text - Vector. Compare -- performance when printing backtracking results. (Do this after the -- Builder-based backtracking is online) module NLP.Alphabet.IMMC.Internal immcBimap :: IORef (Bimap InternedMultiChar Int) -- | Add InternedMultiChar and return Int key. Will -- return key for existing string and thereby serves for lookup in -- left-to-right direction. immcBimapAdd :: InternedMultiChar -> Int -- | Lookup the InternedMultiChar based on an Int key. -- Unsafe totality assumption. immcBimapLookupInt :: Int -> InternedMultiChar -- | An implementation of Int-mapped MultiChars with -- internalization. 
module NLP.Alphabet.IMMC newtype IMMC IMMC :: Int -> IMMC [getIMMC] :: IMMC -> Int immc :: InternedMultiChar -> IMMC instance Data.Vector.Unboxed.Base.Unbox NLP.Alphabet.IMMC.IMMC instance Data.Vector.Generic.Mutable.Base.MVector Data.Vector.Unboxed.Base.MVector NLP.Alphabet.IMMC.IMMC instance Data.Vector.Generic.Base.Vector Data.Vector.Unboxed.Base.Vector NLP.Alphabet.IMMC.IMMC instance GHC.Classes.Ord NLP.Alphabet.IMMC.IMMC instance Data.String.IsString NLP.Alphabet.IMMC.IMMC instance GHC.Show.Show NLP.Alphabet.IMMC.IMMC instance GHC.Read.Read NLP.Alphabet.IMMC.IMMC instance Data.Hashable.Class.Hashable NLP.Alphabet.IMMC.IMMC instance Data.Stringable.Stringable NLP.Alphabet.IMMC.IMMC instance Control.DeepSeq.NFData NLP.Alphabet.IMMC.IMMC instance Data.Binary.Class.Binary NLP.Alphabet.IMMC.IMMC instance Data.Serialize.Serialize NLP.Alphabet.IMMC.IMMC instance Data.Aeson.Types.Class.FromJSON NLP.Alphabet.IMMC.IMMC instance Data.Aeson.Types.Class.ToJSON NLP.Alphabet.IMMC.IMMC instance GHC.Generics.Selector NLP.Alphabet.IMMC.S1_0_0IMMC instance GHC.Generics.Constructor NLP.Alphabet.IMMC.C1_0IMMC instance GHC.Generics.Datatype NLP.Alphabet.IMMC.D1IMMC instance GHC.Generics.Generic NLP.Alphabet.IMMC.IMMC instance GHC.Classes.Eq NLP.Alphabet.IMMC.IMMC -- | This module defines a simple scoring scheme based on pairs of -- unigrams. module NLP.Scoring.SimpleUnigram -- | Score the IMMC symbols x and y based on the -- simple scoring system: (i) look up (x,y) and use the score if found; -- (ii) if (x,y) is not in the database, then return the default match -- score defMatch if x==y, otherwise return the default -- mismatch score defMismatch. scoreUnigram :: SimpleScoring -> IMMC -> IMMC -> Double -- | Collect the hashtable and scalar values for simple scoring. 
data SimpleScoring SimpleScoring :: !(BasicHashTable (IMMC, IMMC) Double) -> !Double -> !Double -> !Double -> !Double -> !Double -> SimpleScoring [simpleScore] :: SimpleScoring -> !(BasicHashTable (IMMC, IMMC) Double) [gapScore] :: SimpleScoring -> !Double [gapOpen] :: SimpleScoring -> !Double [gapExtend] :: SimpleScoring -> !Double [defMatch] :: SimpleScoring -> !Double [defMismatch] :: SimpleScoring -> !Double instance GHC.Show.Show NLP.Scoring.SimpleUnigram.SimpleScoring module NLP.Scoring.SimpleUnigram.Import -- | Each parsed line gives a set of characters, or tells us a score. -- -- TODO add PLimport which starts a recursive import (note: -- start by storing the hash or whatever of the file to be imported so -- that we can comment on circular imports) data ParsedLine PLset :: Text -> [IMMC] -> ParsedLine PLeq :: Text -> Double -> ParsedLine PLeqset :: Text -> [IMMC] -> ParsedLine PLinset :: Text -> Text -> Double -> ParsedLine PLgap :: Double -> ParsedLine PLgapopen :: Double -> ParsedLine PLgapextend :: Double -> ParsedLine PLdefmatch :: Double -> ParsedLine PLdefmismatch :: Double -> ParsedLine PLcomment :: Text -> ParsedLine -- | Here we simply parse individual lines. parseLine :: Text -> ParsedLine -- | Parses a Text value to create a simple scoring. We don't do much error -- checking; many of the bindings below will easily fail. -- -- TODO obviously: implement error-checking genSimpleScoring :: Text -> SimpleScoring -- | Parse a simple scoring file. simpleScoreFromFile :: FilePath -> IO SimpleScoring instance GHC.Classes.Ord NLP.Scoring.SimpleUnigram.Import.ParsedLine instance GHC.Classes.Eq NLP.Scoring.SimpleUnigram.Import.ParsedLine instance GHC.Show.Show NLP.Scoring.SimpleUnigram.Import.ParsedLine module NLP.Scoring.SimpleUnigram.Default -- | Default simple unigram scores for a system of consonants, liquid -- consonants, and vowels of arbitrary scale. clvDefaults :: SimpleScoring