-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Bigram word pair alignments. -- -- The library provides fast dynamic programming algorithms to align word -- pairs using either a simple or a bigram scoring scheme. @package WordAlignment @version 0.1.0.0 -- | A single word in a language. Uses a MultiChar encoding for -- the actual characters. MultiChar encodings need to be decoded for -- printing on screen. module Linguistics.Word -- | A single word we want to align to another word. It comes with an id -- (here 9), the language name (which we intern), a word class (interned -- as well), the length of the word (so that we don't have to check -- wordWord length and check for word delims), and finally the word -- itself. Individual MultiChar characters are interned to -- reduce memory cost (and we might want to do stuff with the Ids). -- -- 9 Albanian_Tosk 1.214 6 ' b a lʸ t ə data Word Word :: {-# UNPACK #-} !Int -> {-# UNPACK #-} !BTI -> {-# UNPACK #-} !BTI -> {-# UNPACK #-} !Int -> {-# UNPACK #-} !(Vector BTI) -> Word [wordID] :: Word -> {-# UNPACK #-} !Int [wordClass] :: Word -> {-# UNPACK #-} !BTI [wordLang] :: Word -> {-# UNPACK #-} !BTI [wordLength] :: Word -> {-# UNPACK #-} !Int [wordWord] :: Word -> {-# UNPACK #-} !(Vector BTI) parseWord :: ByteString -> Word addWordDelims :: Word -> Word removeWordDelims :: Word -> Word instance GHC.Classes.Ord Linguistics.Word.Word instance GHC.Classes.Eq Linguistics.Word.Word instance GHC.Show.Show Linguistics.Word.Word instance Control.DeepSeq.NFData Linguistics.Word.Word -- | Some common functions and things that are not of immediate importance -- to understand the algorithms. module Linguistics.Common type IMCp = (BTI, BTI) -- | Actually align something prettily alignPretty :: [[IMCp]] -> [String] -- | Prettyprint `characters', which are actually small -- bytestrings. 
printAligned :: IMCp -> [IMCp] -> String -- | Print with special padding character printAlignedPad :: Char -> IMCp -> [IMCp] -> String -- | Length in printed characters of a UTF8 string wrapped as a -- ByteString -- -- NOTE isMark selects Unicode symbols that modify a character, -- thereby not increasing the length of the printed string. printLength :: BTI -> Int toUtf8String :: BTI -> String buildLines :: [[Text]] -> Builder printLines :: Handle -> [[Text]] -> IO () module Linguistics.TwoWay.Simple type SigT m x r = SigGlobal m x r BTI BTI sScore :: Monad m => SimpleScoring -> SigT m Double Double sBacktrack :: Monad m => SigT m (FMList (BTI, BTI)) [FMList (BTI, BTI)] -- | Create a backtracking function -- -- TODO includes scores as well? sBacktrackFun :: Monad m => SigT m (FMList [Text]) [FMList [Text]] alignGlobal :: SimpleScoring -> Int -> Vector BTI -> Vector BTI -> (Double, [[[Text]]]) alignGlobalForward :: SimpleScoring -> Vector BTI -> Vector BTI -> Z :. ITbl Id Unboxed ((Z :. PointL I) :. PointL I) Double alignGlobalBacktrack :: SimpleScoring -> Vector BTI -> Vector BTI -> ITbl Id Unboxed ((Z :. PointL I) :. PointL I) Double -> [[[Text]]] -- | Map between Strings that represent characters and their -- Int-based representation. -- -- NOTE filtering the scores list and creating a single bigram map takes -- about 70 seconds. -- -- NOTE A single bigram map costs around 160 MByte RAM. This includes the -- overhead for actually storing the bigrams once (creating pointers -- instead of multiple copied Bigram data structures). 
module Linguistics.Bigram data Bigram Bigram :: {-# UNPACK #-} !BTI -> {-# UNPACK #-} !BTI -> Bigram [peekChar] :: Bigram -> {-# UNPACK #-} !BTI [hitChar] :: Bigram -> {-# UNPACK #-} !BTI -- | Try to read the first line to figure out if there is a default score -- there withDefault :: Double -> [ByteString] -> (Double, [ByteString]) parseLine :: ByteString -> (BTI, BTI, Bigram, Bigram, Double) type Lang = BTI type Line = (Lang, Lang, Bigram, Bigram, Double) type Scores = HashMap (Bigram :!: Bigram) Double data Mapping Mapping :: !(Map Bigram Bigram) -> !(Map (Lang :!: Lang) Scores) -> Mapping [bigrams] :: Mapping -> !(Map Bigram Bigram) [lliid] :: Mapping -> !(Map (Lang :!: Lang) Scores) lines2mapping :: [Line] -> Mapping emptyMapping :: Mapping mkMapping :: Mapping -> [Line] -> Mapping -- | Given a set of acceptable languages, a default score, and the lazy -- bytestring of scores, create the Mapping of languages and -- scores. generateLookups :: Set BTI -> Double -> ByteString -> Mapping instance GHC.Generics.Selector Linguistics.Bigram.S1_0_1Bigram instance GHC.Generics.Selector Linguistics.Bigram.S1_0_0Bigram instance GHC.Generics.Constructor Linguistics.Bigram.C1_0Bigram instance GHC.Generics.Datatype Linguistics.Bigram.D1Bigram instance GHC.Show.Show Linguistics.Bigram.Mapping instance GHC.Generics.Generic Linguistics.Bigram.Bigram instance GHC.Classes.Ord Linguistics.Bigram.Bigram instance GHC.Classes.Eq Linguistics.Bigram.Bigram instance GHC.Show.Show Linguistics.Bigram.Bigram instance Data.Hashable.Class.Hashable Linguistics.Bigram.Bigram instance Control.DeepSeq.NFData Linguistics.Bigram.Bigram instance Data.Hashable.Class.Hashable (Data.Strict.Tuple.Pair GHC.Types.Int GHC.Types.Int) instance Data.Hashable.Class.Hashable (Data.Strict.Tuple.Pair Linguistics.Bigram.Bigram Linguistics.Bigram.Bigram) module Linguistics.TwoWay.Bigram type IMC = BTI type SigT m x r = SigGlobal m x r IMCp IMCp sScore :: Monad m => Double -> Double -> Scores -> SigT m Double 
Double sBacktrack :: Monad m => SigT m (FMList (IMCp, IMCp)) [FMList (IMCp, IMCp)] sBacktrackFun :: Monad m => Double -> Double -> Scores -> SigT m (FMList [Text]) [FMList [Text]] alignGlobal :: Double -> Double -> Scores -> Int -> Vector IMC -> Vector IMC -> (Double, [[[Text]]]) alignGlobalForward :: Double -> Double -> Scores -> Vector IMCp -> Vector IMCp -> Z :. ITbl Id Unboxed ((Z :. PointL I) :. PointL I) Double alignGlobalBacktrack :: Double -> Double -> Scores -> Vector IMCp -> Vector IMCp -> ITbl Id Unboxed ((Z :. PointL I) :. PointL I) Double -> [[[Text]]]