-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Bigram word pair alignments.
--
-- The library provides fast dynamic programming algorithms to align word
-- pairs using either a simple or a bigram scoring scheme.
@package WordAlignment
@version 0.1.0.0
-- | A single word in a language. Uses a MultiChar encoding for
-- the actual characters. MultiChar encodings need to be decoded for
-- printing on screen.
module Linguistics.Word
-- | A single word we want to align to another word. It comes with an id
-- (here 9), the language name (which we intern), a word class (interned
-- as well), the length of the word (so that we don't have to check
-- wordWord length and check for word delims), and finally the word
-- itself. Individual MultiChar characters are interned to
-- reduce memory cost (and we might want to do stuff with the Id's).
--
-- 9 Albanian_Tosk 1.214 6 ' b a lʸ t ə
--
-- NOTE(review): the selectors are listed as wordID, wordClass, wordLang,
-- i.e. class before language, whereas the example line above shows the
-- language before the class. TODO confirm which positional BTI argument
-- of the constructor is which before building a Word positionally.
data Word
Word :: {-# UNPACK #-} !Int -> {-# UNPACK #-} !BTI -> {-# UNPACK #-} !BTI -> {-# UNPACK #-} !Int -> {-# UNPACK #-} !(Vector BTI) -> Word
-- | The id of the word (9 in the example above).
[wordID] :: Word -> {-# UNPACK #-} !Int
-- | The interned word class.
[wordClass] :: Word -> {-# UNPACK #-} !BTI
-- | The interned language name.
[wordLang] :: Word -> {-# UNPACK #-} !BTI
-- | The length of the word, stored so that the length of wordWord does
-- not have to be checked.
[wordLength] :: Word -> {-# UNPACK #-} !Int
-- | The word itself, as a vector of interned MultiChar characters.
[wordWord] :: Word -> {-# UNPACK #-} !(Vector BTI)
-- | Build a Word from a ByteString.
--
-- The result is a plain Word (no Maybe or Either), so malformed input
-- cannot be reported through the return value.
--
-- NOTE(review): presumably the input is one line in the format shown in
-- the documentation of Word (id, language, class, length, characters);
-- TODO confirm against the implementation.
parseWord :: ByteString -> Word
-- | Map a Word to a Word.
--
-- NOTE(review): judging by the name, this returns the word with word
-- delimiters added (the counterpart of removeWordDelims). Whether
-- wordLength is adjusted as well is not visible here; TODO confirm.
addWordDelims :: Word -> Word
-- | Map a Word to a Word.
--
-- NOTE(review): judging by the name, this returns the word with word
-- delimiters stripped (the counterpart of addWordDelims). Whether
-- wordLength is adjusted as well is not visible here; TODO confirm.
removeWordDelims :: Word -> Word
instance GHC.Classes.Ord Linguistics.Word.Word
instance GHC.Classes.Eq Linguistics.Word.Word
instance GHC.Show.Show Linguistics.Word.Word
instance Control.DeepSeq.NFData Linguistics.Word.Word
-- | Some common functions and things that are not of immediate importance
-- to understand the algorithms.
module Linguistics.Common
-- | A pair of BTI values. Linguistics.TwoWay.Bigram uses this pair type
-- as the element type of its signature and of the input vectors of its
-- forward and backtracking functions.
type IMCp = (BTI, BTI)
-- | Actually align something prettily.
--
-- Consumes a list of IMCp lists and yields a list of Strings.
--
-- NOTE(review): presumably the resulting Strings are padded so that
-- their columns line up when printed below each other; TODO confirm.
alignPretty :: [[IMCp]] -> [String]
-- | Prettyprint `characters', which are actually small
-- bytestrings.
--
-- Takes a single IMCp plus a list of IMCp and renders one String.
--
-- NOTE(review): presumably the list is used to work out how wide the
-- rendering of the single pair has to be padded; TODO confirm.
printAligned :: IMCp -> [IMCp] -> String
-- | Print with special padding character.
--
-- The leading Char is the padding character mentioned above; the
-- remaining arguments and the result have the same types as those of
-- printAligned.
printAlignedPad :: Char -> IMCp -> [IMCp] -> String
-- | Length in printed characters of a UTF-8 string wrapped as a
-- ByteString. The argument type is BTI.
--
-- NOTE isMark selects Unicode symbols that modify a character,
-- thereby not increasing the length of the printed string.
printLength :: BTI -> Int
-- | Convert a BTI into a String.
--
-- NOTE(review): the name suggests the underlying bytes are decoded as
-- UTF-8; how invalid byte sequences are treated is not visible here;
-- TODO confirm.
toUtf8String :: BTI -> String
-- | Turn a list of Text lists into a single Builder.
--
-- NOTE(review): presumably each inner list becomes one output line; how
-- the Text pieces within a line are separated is not visible here;
-- TODO confirm.
buildLines :: [[Text]] -> Builder
-- | IO action over a Handle and a list of Text lists (the same
-- [[Text]] shape that buildLines accepts).
--
-- NOTE(review): presumably writes the lines to the given Handle, likely
-- via buildLines; TODO confirm.
printLines :: Handle -> [[Text]] -> IO ()
module Linguistics.TwoWay.Simple
-- | SigGlobal with its last two type arguments both fixed to BTI; the
-- monad m and the types x and r remain parameters.
type SigT m x r = SigGlobal m x r BTI BTI
-- | A SigT value with both x and r set to Double, parameterised by a
-- SimpleScoring.
--
-- NOTE(review): presumably the scoring algebra used when filling the
-- table in the forward pass; TODO confirm.
sScore :: Monad m => SimpleScoring -> SigT m Double Double
-- | A SigT value whose x is an FMList of (BTI, BTI) pairs and whose r is
-- a list of such FMLists.
--
-- NOTE(review): presumably each pair is one aligned column of the two
-- words; TODO confirm.
sBacktrack :: Monad m => SigT m (FMList (BTI, BTI)) [FMList (BTI, BTI)]
-- | Create a backtracking function.
--
-- Here x is an FMList of Text lists and r is a list of such FMLists.
--
-- TODO includes scores as well?
sBacktrackFun :: Monad m => SigT m (FMList [Text]) [FMList [Text]]
-- | Global alignment of two BTI vectors under a SimpleScoring. Returns a
-- Double together with a [[[Text]]].
--
-- NOTE(review): presumably the Double is the alignment score, the Int
-- argument limits how many backtracked alignments are returned, and each
-- [[Text]] is one alignment; TODO confirm.
alignGlobal :: SimpleScoring -> Int -> Vector BTI -> Vector BTI -> (Double, [[[Text]]])
-- | Forward pass: from a SimpleScoring and two BTI vectors, produce a
-- Double-valued ITbl indexed by two PointL I dimensions, wrapped in
-- Z :. .
alignGlobalForward :: SimpleScoring -> Vector BTI -> Vector BTI -> Z :. ITbl Id Unboxed ((Z :. PointL I) :. PointL I) Double
-- | Backtracking pass: takes a SimpleScoring, two BTI vectors and an ITbl
-- of the type that alignGlobalForward returns (without the Z :. wrapper)
-- and yields a [[[Text]]].
--
-- NOTE(review): presumably the table has to come from alignGlobalForward
-- called with the same scoring and input vectors; TODO confirm.
alignGlobalBacktrack :: SimpleScoring -> Vector BTI -> Vector BTI -> ITbl Id Unboxed ((Z :. PointL I) :. PointL I) Double -> [[[Text]]]
-- | Map between Strings that represent characters and their
-- Int-based representation.
--
-- NOTE filtering the scores list and creating a single bigram map takes
-- about 70 seconds.
--
-- NOTE A single bigram map costs around 160 MByte RAM. This includes the
-- overhead for actually storing the bigrams once (creating pointers
-- instead of multiple copied Bigram data structures).
module Linguistics.Bigram
-- | A bigram: two BTI values, both stored strict and unpacked.
data Bigram
Bigram :: {-# UNPACK #-} !BTI -> {-# UNPACK #-} !BTI -> Bigram
-- | First component of the bigram.
--
-- NOTE(review): presumably the neighbouring (context) character that is
-- only looked at; TODO confirm.
[peekChar] :: Bigram -> {-# UNPACK #-} !BTI
-- | Second component of the bigram.
--
-- NOTE(review): presumably the character actually being aligned;
-- TODO confirm.
[hitChar] :: Bigram -> {-# UNPACK #-} !BTI
-- | Try to read the first line to figure out if there is a default score
-- there.
--
-- Takes a Double and the input lines; returns a Double together with a
-- list of lines.
--
-- NOTE(review): presumably the Double argument is the fallback used when
-- the first line carries no default score, and the returned list omits
-- the first line when it was consumed; TODO confirm.
withDefault :: Double -> [ByteString] -> (Double, [ByteString])
-- | Parse one ByteString into a 5-tuple. The tuple has the same component
-- types as Line (Lang is an alias for BTI).
--
-- The result is a plain tuple (no Maybe or Either), so malformed input
-- cannot be reported through the return value.
parseLine :: ByteString -> (BTI, BTI, Bigram, Bigram, Double)
-- | A language identifier; alias of BTI.
type Lang = BTI
-- | One entry of the input: two languages, two bigrams and a Double.
--
-- NOTE(review): presumably the Double is the score for pairing the two
-- bigrams between the two languages; TODO confirm.
type Line = (Lang, Lang, Bigram, Bigram, Double)
-- | HashMap from a strict pair of Bigrams to a Double.
type Scores = HashMap (Bigram :!: Bigram) Double
-- | Lookup structure consisting of two strict Map fields.
data Mapping
Mapping :: !(Map Bigram Bigram) -> !(Map (Lang :!: Lang) Scores) -> Mapping
-- | Map from Bigram to Bigram.
--
-- NOTE(review): presumably used to keep one shared copy of each Bigram
-- (cf. the module note about storing the bigrams once); TODO confirm.
[bigrams] :: Mapping -> !(Map Bigram Bigram)
-- | One Scores table per strict pair of languages.
[lliid] :: Mapping -> !(Map (Lang :!: Lang) Scores)
-- | Build a Mapping from a list of Lines.
--
-- NOTE(review): presumably equivalent to mkMapping applied to
-- emptyMapping; TODO confirm.
lines2mapping :: [Line] -> Mapping
-- | A constant Mapping value.
--
-- NOTE(review): presumably both maps (bigrams and lliid) are empty;
-- TODO confirm.
emptyMapping :: Mapping
-- | Produce a Mapping from an existing Mapping and a list of Lines.
--
-- NOTE(review): presumably the Lines are inserted into the given
-- Mapping; how duplicate keys are resolved is not visible here;
-- TODO confirm.
mkMapping :: Mapping -> [Line] -> Mapping
-- | Given a set of acceptable languages, a default score, and the lazy
-- bytestring of scores, create the Mapping of languages and
-- scores.
--
-- The result is a plain Mapping (no Maybe or Either), so parse problems
-- in the score data cannot be reported through the return value.
generateLookups :: Set BTI -> Double -> ByteString -> Mapping
instance GHC.Generics.Selector Linguistics.Bigram.S1_0_1Bigram
instance GHC.Generics.Selector Linguistics.Bigram.S1_0_0Bigram
instance GHC.Generics.Constructor Linguistics.Bigram.C1_0Bigram
instance GHC.Generics.Datatype Linguistics.Bigram.D1Bigram
instance GHC.Show.Show Linguistics.Bigram.Mapping
instance GHC.Generics.Generic Linguistics.Bigram.Bigram
instance GHC.Classes.Ord Linguistics.Bigram.Bigram
instance GHC.Classes.Eq Linguistics.Bigram.Bigram
instance GHC.Show.Show Linguistics.Bigram.Bigram
instance Data.Hashable.Class.Hashable Linguistics.Bigram.Bigram
instance Control.DeepSeq.NFData Linguistics.Bigram.Bigram
instance Data.Hashable.Class.Hashable (Data.Strict.Tuple.Pair GHC.Types.Int GHC.Types.Int)
instance Data.Hashable.Class.Hashable (Data.Strict.Tuple.Pair Linguistics.Bigram.Bigram Linguistics.Bigram.Bigram)
module Linguistics.TwoWay.Bigram
-- | Alias of BTI, i.e. the type of one component of an IMCp pair.
type IMC = BTI
-- | SigGlobal with its last two type arguments both fixed to IMCp; the
-- monad m and the types x and r remain parameters.
type SigT m x r = SigGlobal m x r IMCp IMCp
-- | A SigT value with both x and r set to Double, parameterised by two
-- Doubles and a Scores table.
--
-- NOTE(review): the meaning of the two Double parameters is not visible
-- here (presumably a default score and a gap score); TODO confirm.
sScore :: Monad m => Double -> Double -> Scores -> SigT m Double Double
-- | A SigT value whose x is an FMList of (IMCp, IMCp) pairs and whose r
-- is a list of such FMLists.
sBacktrack :: Monad m => SigT m (FMList (IMCp, IMCp)) [FMList (IMCp, IMCp)]
-- | A SigT value whose x is an FMList of Text lists and whose r is a list
-- of such FMLists. It takes the same three leading arguments (two
-- Doubles and a Scores table) as sScore in this module.
sBacktrackFun :: Monad m => Double -> Double -> Scores -> SigT m (FMList [Text]) [FMList [Text]]
-- | Global alignment under a bigram Scores table. Returns a Double
-- together with a [[[Text]]].
--
-- The two input vectors hold IMC values, whereas alignGlobalForward and
-- alignGlobalBacktrack in this module take vectors of IMCp.
--
-- NOTE(review): presumably the IMC vectors are turned into IMCp pairs
-- internally, the returned Double is the alignment score, and the Int
-- limits how many backtracked alignments are returned; TODO confirm.
alignGlobal :: Double -> Double -> Scores -> Int -> Vector IMC -> Vector IMC -> (Double, [[[Text]]])
-- | Forward pass: from two Doubles, a Scores table and two IMCp vectors,
-- produce a Double-valued ITbl indexed by two PointL I dimensions,
-- wrapped in Z :. .
alignGlobalForward :: Double -> Double -> Scores -> Vector IMCp -> Vector IMCp -> Z :. ITbl Id Unboxed ((Z :. PointL I) :. PointL I) Double
-- | Backtracking pass: takes two Doubles, a Scores table, two IMCp
-- vectors and an ITbl of the type that alignGlobalForward returns
-- (without the Z :. wrapper) and yields a [[[Text]]].
--
-- NOTE(review): presumably the table has to come from alignGlobalForward
-- called with the same scores and input vectors; TODO confirm.
alignGlobalBacktrack :: Double -> Double -> Scores -> Vector IMCp -> Vector IMCp -> ITbl Id Unboxed ((Z :. PointL I) :. PointL I) Double -> [[[Text]]]