{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE BangPatterns #-} -- | A single word in a language. Uses a 'MultiChar' encoding for the actual -- characters. MultiChar encodings need to be decoded for printing on screen. module Linguistics.WordAlignment.Word where import Control.Applicative import Control.DeepSeq import Data.Aeson import Data.Attoparsec.ByteString.Lazy (()) import Data.ByteString (ByteString) import Data.Interned import Data.Interned.ByteString import Data.List (intersperse) import GHC.Generics import Prelude hiding (Word) import qualified Data.Attoparsec.ByteString as AB import qualified Data.Attoparsec.ByteString.Char8 as AB hiding (takeWhile1) import qualified Data.Attoparsec.ByteString.Lazy as ABL import qualified Data.ByteString as B import qualified Data.ByteString.Builder as BB import qualified Data.ByteString.Lazy as BL hiding (unpack) import qualified Data.ByteString.Lazy.Char8 as BL hiding (readFile) import qualified Data.ByteString.Short as S import qualified Data.HashMap.Strict as HM import qualified Data.Text as T import qualified Data.Text.Format as TF import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as TLB import qualified Data.Vector as V import qualified Data.Vector.Unboxed as VU import Data.Text.Encoding (encodeUtf8) import NLP.Text.BTI -- | A single word we want to align to another word. It comes with an id (here -- 9), the language name (which we intern), a word class (interned as well), -- the length of the word (so that we don't have to check wordWord length and -- check for word delims), and finally the word itself. Indivitual 'MultiChar' -- characters are interned to reduce memory cost (and we might want to do stuff -- with the Id's). -- -- 9 Albanian_Tosk 1.214 6 \' b a lʸ t ə -- data Word = Word { wordID :: {-# UNPACK #-} !Int , wordClass :: {-# UNPACK #-} !BTI -- InternedByteString , wordLang :: {-# UNPACK #-} !BTI -- InternedByteString , wordLength :: {-# UNPACK #-} !Int , wordWord :: {-# UNPACK #-} !(VU.Vector BTI) } deriving (Show,Eq,Ord,Generic) instance ToJSON Word where toJSON (Word i c lang len w) = object [ "id" .= i , "class" .= c , "lang" .= lang , "word" .= w ] instance NFData Word where rnf !(Word {}) = () parseWord :: BL.ByteString -> Word parseWord w = case ABL.eitherResult (ABL.parse go w) of Left err -> error $ "failed to parse line: " ++ show w ++ " with error: " ++ show err Right p -> force p where go = Word <$> (AB.decimal "number") <* (AB.skipSpace "1+ ws") <*> ((wW <$> wrd) "class") <*> ((wW <$> wrd) "language") <*> (AB.decimal "length") <* (AB.skipSpace "1+ ws") <*> ((VU.fromList . map wW) <$> (AB.takeWhile1 (not . AB.isHorizontalSpace) `AB.sepBy` AB.space) "word") wrd = AB.takeWhile1 (not . AB.isHorizontalSpace) <* AB.skipSpace wW = btiFromCS addWordDelims :: Word -> Word addWordDelims w | VU.length ww >= 2 && VU.head ww == "^" && VU.last ww == "$" = w | otherwise = w { wordWord = "^" `VU.cons` wordWord w `VU.snoc` "$" } where ww = wordWord w removeWordDelims :: Word -> Word removeWordDelims w | VU.length ww >= 2 && VU.head ww == "^" && VU.last ww == "$" = w { wordWord = VU.init . VU.tail $ wordWord w } | otherwise = w where ww = wordWord w -- | Format a 'Word' as a lazy text with interspersed white space. wordLazyTextWS :: Word -> TL.Text wordLazyTextWS = TLB.toLazyText . wordLazyTextWSB {-# Inline wordLazyTextWS #-} -- | A Builder for the actual word in a 'Word'. With intersperse white -- space. wordLazyTextWSB :: Word -> TLB.Builder wordLazyTextWSB = mconcat . intersperse " " . map (TLB.fromText . btiToCS) . VU.toList . wordWord {-# Inline wordLazyTextWSB #-} -- | A builder for an utf8 bytestring. wordUtf8Builder :: Word -> BB.Builder wordUtf8Builder = mconcat . intersperse " " . map (BB.byteString . btiToUtf8) . VU.toList . wordWord {-# Inline wordUtf8Builder #-} -- | -- -- TODO @Builder@ or @Text@ or @Lazy.Text@ ? data FastChars = FastChars { fcTable :: !(HM.HashMap BTI B.ByteString) , fcWidth :: !Int } -- | Generate the fast lookup table fastChars :: Int -> V.Vector Word -> FastChars fastChars width ws = {-# SCC "fastChars" #-} deepseq ws `seq` FastChars hm width where hm = HM.fromList . map fmt . addDefaultChars . concatMap (VU.toList . wordWord) . V.toList $ ws fmt k = (k , encodeUtf8 . TL.toStrict . TLB.toLazyText . TF.left width ' ' . btiToText $ k) addDefaultChars xs = xs ++ map btiFromCS ["-", "$", "^" :: T.Text] {-# NoInline fastChars #-} fastChar :: FastChars -> BTI -> BB.Builder -- TLB.Builder --fastChar (FastChars hm width) k = {-# SCC "fastChar" #-} maybe (BB.byteString . encodeUtf8 . TF.format "{}" . TF.left width ' ' . TF.Only $ btiToCS k) BB.byteString $ HM.lookup k hm fastChar (FastChars hm width) k = {-# SCC "fastChar" #-} maybe encodek BB.byteString $ HM.lookup k hm where encodek = BB.byteString . encodeUtf8 . TL.toStrict . TF.format "{}" . TF.Only . TF.left width ' ' $ (btiToCS k :: T.Text) {-# InlineAble fastChar #-} -- | Word to bigram vector wordToBigramVector :: VU.Vector BTI -> VU.Vector (BTI,BTI) wordToBigramVector m' = let m = "^" `VU.cons` m' `VU.snoc` "$" in VU.zip m (VU.tail m) {-# Inline wordToBigramVector #-}