module Linguistics.Bigram where
import Control.Applicative
import Control.Arrow
import Control.DeepSeq
import Control.Lens
import Data.Attoparsec.ByteString.Lazy ((<?>))
import Data.ByteString (ByteString)
import Data.Function
import Data.Hashable
import Data.Interned
import Data.List
import Data.Strict.Tuple
import GHC.Generics (Generic)
import qualified Data.Attoparsec.ByteString as AB
import qualified Data.Attoparsec.ByteString.Char8 as AB hiding (takeWhile1)
import qualified Data.Attoparsec.ByteString.Lazy as ABL
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL hiding (unpack)
import qualified Data.ByteString.Lazy.Char8 as BL hiding (readFile)
import qualified Data.ByteString.Short as BS
import qualified Data.HashMap.Strict as HM
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import qualified Data.Stringable as SA
import NLP.Text.BTI
data Bigram = Bigram
{ peekChar :: !BTI
, hitChar :: !BTI
}
deriving (Show,Eq,Ord,Generic)
instance Hashable Bigram where
hashWithSalt s (Bigram p h) = hashWithSalt s (p,h)
hash (Bigram p h) = hash (hash p , hash h)
instance NFData Bigram where
rnf !(Bigram a b) = ()
instance Hashable (Pair Int Int) where
hashWithSalt s (a:!:b) = hashWithSalt s (a,b)
withDefault :: Double -> [BL.ByteString] -> (Double,[BL.ByteString])
withDefault d [] = (d,[])
withDefault d (x:xs)
| [(rd,"")] <- readsPrec 0 (BL.unpack x) = (rd,xs)
| otherwise = (d,(x:xs))
parseLine l = case ABL.eitherResult (ABL.parse go l) of
Left err -> error err
Right p -> force p
where
go = (,,,,) <$> lang <*> lang <*> bigram <*> bigram <*> score <?> "go"
lang = wrd <?> "lang"
bigram = Bigram <$> wrd <*> wrd <?> "bigram"
score = AB.double <?> "score"
wrd = SA.fromByteString <$> AB.takeWhile1 (not . AB.isHorizontalSpace) <* AB.space
type Lang = BTI
type Line = (Lang, Lang, Bigram, Bigram, Double)
type Scores = HM.HashMap (Bigram:!:Bigram) Double
data Mapping = Mapping
{ bigrams :: !(M.Map Bigram Bigram)
, lliid :: !(M.Map (Lang:!:Lang) Scores)
}
deriving (Show)
instance Hashable (Pair Bigram Bigram) where
hashWithSalt s (a:!:b) = hashWithSalt s (a,b)
lines2mapping :: [Line] -> Mapping
lines2mapping = foldl' mkMapping emptyMapping . concatMap dupGroup . groupBy ((==) `on` ((^._1) &&& (^._2))) where
dupGroup ls@(l:_)
| l^._1 == l^._2 = [ls]
| otherwise = [ls,ls']
where ls' = map (\(l1,l2,b1,b2,d) -> (l2,l1,b2,b1,d)) . filter (\l -> l^._1 /= l^._2) $ ls
emptyMapping = let b = Bigram "" ""
in Mapping (M.singleton b b) M.empty
mkMapping :: Mapping -> [Line] -> Mapping
mkMapping !m [] = m
mkMapping !(Mapping bs ll) xs@(x:_)
| otherwise = Mapping bs' ll'
where
nom = filter (`M.notMember` bs) $ map (^._3) xs ++ map (^._4) xs
bs' = bs `M.union` (M.fromList $ map (\a -> (a,a)) nom)
ll' = M.insertWith HM.union (x^._1 :!: x^._2) ys ll
ys :: Scores
ys = HM.fromList
[ ((k1:!:k2),d)
| y <- xs
, let k1 = bs' M.! (y^._3)
, let k2 = bs' M.! (y^._4)
, let d = y ^._5
]
generateLookups :: S.Set BTI -> Double -> BL.ByteString -> Mapping
generateLookups langs wd b = lines2mapping xs where
(d,ls) = withDefault wd $ BL.lines b
xs = filter inLangSet $ map parseLine ls
inLangSet l
| S.null langs = True
| (l^._1) `S.member` langs
&& (l^._2) `S.member` langs = True
| otherwise = False