-- | Map between 'String's that represent characters and their 'Int'-based
-- representation.
--
-- NOTE filtering the scores list and creating a single bigram map takes about
-- 70 seconds.
--
-- NOTE A single bigram map costs around 160 MByte ram. This includes the
-- overhead for actually storing the bigrams once (creating pointers instead of
-- multiple copied 'Bigram' data structures.

module Linguistics.Bigram where

import           Control.Applicative
import           Control.Arrow
import           Control.DeepSeq
import           Control.Lens
import           Data.Attoparsec.ByteString.Lazy ((<?>))
import           Data.ByteString (ByteString)
import           Data.Function
import           Data.Hashable
import           Data.Interned
import           Data.List
import           Data.Strict.Tuple
import           GHC.Generics (Generic)
import qualified Data.Attoparsec.ByteString as AB
import qualified Data.Attoparsec.ByteString.Char8 as AB hiding (takeWhile1)
import qualified Data.Attoparsec.ByteString.Lazy as ABL
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL hiding (unpack)
import qualified Data.ByteString.Lazy.Char8 as BL hiding (readFile)
import qualified Data.ByteString.Short as BS
import qualified Data.HashMap.Strict as HM
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import qualified Data.Stringable as SA

import           NLP.Text.BTI



data Bigram = Bigram
  { peekChar :: {-# UNPACK #-} !BTI
  , hitChar  :: {-# UNPACK #-} !BTI
  }
  deriving (Show,Eq,Ord,Generic)

instance Hashable Bigram where
  hashWithSalt s (Bigram p h) = hashWithSalt s (p,h) -- (uninternMultiChar p, uninternMultiChar h)
  hash           (Bigram p h) = hash (hash p , hash h)
  {-# Inline hashWithSalt #-}
  {-# Inline hash         #-}

instance NFData Bigram where
  rnf !(Bigram a b) = ()

instance Hashable (Pair Int Int) where
  hashWithSalt s (a:!:b) = hashWithSalt s (a,b)
  {-# Inline hashWithSalt #-}

-- | Try to read the first line to figure out if there is a default score there

withDefault :: Double -> [BL.ByteString] -> (Double,[BL.ByteString])
withDefault d [] = (d,[])
withDefault d (x:xs)
  | [(rd,"")] <- readsPrec 0 (BL.unpack x) = (rd,xs)
  | otherwise = (d,(x:xs))

parseLine l = case ABL.eitherResult (ABL.parse go l) of
                Left  err -> error err
                Right p   -> force p
  where
    go     = (,,,,) <$> lang <*> lang <*> bigram <*> bigram <*> score <?> "go"
    lang   = wrd <?> "lang"
    bigram = Bigram <$> wrd <*> wrd <?> "bigram"
    score  = AB.double <?> "score"
    wrd    = SA.fromByteString <$> AB.takeWhile1 (not . AB.isHorizontalSpace) <* AB.space

type Lang = BTI
type Line = (Lang, Lang, Bigram, Bigram, Double)
type Scores = HM.HashMap (Bigram:!:Bigram) Double

data Mapping = Mapping
  { bigrams :: !(M.Map Bigram Bigram)
  , lliid   :: !(M.Map (Lang:!:Lang) Scores)
  }
  deriving (Show)

instance Hashable (Pair Bigram Bigram) where
  hashWithSalt s (a:!:b) = hashWithSalt s (a,b)

lines2mapping :: [Line] -> Mapping
lines2mapping = foldl' mkMapping emptyMapping . concatMap dupGroup . groupBy ((==) `on` ((^._1) &&& (^._2))) where
  dupGroup ls@(l:_)
    | l^._1 == l^._2 = [ls]
    | otherwise      = [ls,ls']
    where ls' = map (\(l1,l2,b1,b2,d) -> (l2,l1,b2,b1,d)) . filter (\l -> l^._1 /= l^._2) $ ls

emptyMapping = let b = Bigram "" ""
               in Mapping (M.singleton b b) M.empty

mkMapping :: Mapping -> [Line] -> Mapping
mkMapping !m [] = m
mkMapping !(Mapping bs ll) xs@(x:_)
  | otherwise = Mapping bs' ll'
  where
    nom = filter (`M.notMember` bs) $ map (^._3) xs ++ map (^._4) xs
    bs' = bs `M.union` (M.fromList $ map (\a -> (a,a)) nom)
    ll' = M.insertWith HM.union (x^._1 :!: x^._2) ys ll
    ys :: Scores
    ys = HM.fromList
           [ ((k1:!:k2),d)
           | y <- xs
           , let k1 = bs' M.! (y^._3)
           , let k2 = bs' M.! (y^._4)
           , let d = y ^._5
           ]

-- | Given a set of acceptable languages, a default score, and the lazy
-- bytestring of scores, create the 'Mapping' of languages and scores.

generateLookups :: S.Set BTI -> Double -> BL.ByteString -> Mapping
generateLookups langs wd b = lines2mapping xs where
  (d,ls) = withDefault wd $ BL.lines b
  xs = filter inLangSet $ map parseLine ls
  inLangSet l
    | S.null langs = True
    |  (l^._1) `S.member` langs
    && (l^._2) `S.member` langs = True
    | otherwise = False