{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE CPP               #-}
{-# LANGUAGE BangPatterns      #-}
-- | Simplified Chinese <-> English dictionary with pinyin phonetics.
module Data.Chinese.CCDict
  ( CCDict
  , Entry(..)
  , entryPinyin
  , load
  , parse
  , lookup
  , ccDict
  , Token(..)
  , tokenizer
  ) where

import           Control.Monad       (guard)
import           Data.Char
import           Data.FileEmbed
import           Data.List           (foldl')
import           Data.IntMap (IntMap)
import qualified Data.IntMap.Strict as IntMap
import qualified Data.Map.Strict            as M
import           Data.Maybe
import           Data.Text           (Text)
import qualified Data.Text           as T
import qualified Data.Text.Encoding  as T
import qualified Data.Text.IO        as T

import           Prelude             hiding (lookup)

import Data.Tree

import           Data.Chinese.Pinyin
import           Data.Chinese.Frequency

--------------------------------------------------
-- Dictionary


-- | Dictionary entry
data Entry = Entry
  { entryChinese    :: {-# UNPACK #-} !Text
  , entryPinyinRaw  :: [Text]
  , entryDefinition :: [[Text]]
  } deriving ( Read, Show, Eq, Ord )

entryPinyin :: Entry -> [Text]
entryPinyin = map (T.unwords . map toToneMarks . T.words) . entryPinyinRaw

type CCDict = IntMap CCTrieEntry
data CCTrieEntry
  = CCTrieEntry    {-# UNPACK #-} !Entry !CCDict
  | CCTrieEntryEnd {-# UNPACK #-} !Entry
  | CCTrieNoEntry                        !CCDict
  deriving ( Show )


-- instance Binary CCTrieEntry where
--   put (CCTrieEntry entry rest) = put entry >> put rest
--   get = CCTrieEntry <$> get <*> get

-- | Load dictionary from file.
load :: FilePath -> IO CCDict
load path = parse `fmap` T.readFile path

-- | Load dictionary from unicode text.
parse :: Text -> CCDict
parse txt = fromList [ entry | Just entry <- map parseLine (T.lines txt) ]

-- | O(n). Lookup dictionary entry for a string of simplified chinese.
lookup :: Text -> CCDict -> Maybe Entry
lookup key trie =
    case map ord $ T.unpack key of
      [] -> Nothing
      (x:xs) -> go xs =<< IntMap.lookup x trie
  where
    go _ (CCTrieEntryEnd es) = Just es
    go [] (CCTrieEntry es _) = Just es
    go [] (CCTrieNoEntry _) = Nothing
    go (x:xs) (CCTrieEntry es m) = Just (fromMaybe es (go xs =<< IntMap.lookup x m))
    go (x:xs) (CCTrieNoEntry m) = (go xs =<< IntMap.lookup x m)

lookupMatches :: Text -> CCDict -> Maybe [Entry]
lookupMatches key trie =
    case map ord $ T.unpack key of
      []     -> Nothing
      (x:xs) ->
        case fmap (go xs) (IntMap.lookup x trie) of
          Just [] -> Nothing
          other   -> other
  where
    go _ (CCTrieEntryEnd e) = [e]
    go [] (CCTrieNoEntry _) = []
    go [] (CCTrieEntry e _) = [e]
    go (x:xs) (CCTrieNoEntry m) = maybe [] (go xs) (IntMap.lookup x m)
    go (x:xs) (CCTrieEntry e m) = e : maybe [] (go xs) (IntMap.lookup x m)


-- 点出发
-- [[点,出发],[点出,发]]
-- 出发点
-- [[出发]]
lookupNonDet :: Text -> CCDict -> Maybe [[Entry]]
lookupNonDet key trie = do
  entries <- lookupMatches key trie
  let longest = maximum (map (T.length . entryChinese) entries)
  if longest == 1
    then return [entries]
    else return $ do
      entry <- entries
      let len = T.length (entryChinese entry)

      case lookupMatches (T.drop len key) trie of
        Just rest | len < longest -> do
          next <- rest
          guard (T.length (entryChinese next) + len > longest)
          return [entry, next]
        _nothing -> return [entry]


--------------------------------------------------
-- Tokenizer

data Token = KnownWord Entry | UnknownWord Text
  deriving ( Read, Show, Eq, Ord )

-- Interesting case: 他的话 tokenizes to [他,的话] by both google translate and
-- MDGB. The correct tokenization is [他,的,话]. Not sure if it can be fixed without
-- adding an entry for 他的 in the dictionary.
-- TODO: Mark text inclosed in curly brackets as unknown words.
-- FIXME: 不想 should tokenize to [不,想]
-- FIXME: 那是 should tokenize to [那,是]
-- | Break a string of simplified chinese down to a list of tokens.
tokenizer :: CCDict -> Text -> [Token]
tokenizer = tokenizer'
--tokenizer trie inp = maximumBy (comparing score) (tokenizerNondet trie inp)
-- tokenizer trie inp = filter isValid $ go 0 inp inp
--   where
--     isValid (UnknownWord txt) = not (T.null txt)
--     isValid _ = True
--     go n unrecognied txt
--       | T.null txt = [ unknown ]
--       | otherwise =
--           case lookup txt trie of
--             Nothing -> go (n+1) unrecognied (T.drop 1 txt)
--             Just es ->
--               let rest = T.drop (T.length (entryChinese es)) txt in
--               unknown : KnownWord es : go 0 rest rest
--       where
--         unknown = UnknownWord $ T.take n unrecognied

_tokenizer_tests :: [(Text, [Text], [Text])]
_tokenizer_tests =
    [ (input, result, tokens)
    | (input, result) <- cases
    , let tokens = flat (tokenizer' ccDict input)
    , tokens /= result ]
  where
    cases =
        [ ("多工作", ["多","工作"])
        , ("有电话", ["有","电话"])
      	, ("回电话", ["回","电话"])
      	, ("不知道", ["不","知道"])
      	, ("定时间", ["定","时间"])
      	, ("这位子", ["这","位子"])
      	, ("十分钟", ["十","分钟"])
      	, ("有电梯", ["有","电梯"])
        , ("中午前", ["中午","前"])
        -- , ("得很", ["得","很"]) -- Don't know how to fix this.
        -- , ("家中餐馆", ["家","中餐馆"]) -- tokenizer needs to be more greedy to correctly
                                           -- deal with this input.
      	, ("后生活", ["后","生活"])
      	, ("不愿意", ["不","愿意"])
      	, ("点出发", ["点","出发"])
        , ("不会跳舞", ["不会","跳舞"]) ]

flat :: [Token] -> [Text]
flat tokens = [ entryChinese entry | KnownWord entry <- tokens ]

type NonDet = Tree [Token]

-- ppNonDet :: [NonDet] -> String
-- ppNonDet forest = drawForest (map (fmap (unwords . map ppToken)) forest)
--   where
--     ppToken (KnownWord entry) = T.unpack (entryChinese entry)
--     ppToken (UnknownWord txt) = T.unpack txt

-- compactNonDet :: NonDet -> NonDet
-- compactNonDet (Node a [Node b rest]) =
--   compactNonDet (Node (a++b) rest)
-- compactNonDet (Node a rest) =
--   Node a (map compactNonDet rest)

collapseNonDet :: [NonDet] -> [Token]
collapseNonDet [] = []
collapseNonDet [Node entries rest] = entries ++ collapseNonDet rest
collapseNonDet (node:nodes) =
    case maxBy nodeScore node nodes of
      Node entries rest -> entries ++ collapseNonDet rest
  where
    maxBy fn x xs = maxBy' (fn x) x xs
      where
        maxBy' _hiScore hiItem [] = hiItem
        maxBy' hiScore hiItem (y:ys) =
          let score = fn y in
          if score > hiScore then maxBy' score y ys else maxBy' hiScore hiItem ys
    geoMean :: [Int] -> Double
    geoMean [] = 0
    geoMean n = fromIntegral (product n)**(recip (fromIntegral (length n)))
    -- assocs = [ (node, geoMean (filter (/=0) (nodeSum node)))
    --          | node <- forest ]
    wordCount word = maybe 0 subtlexWCount (M.lookup word subtlex)
    entryCount (KnownWord entry) = wordCount (entryChinese entry)
    entryCount UnknownWord{} = 0
    nodeSum (Node entries _) = map entryCount entries
    nodeScore = geoMean . filter (/=0) . nodeSum

-- Enhanced tokenizer, mixed non-determistic and greedy algorithm
tokenizer' :: CCDict -> Text -> [Token]
tokenizer' trie inp = collapseNonDet (tokenizerNondet trie inp)

tokenizerNondet :: CCDict -> Text -> [NonDet]
tokenizerNondet trie inp = go inp
  where
    go txt | T.null txt = []
    go txt =
      case lookupNonDet txt trie of
        Nothing -> do
          return $ Node [UnknownWord (T.take 1 txt)] $ go (T.drop 1 txt)
        Just es -> do
          entries <- es
          let len = sum (map (T.length . entryChinese) entries)
          return $ Node (map KnownWord entries) $ go (T.drop len txt)

--score :: [Token] -> Double
--score = sum . map fn
--  where
--    fn UnknownWord{} = 0
--    fn (KnownWord entry) | T.length (entryChinese entry) == 1 = 0
--    fn (KnownWord entry) =
--      case M.lookup (entryChinese entry) subtlex of
--        Nothing   -> 0
--        Just freq -> subtlexWMillion freq


--------------------------------------------------
-- Dictionary trie

-- union :: CCDict -> CCDict -> CCDict
-- union = IntMap.unionWith joinTrie

joinTrie :: CCTrieEntry -> CCTrieEntry -> CCTrieEntry
joinTrie (CCTrieNoEntry t1) (CCTrieNoEntry t2) = CCTrieNoEntry (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieNoEntry t1) (CCTrieEntry e t2) = CCTrieEntry e (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieNoEntry t1) (CCTrieEntryEnd e) = CCTrieEntry e t1
joinTrie (CCTrieEntry e t1) (CCTrieNoEntry t2) = CCTrieEntry e (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieEntry e1 t1) (CCTrieEntry e2 t2) =
  CCTrieEntry (joinEntry e1 e2) (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieEntry e1 t2) (CCTrieEntryEnd e2) = CCTrieEntry (joinEntry e1 e2) t2
joinTrie (CCTrieEntryEnd e) (CCTrieNoEntry t)    = CCTrieEntry e t
joinTrie (CCTrieEntryEnd e1) (CCTrieEntry e2 t)  = CCTrieEntry (joinEntry e1 e2) t
joinTrie (CCTrieEntryEnd e1) (CCTrieEntryEnd e2) = CCTrieEntryEnd (joinEntry e1 e2)

joinEntry :: Entry -> Entry -> Entry
joinEntry e1 e2 = Entry
  { entryChinese    = entryChinese e1
  , entryPinyinRaw  = entryPinyinRaw e1 ++ entryPinyinRaw e2
  , entryDefinition = entryDefinition e1 ++ entryDefinition e2 }

-- unions :: [CCDict] -> CCDict
-- unions = foldl' union IntMap.empty

fromList :: [Entry] -> CCDict
-- fromList = unions . map singleton
fromList = foldl' (flip insert) IntMap.empty

insert :: Entry -> CCDict -> CCDict
insert entry = go (T.unpack (entryChinese entry))
  where
    go :: [Char] -> CCDict -> CCDict
    go [] _ = error "insert: Invalid entry."
    go [x] t =
      IntMap.insertWith joinTrie (ord x) (CCTrieEntryEnd entry) t
    go (x:xs) t =
      IntMap.alter (go' xs) (ord x) t
    go' xs Nothing = Just $ CCTrieNoEntry (go xs IntMap.empty)
    go' xs (Just trie) = Just $
      case trie of
        CCTrieNoEntry t -> CCTrieNoEntry $ go xs t
        CCTrieEntry e t -> CCTrieEntry e $ go xs t
        CCTrieEntryEnd e -> CCTrieEntry e $ go xs IntMap.empty

-- singleton :: Entry -> CCDict
-- singleton entry = go (T.unpack (entryChinese entry))
--   where
--     go []     = error "singleton: Invalid entry."
--     go [x]    = IntMap.singleton (ord x) (CCTrieEntryEnd entry)
--     go (x:xs) = IntMap.singleton (ord x) (CCTrieNoEntry (go xs))

parseLine :: Text -> Maybe Entry
parseLine line | "#" `T.isPrefixOf` line = Nothing
parseLine line =
    Just Entry
    { entryChinese    = T.copy simplified
    , entryPinyinRaw  = [pinyin] -- [T.unwords $ map toToneMarks $ T.words pinyin]
    , entryDefinition = [splitDefinition english] }
    -- , entryPinyin     = V.singleton $ T.unwords $ map toToneMarks $ T.words $ T.tail $
    --                       T.init $ T.unwords (pinyin ++ [pin])
    -- , entryDefinition = V.singleton $ splitDefinition (T.unwords english) }
  where
    (_traditional, line') = T.breakOn " " line
    (simplified, line'') = T.breakOn " " (T.drop 1 line')
    (pinyin_, english_) = T.breakOn "/" (T.drop 1 line'')
    !english = T.copy english_
    !pinyin = T.copy $ T.dropAround (\c -> isSpace c || c == '[' || c == ']') pinyin_
    -- firstSep = breakOn " ", breakOn " ", breakOn "/"
    -- (_traditional : chinese : rest) = T.words (T.copy line)
    -- (pinyin, (pin : english)) = break (\word -> T.count "]" word > 0) rest

-- /first/second/third/ -> [first, second, third]
splitDefinition :: Text -> [Text]
splitDefinition = filter (not . T.null) . T.splitOn "/" . T.dropAround isSpace


--------------------------------------------------
-- Embedded dictionary

-- | Embedded dictionary.
ccDict :: CCDict
ccDict = parse $ T.decodeUtf8 raw
  where
    raw = $(embedFile "data/cedict_1_0_ts_utf-8_mdbg.txt")

-- ccDict' :: CCDict
-- ccDict' = decode (BL.fromStrict raw)
--   where
--     raw = $(embedFile "data/cedict_1_0_ts_utf-8_mdbg.txt.binary")