module Data.Chinese.CCDict
( CCDict
, Entry(..)
, load
, parse
, lookup
, ccDict
, Token(..)
, tokenizer
) where
import Data.Char
import Data.FileEmbed
import Data.List (foldl', nub)
import Data.IntMap (IntMap)
import qualified Data.IntMap.Strict as IntMap
import qualified Data.Map.Strict as M
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import Prelude hiding (lookup)
import Data.Tree
import Data.Chinese.Pinyin
import Data.Chinese.Frequency
-- | One parsed dictionary entry. An 'Entry' produced by 'lookup' may merge
-- several CC-CEDICT source lines for the same headword (see 'joinEntry').
data Entry = Entry
  { entrySimplified :: !Text    -- ^ Headword in simplified characters.
  , entryTraditional :: !Text   -- ^ Headword in traditional characters.
  , entryPinyin :: [Text]       -- ^ One pinyin reading per source line, with tone marks applied.
  , entryDefinition :: [[Text]] -- ^ One list of English glosses per source line.
  } deriving ( Read, Show, Eq, Ord )
-- | A raw, unparsed CC-CEDICT line. Multiple lines stored under the same
-- key are joined with a newline (see 'joinRawEntry') and re-parsed on
-- demand by 'parseRawEntry'.
type RawEntry = Text

-- | Dictionary trie, one level per character, keyed by the character's
-- 'ord' code point.
type CCDict = IntMap CCTrieEntry

-- | A trie node.
data CCTrieEntry
  = CCTrieEntry !RawEntry !CCDict -- ^ A word ends here and longer words continue below.
  | CCTrieEntryEnd !RawEntry      -- ^ A word ends here; no longer words share this prefix.
  | CCTrieNoEntry !CCDict         -- ^ No word ends here; only a prefix of longer words.
  deriving ( Show )
-- | Read a CC-CEDICT formatted file from disk and build the lookup trie.
load :: FilePath -> IO CCDict
load = fmap parse . T.readFile
-- | Build the dictionary trie from the full text of a CC-CEDICT file.
-- Lines that fail to parse (e.g. comments) are skipped; every entry is
-- indexed under both its simplified and traditional form ('nub' avoids a
-- duplicate insert when the two coincide).
parse :: Text -> CCDict
parse txt = fromList keyedLines
  where
    keyedLines =
      [ (key, rawLine)
      | rawLine <- T.lines txt
      , entry <- maybeToList (parseLine rawLine)
      , key <- nub [entrySimplified entry, entryTraditional entry]
      ]
-- | Longest-match lookup: find the longest dictionary word that is a
-- prefix of @key@, and parse its raw entry. 'Nothing' for an empty key
-- or when no prefix of the key is in the dictionary.
lookup :: Text -> CCDict -> Maybe Entry
lookup key trie =
  case map ord $ T.unpack key of
    [] -> Nothing
    (x:xs) -> fmap parseRawEntry (go xs =<< IntMap.lookup x trie)
  where
    -- Walk the trie along the key, preferring the deepest entry found;
    -- when the longer path yields nothing, fall back to the entry seen
    -- at the current node ('fromMaybe es' below).
    go _ (CCTrieEntryEnd es) = Just es
    go [] (CCTrieEntry es _) = Just es
    go [] (CCTrieNoEntry _) = Nothing
    go (x:xs) (CCTrieEntry es m) = Just (fromMaybe es (go xs =<< IntMap.lookup x m))
    go (x:xs) (CCTrieNoEntry m) = (go xs =<< IntMap.lookup x m)
-- | All dictionary entries whose headword is a prefix of @key@, shortest
-- first. 'Nothing' for an empty key or when no word matches at all.
lookupMatches :: Text -> CCDict -> Maybe [Entry]
lookupMatches key trie =
  case map ord $ T.unpack key of
    [] -> Nothing
    (x:xs) ->
      -- The key's path may exist in the trie without any word ending on
      -- it; normalize that 'Just []' to 'Nothing'.
      case fmap (map parseRawEntry . go xs) (IntMap.lookup x trie) of
        Just [] -> Nothing
        other -> other
  where
    -- Collect the raw entry at every word-ending node along the key's path.
    go _ (CCTrieEntryEnd e) = [e]
    go [] (CCTrieNoEntry _) = []
    go [] (CCTrieEntry e _) = [e]
    go (x:xs) (CCTrieNoEntry m) = maybe [] (go xs) (IntMap.lookup x m)
    go (x:xs) (CCTrieEntry e m) = e : maybe [] (go xs) (IntMap.lookup x m)
-- | Candidate segmentations for the front of @key@: each candidate is one
-- matching word, or one word followed by a second word matched after it.
-- Only the "greediest" candidates are kept — fewest words first, then the
-- greatest number of characters covered. 'Nothing' when nothing matches.
lookupNonDet :: Text -> CCDict -> Maybe [[Entry]]
lookupNonDet key trie = toMaybe $ beGreedy $
  step (lookupMatches key trie) $ \entry1 -> do
    let len = T.length (entrySimplified entry1)
    -- Extend each first word with every word matching directly after it.
    case lookupMatches (T.drop len key) trie of
      Nothing -> return [entry1]
      Just entries -> do
        entry2 <- entries
        return [entry1, entry2]
  where
    -- A unique first match is taken as-is; otherwise expand every
    -- alternative in the list monad.
    step Nothing _ = []
    step (Just [x]) _ = return [x]
    step (Just lst) fn = lst >>= fn
    -- Filter to the most compact candidates (fewest words), then to the
    -- longest of those. NOTE(review): 'maximum' over lst' is safe only
    -- because filtering an empty list never forces it (laziness).
    beGreedy lst =
      let len = sum . map (T.length . entrySimplified)
          longest = maximum (map len lst')
          mostCompact = minimum (map length lst)
          lst' = filter (\x -> length x == mostCompact) lst
      in filter (\x -> len x == longest) lst'
    toMaybe [] = Nothing
    toMaybe lst = Just lst
-- | A tokenizer output element: a dictionary word, or a run of text not
-- found in the dictionary (emitted one character at a time by
-- 'tokenizerNondet').
data Token = KnownWord Entry | UnknownWord Text
  deriving ( Read, Show, Eq, Ord )

-- | Segment Chinese text into dictionary words using word frequencies to
-- pick between alternative segmentations.
tokenizer :: CCDict -> Text -> [Token]
tokenizer = tokenizer'
-- | Print every failing tokenizer regression case (or a success message
-- when there are none) to stdout. Debug helper, not exported.
_ppTokenizerTests :: IO ()
_ppTokenizerTests =
  case _tokenizer_tests of
    [] -> putStrLn "No test failures."
    failures -> mapM_ reportFailure failures
  where
    reportFailure (input, expected, actual) = do
      T.putStr input
      putStr ": expected: "
      T.putStr (T.unwords expected)
      putStr ", got: "
      T.putStrLn (T.unwords actual)
-- | Tokenizer regression suite: for every case whose actual segmentation
-- differs from the expected one, yields (input, expected, actual).
-- An empty list means all cases pass.
_tokenizer_tests :: [(Text, [Text], [Text])]
_tokenizer_tests =
  [ (input, result, tokens)
  | (input, result) <- cases
  , let tokens = flat (tokenizer' ccDict input)
  , tokens /= result ]
  where
    cases =
      [ ("多工作", ["多","工作"])
      , ("有电话", ["有","电话"])
      , ("回电话", ["回","电话"])
      , ("不知道", ["不","知道"])
      , ("定时间", ["定","时间"])
      , ("这位子", ["这","位子"])
      , ("十分钟", ["十","分钟"])
      , ("有电梯", ["有","电梯"])
      , ("中午前", ["中午","前"])
      , ("外套", ["外套"])
      , ("家中餐馆", ["家","中餐馆"])
      , ("后生活", ["后","生活"])
      , ("不愿意", ["不","愿意"])
      , ("点出发", ["点","出发"])
      , ("老婆婆", ["老","婆婆"])
      , ("不会跳舞", ["不会","跳舞"])
      , ("穿上外套", ["穿上","外套"])
      , ("建议", ["建议"])
      , ("高明和", ["高明","和"]) ]
-- | Render tokens back to text: dictionary words by their simplified
-- form, unknown words verbatim.
flat :: [Token] -> [Text]
flat = map tokenText
  where
    tokenText (KnownWord entry) = entrySimplified entry
    tokenText (UnknownWord txt) = txt
-- | A tree of alternative tokenizations: each node carries the tokens of
-- one segment, and its children are the alternatives for the rest of the
-- input.
type NonDet = Tree [Token]

-- | Pretty-print a tokenization forest for debugging.
_ppNonDet :: [NonDet] -> String
_ppNonDet = drawForest . map (fmap (unwords . map ppToken))
  where
    ppToken (KnownWord entry) = T.unpack (entrySimplified entry)
    ppToken (UnknownWord txt) = T.unpack txt
-- | Collapse chains of single-child nodes by concatenating their token
-- lists; branching points are preserved and compacted recursively.
_compactNonDet :: NonDet -> NonDet
_compactNonDet node =
  case node of
    Node xs [Node ys rest] -> _compactNonDet (Node (xs ++ ys) rest)
    Node xs branches -> Node xs (map _compactNonDet branches)
-- | Flatten a forest of alternative tokenizations into one token list by
-- repeatedly committing to the highest-scoring alternative, where a
-- node's score is the product of its words' frequency counts.
collapseNonDet :: [NonDet] -> [Token]
collapseNonDet [] = []
collapseNonDet [Node entries rest] = entries ++ collapseNonDet rest
collapseNonDet (node:nodes) =
  case maxBy nodeScore node nodes of
    Node entries rest -> entries ++ collapseNonDet rest
  where
    -- Maximum by score; the earliest element wins ties (strict '>').
    maxBy fn x xs = maxBy' (fn x) x xs
      where
        maxBy' _hiScore hiItem [] = hiItem
        maxBy' hiScore hiItem (y:ys) =
          let score = fn y in
          if score > hiScore then maxBy' score y ys else maxBy' hiScore hiItem ys
    -- NOTE(review): despite the name, this is the plain product of the
    -- counts (as 'Integer', so it cannot overflow); for alternatives of
    -- equal length it ranks the same way the geometric mean would.
    geoMean :: [Int] -> Integer
    geoMean [] = 0
    geoMean n = product $ map fromIntegral n
    -- Frequency count for a word; words missing from the 'subtlex'
    -- table count as 1 so unknown words neither zero out nor boost a score.
    wordCount word = maybe 1 subtlexWCount (M.lookup word subtlex)
    entryCount (KnownWord entry) = wordCount (entrySimplified entry)
    entryCount UnknownWord{} = 1
    nodeSum (Node entries _) = map entryCount entries
    nodeScore = geoMean . nodeSum
-- | Tokenize by building the forest of alternative segmentations and
-- collapsing it to the best-scoring path.
tokenizer' :: CCDict -> Text -> [Token]
tokenizer' trie = collapseNonDet . tokenizerNondet trie
-- | Build the forest of alternative tokenizations of the whole input.
-- Characters with no dictionary match are consumed one at a time as
-- 'UnknownWord's. Runs in the list monad: one tree per alternative.
tokenizerNondet :: CCDict -> Text -> [NonDet]
tokenizerNondet trie inp = map _compactNonDet $ go inp
  where
    go txt | T.null txt = []
    go txt =
      case lookupNonDet txt trie of
        Nothing -> do
          return $ Node [UnknownWord (T.take 1 txt)] $ go (T.drop 1 txt)
        Just es -> do
          entries <- es
          -- Each alternative consumes as many characters as its words cover.
          let len = sum (map (T.length . entrySimplified) entries)
          return $ Node (map KnownWord entries) $ go (T.drop len txt)
-- | Merge two trie nodes for the same position: raw entries are joined
-- with 'joinRawEntry' (left argument's lines first) and child maps are
-- merged recursively. All nine constructor combinations are covered.
joinTrie :: CCTrieEntry -> CCTrieEntry -> CCTrieEntry
joinTrie (CCTrieNoEntry t1) (CCTrieNoEntry t2) = CCTrieNoEntry (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieNoEntry t1) (CCTrieEntry e t2) = CCTrieEntry e (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieNoEntry t1) (CCTrieEntryEnd e) = CCTrieEntry e t1
joinTrie (CCTrieEntry e t1) (CCTrieNoEntry t2) = CCTrieEntry e (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieEntry e1 t1) (CCTrieEntry e2 t2) =
  CCTrieEntry (joinRawEntry e1 e2) (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieEntry e1 t2) (CCTrieEntryEnd e2) = CCTrieEntry (joinRawEntry e1 e2) t2
joinTrie (CCTrieEntryEnd e) (CCTrieNoEntry t) = CCTrieEntry e t
joinTrie (CCTrieEntryEnd e1) (CCTrieEntry e2 t) = CCTrieEntry (joinRawEntry e1 e2) t
joinTrie (CCTrieEntryEnd e1) (CCTrieEntryEnd e2) = CCTrieEntryEnd (joinRawEntry e1 e2)
-- | Concatenate two raw entries, newline-separated, so that
-- 'parseRawEntry' can later recover the individual source lines.
joinRawEntry :: RawEntry -> RawEntry -> RawEntry
joinRawEntry first second = T.append (T.snoc first '\n') second
-- | Merge two parsed entries for the same headword: the first entry's
-- simplified/traditional forms are kept, and the readings and
-- definitions of both are concatenated in order.
joinEntry :: Entry -> Entry -> Entry
joinEntry primary extra = primary
  { entryPinyin = entryPinyin primary ++ entryPinyin extra
  , entryDefinition = entryDefinition primary ++ entryDefinition extra
  }
-- | Build a trie from @(key, raw line)@ pairs; duplicate keys are merged
-- by 'insert' via 'joinTrie'. Strict left fold keeps the accumulator
-- from building thunks.
fromList :: [(Text, RawEntry)] -> CCDict
fromList pairs = foldl' (\trie pair -> insert pair trie) IntMap.empty pairs
-- | Insert one raw dictionary line under @key@, descending one trie
-- level per character ('ord' code point). Existing nodes are merged with
-- 'joinTrie'.
insert :: (Text, RawEntry) -> CCDict -> CCDict
insert (key, entry) = go (T.unpack key)
  where
    go :: [Char] -> CCDict -> CCDict
    -- An empty key cannot be represented in the trie; 'parse' is
    -- expected never to produce one from a well-formed dictionary line.
    go [] _ = error "insert: Invalid entry."
    -- Last character: record the entry here, merging with any existing node.
    go [x] t =
      IntMap.insertWith joinTrie (ord x) (CCTrieEntryEnd entry) t
    go (x:xs) t =
      IntMap.alter (go' xs) (ord x) t
    -- Create or extend the child for the remaining characters; a leaf
    -- ('CCTrieEntryEnd') grows a child map when a longer word passes through it.
    go' xs Nothing = Just $ CCTrieNoEntry (go xs IntMap.empty)
    go' xs (Just trie) = Just $
      case trie of
        CCTrieNoEntry t -> CCTrieNoEntry $ go xs t
        CCTrieEntry e t -> CCTrieEntry e $ go xs t
        CCTrieEntryEnd e -> CCTrieEntry e $ go xs IntMap.empty
-- | Parse a newline-joined raw entry (see 'joinRawEntry') back into a
-- single merged 'Entry'.
-- NOTE(review): 'foldr1' is partial — this diverges if no line parses.
-- Raw entries are built only from lines 'parse' already accepted, so
-- that should not happen in practice; confirm if callers change.
parseRawEntry :: Text -> Entry
parseRawEntry = foldr1 joinEntry . mapMaybe parseLine . T.lines
-- | Parse a single CC-CEDICT line of the form
-- @traditional simplified [pin1 yin1] /definition 1/definition 2/@.
-- Returns 'Nothing' for comment lines (leading \'#\') and for blank or
-- malformed lines (no headword, or no \'/\'-delimited definition).
-- Previously such lines produced an 'Entry' with an empty headword,
-- which made 'insert' call 'error' while building the trie.
parseLine :: Text -> Maybe Entry
parseLine line
  | "#" `T.isPrefixOf` line = Nothing
    -- Reject blank/malformed input: a well-formed line always has a
    -- non-empty simplified headword and a definition starting at "/".
  | T.null simplified || T.null english = Nothing
  | otherwise =
      Just Entry
        { entrySimplified = simplified
        , entryTraditional = traditional
        , entryPinyin = [T.unwords $ map toToneMarks $ T.words pinyin]
        , entryDefinition = [splitDefinition english] }
  where
    (traditional, line') = T.breakOn " " line
    (simplified, line'') = T.breakOn " " (T.drop 1 line')
    -- 'english' keeps the leading "/" so 'splitDefinition' drops the
    -- resulting empty first piece.
    (pinyin_, english_) = T.breakOn "/" (T.drop 1 line'')
    !english = english_
    -- Strip the surrounding brackets and whitespace from "[pin1 yin1]".
    !pinyin = T.dropAround (\c -> isSpace c || c == '[' || c == ']') pinyin_
-- | Split a @/def 1/def 2/@ field into its individual glosses, dropping
-- surrounding whitespace and the empty pieces produced by the leading
-- and trailing slashes.
splitDefinition :: Text -> [Text]
splitDefinition raw =
  [ gloss
  | gloss <- T.splitOn (T.singleton '/') (T.dropAround isSpace raw)
  , not (T.null gloss)
  ]
-- | The dictionary trie built from the CC-CEDICT data file embedded into
-- the binary at compile time (via Template Haskell's 'embedFile').
-- Parsing happens lazily on first use.
ccDict :: CCDict
ccDict = parse $ T.decodeUtf8 raw
  where
    raw = $(embedFile "data/cedict_1_0_ts_utf-8_mdbg.txt")