module Data.Chinese.CCDict
( CCDict
, Entry(..)
, load
, parse
, lookup
, ccDict
, Token(..)
, tokenizer
, toTraditional
, toSimplified
) where
import Data.Char
import Data.FileEmbed
import Data.List (foldl', nub, maximumBy)
import Data.Ord
import Data.IntMap (IntMap)
import qualified Data.IntMap.Strict as IntMap
import qualified Data.Map.Strict as M
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import Prelude hiding (lookup)
import Data.Tree
import Data.Chinese.Pinyin
import qualified Data.Chinese.Frequency as Frequency
import Data.Chinese.Frequency hiding (lookup)
data Entry = Entry
{ entrySimplified :: !Text
, entryTraditional :: !Text
, entryPinyin :: [Text]
, entryDefinition :: [[Text]]
} deriving ( Read, Show, Eq, Ord )
type RawEntry = Text
type CCDict = IntMap CCTrieEntry
data CCTrieEntry
= CCTrieEntry !RawEntry !CCDict
| CCTrieEntryEnd !RawEntry
| CCTrieNoEntry !CCDict
deriving ( Show )
load :: FilePath -> IO CCDict
load path = parse `fmap` T.readFile path
parse :: Text -> CCDict
parse txt = fromList
[ (key, line)
| line <- T.lines txt
, Just entry <- [parseLine line]
, key <- nub [entrySimplified entry, entryTraditional entry] ]
lookup :: Text -> CCDict -> Maybe Entry
lookup key trie =
case map ord $ T.unpack key of
[] -> Nothing
(x:xs) -> fmap parseRawEntry (go xs =<< IntMap.lookup x trie)
where
go _ (CCTrieEntryEnd es) = Just es
go [] (CCTrieEntry es _) = Just es
go [] (CCTrieNoEntry _) = Nothing
go (x:xs) (CCTrieEntry es m) = Just (fromMaybe es (go xs =<< IntMap.lookup x m))
go (x:xs) (CCTrieNoEntry m) = (go xs =<< IntMap.lookup x m)
lookupMatches :: Text -> CCDict -> Maybe [Entry]
lookupMatches key trie =
case map ord $ T.unpack key of
[] -> Nothing
(x:xs) ->
case fmap (map parseRawEntry . go xs) (IntMap.lookup x trie) of
Just [] -> Nothing
other -> other
where
go _ (CCTrieEntryEnd e) = [e]
go [] (CCTrieNoEntry _) = []
go [] (CCTrieEntry e _) = [e]
go (x:xs) (CCTrieNoEntry m) = maybe [] (go xs) (IntMap.lookup x m)
go (x:xs) (CCTrieEntry e m) = e : maybe [] (go xs) (IntMap.lookup x m)
lookupNonDet :: Text -> CCDict -> Maybe [[Entry]]
lookupNonDet key trie = do
entries <- lookupMatches key trie
let longest = maximumBy (comparing (T.length . entrySimplified)) entries
if length entries == 1
then return [entries]
else do
return $ whenEmpty [[longest]] $ maybe [] beGreedy $ sequence $ do
entry1 <- entries
case lookupMatches (T.drop (T.length (entrySimplified entry1)) key) trie of
Nothing -> return Nothing
Just entries2 -> do
entry2 <- entries2
return $ Just (entry1, entry2)
where
filterCompact :: [[Entry]] -> [[Entry]]
filterCompact lst =
let mostCompact = minimum (map length lst)
in [ entries | entries <- lst, length entries == mostCompact ]
filterLongest :: [[Entry]] -> [[Entry]]
filterLongest lst =
let len = sum . map (T.length . entrySimplified)
longest = maximum (map len lst)
in [ entries | entries <- lst, len entries == longest ]
beGreedy :: [(Entry,Entry)] -> [[Entry]]
beGreedy lst =
let longestFirst = maximum (map (T.length . entrySimplified . fst) lst)
longest = maximum [ T.length (entrySimplified e1) + T.length (entrySimplified e2)
| (e1,e2) <- lst
, T.length (entrySimplified e1) < longestFirst ]
in filterCompact $ filterLongest $ nub $
[ [e1,e2]
| (e1,e2) <- lst
, T.length (entrySimplified e1) < longest
, T.length (entrySimplified e1) + T.length (entrySimplified e2) /= 2 ] ++
[ [e1]
| (e1,_) <- lst
, T.length (entrySimplified e1) == longest ]
whenEmpty lst [] = lst
whenEmpty _ lst = lst
data Token = KnownWord Entry | UnknownWord Text
deriving ( Read, Show, Eq, Ord )
tokenizer :: CCDict -> Text -> [Token]
tokenizer = tokenizer'
_ppTokenizerTests :: IO ()
_ppTokenizerTests =
case _tokenizer_tests of
[] -> putStrLn "No test failures."
lst -> do
flip mapM_ lst $ \(orig, expected, actual) -> do
T.putStr orig
putStr ": expected: "
T.putStr (T.unwords expected)
putStr ", got: "
T.putStrLn (T.unwords actual)
_tokenizer_tests :: [(Text, [Text], [Text])]
_tokenizer_tests =
[ (input, result, tokens)
| (input, result) <- cases
, let tokens = flat (tokenizer' ccDict input)
, tokens /= result ]
where
cases =
[ ("多工作", ["多","工作"])
, ("有电话", ["有","电话"])
, ("回电话", ["回","电话"])
, ("不知道", ["不","知道"])
, ("定时间", ["定","时间"])
, ("这位子", ["这","位子"])
, ("十分钟", ["十","分钟"])
, ("有电梯", ["有","电梯"])
, ("中午前", ["中午","前"])
, ("外套", ["外套"])
, ("家中餐馆", ["家","中餐馆"])
, ("后生活", ["后","生活"])
, ("不愿意", ["不","愿意"])
, ("点出发", ["点","出发"])
, ("老婆婆", ["老","婆婆"])
, ("不会跳舞", ["不会","跳舞"])
, ("穿上外套", ["穿上","外套"])
, ("建议", ["建议"])
, ("怎么不知道", ["怎么","不","知道"])
, ("蛋糕发起来", ["蛋糕","发","起来"])
, ("管理的人才", ["管理","的","人才"])
, ("轻快乐曲", ["轻快","乐曲"])
, ("高明和", ["高明","和"])
, ("一下子之间", ["一下子","之间"])
, ("我绝没想到", ["我","绝","没想到"])
, ("没想到会", ["没想到","会"]) ]
flat :: [Token] -> [Text]
flat = map worker
where
worker (KnownWord entry) = entrySimplified entry
worker (UnknownWord txt) = txt
type NonDet = Tree [Token]
_ppNonDet :: [NonDet] -> String
_ppNonDet = drawForest . map (fmap (unwords . map ppToken))
where
ppToken (KnownWord entry) = T.unpack (entrySimplified entry)
ppToken (UnknownWord txt) = T.unpack txt
_compactNonDet :: NonDet -> NonDet
_compactNonDet (Node a [Node b rest]) =
_compactNonDet (Node (a++b) rest)
_compactNonDet (Node a rest) =
Node a (map _compactNonDet rest)
collapseNonDet :: [NonDet] -> [Token]
collapseNonDet [] = []
collapseNonDet [Node entries rest] = entries ++ collapseNonDet rest
collapseNonDet (node:nodes) =
case maxBy nodeScore node nodes of
Node entries rest -> entries ++ collapseNonDet rest
where
maxBy fn x xs = maxBy' (fn x) x xs
where
maxBy' _hiScore hiItem [] = hiItem
maxBy' hiScore hiItem (y:ys) =
let score = fn y in
if score > hiScore then maxBy' score y ys else maxBy' hiScore hiItem ys
geoMean :: [Int] -> Integer
geoMean [] = 0
geoMean n = product $ map fromIntegral n
wordCount word = maybe 1 subtlexWCount (Frequency.lookup word subtlex)
entryCount (KnownWord entry) = wordCount (entrySimplified entry)
entryCount UnknownWord{} = 1
nodeSum (Node entries _) = map entryCount entries
nodeScore = geoMean . nodeSum
tokenizer' :: CCDict -> Text -> [Token]
tokenizer' trie inp = compress $ collapseNonDet (tokenizerNondet trie inp)
where
compress [] = []
compress (UnknownWord a:UnknownWord b:xs) = compress (UnknownWord (a `T.append` b):xs)
compress (x:xs) = x:compress xs
tokenizerNondet :: CCDict -> Text -> [NonDet]
tokenizerNondet trie inp = map _compactNonDet $ go inp
where
go txt | T.null txt = []
go txt =
case lookupNonDet txt trie of
Nothing -> do
return $ Node [UnknownWord (T.take 1 txt)] $ go (T.drop 1 txt)
Just es -> do
entries <- es
let len = sum (map (T.length . entrySimplified) entries)
return $ Node (map KnownWord entries) $ go (T.drop len txt)
joinTrie :: CCTrieEntry -> CCTrieEntry -> CCTrieEntry
joinTrie (CCTrieNoEntry t1) (CCTrieNoEntry t2) = CCTrieNoEntry (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieNoEntry t1) (CCTrieEntry e t2) = CCTrieEntry e (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieNoEntry t1) (CCTrieEntryEnd e) = CCTrieEntry e t1
joinTrie (CCTrieEntry e t1) (CCTrieNoEntry t2) = CCTrieEntry e (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieEntry e1 t1) (CCTrieEntry e2 t2) =
CCTrieEntry (joinRawEntry e1 e2) (IntMap.unionWith joinTrie t1 t2)
joinTrie (CCTrieEntry e1 t2) (CCTrieEntryEnd e2) = CCTrieEntry (joinRawEntry e1 e2) t2
joinTrie (CCTrieEntryEnd e) (CCTrieNoEntry t) = CCTrieEntry e t
joinTrie (CCTrieEntryEnd e1) (CCTrieEntry e2 t) = CCTrieEntry (joinRawEntry e1 e2) t
joinTrie (CCTrieEntryEnd e1) (CCTrieEntryEnd e2) = CCTrieEntryEnd (joinRawEntry e1 e2)
joinRawEntry :: RawEntry -> RawEntry -> RawEntry
joinRawEntry e1 e2 = T.concat [e1, "\n", e2]
joinEntry :: Entry -> Entry -> Entry
joinEntry e1 e2 = Entry
{
entrySimplified = entrySimplified e1
, entryTraditional =
if entryTraditional e1 == entrySimplified e1 ||
entryTraditional e2 == entrySimplified e1
then entrySimplified e1
else entryTraditional e1
, entryPinyin = entryPinyin e2 ++ entryPinyin e1
, entryDefinition = entryDefinition e2 ++ entryDefinition e1 }
fromList :: [(Text, RawEntry)] -> CCDict
fromList = foldl' (flip insert) IntMap.empty
insert :: (Text, RawEntry) -> CCDict -> CCDict
insert (key, entry) = go (T.unpack key)
where
go :: [Char] -> CCDict -> CCDict
go [] _ = error "insert: Invalid entry."
go [x] t =
IntMap.insertWith joinTrie (ord x) (CCTrieEntryEnd entry) t
go (x:xs) t =
IntMap.alter (go' xs) (ord x) t
go' xs Nothing = Just $ CCTrieNoEntry (go xs IntMap.empty)
go' xs (Just trie) = Just $
case trie of
CCTrieNoEntry t -> CCTrieNoEntry $ go xs t
CCTrieEntry e t -> CCTrieEntry e $ go xs t
CCTrieEntryEnd e -> CCTrieEntry e $ go xs IntMap.empty
parseRawEntry :: Text -> Entry
parseRawEntry = foldr1 joinEntry . mapMaybe parseLine . T.lines
parseLine :: Text -> Maybe Entry
parseLine line | "#" `T.isPrefixOf` line = Nothing
parseLine line =
Just Entry
{ entrySimplified = simplified
, entryTraditional = traditional
, entryPinyin = [T.unwords $ map toToneMarks $ T.words pinyin]
, entryDefinition = [splitDefinition english] }
where
(traditional, line') = T.breakOn " " line
(simplified, line'') = T.breakOn " " (T.drop 1 line')
(pinyin_, english_) = T.breakOn "/" (T.drop 1 line'')
!english = english_
!pinyin = T.dropAround (\c -> isSpace c || c == '[' || c == ']') pinyin_
splitDefinition :: Text -> [Text]
splitDefinition = filter (not . T.null) . T.splitOn "/" . T.dropAround isSpace
flatMap :: (Entry -> Text) -> [Token] -> Text
flatMap fn = T.concat . map worker
where
worker (KnownWord e) = fn e
worker (UnknownWord txt) = txt
toTraditional :: Text -> Text
toTraditional = flatMap entryTraditional . tokenizer ccDict
toSimplified :: Text -> Text
toSimplified = flatMap entrySimplified . tokenizer ccDict
ccDict :: CCDict
ccDict = parse $ T.decodeUtf8 raw
where
raw = $(embedFile "data/cedict_1_0_ts_utf-8_mdbg.txt")