{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} -- | Simplified Chinese <-> English dictionary with pinyin phonetics. module Data.Chinese.CCDict ( CCDict , Entry(..) , load , parse , lookup , lookupMatches , ccDict ) where import qualified Data.ByteString as B import Data.Char import Data.IntMap (IntMap) import qualified Data.IntMap.Strict as IntMap import Data.List (foldl', maximumBy, nub) import Data.Maybe import Data.Ord import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.IO as T import Paths_cndict import Prelude hiding (lookup) import System.IO.Unsafe (unsafePerformIO) import Data.Tree import qualified Data.Chinese.Frequency as Frequency import Data.Chinese.Pinyin import Data.Chinese.Frequency hiding (lookup) -------------------------------------------------- -- Dictionary -- | Dictionary entry data Entry = Entry { entrySimplified :: !Text , entryTraditional :: !Text , entryPinyin :: [Text] , entryDefinition :: [[Text]] } deriving ( Read, Show, Eq, Ord ) type RawEntry = Text -- entryPinyin :: Entry -> [Text] -- entryPinyin = map (T.unwords . map toToneMarks . T.words) . entryPinyinRaw type CCDict = IntMap CCTrieEntry data CCTrieEntry = CCTrieEntry {-# UNPACK #-} !RawEntry !CCDict | CCTrieEntryEnd {-# UNPACK #-} !RawEntry | CCTrieNoEntry !CCDict deriving ( Show ) -- instance Binary CCTrieEntry where -- put (CCTrieEntry entry rest) = put entry >> put rest -- get = CCTrieEntry <$> get <*> get -- | Load dictionary from file. load :: FilePath -> IO CCDict load path = parse `fmap` T.readFile path -- | Load dictionary from unicode text. parse :: Text -> CCDict parse txt = fromList [ (key, line) | line <- T.lines txt , Just entry <- [parseLine line] , key <- nub [entrySimplified entry, entryTraditional entry] ] -- | O(n). Lookup dictionary entry for a string of simplified chinese. lookup :: Text -> CCDict -> Maybe Entry lookup key trie = case map ord $ T.unpack key of [] -> Nothing (x:xs) -> fmap parseRawEntry (go xs =<< IntMap.lookup x trie) where go _ (CCTrieEntryEnd es) = Just es go [] (CCTrieEntry es _) = Just es go [] (CCTrieNoEntry _) = Nothing go (x:xs) (CCTrieEntry es m) = Just (fromMaybe es (go xs =<< IntMap.lookup x m)) go (x:xs) (CCTrieNoEntry m) = (go xs =<< IntMap.lookup x m) lookupMatches :: Text -> CCDict -> Maybe [Entry] lookupMatches key trie = case map ord $ T.unpack key of [] -> Nothing (x:xs) -> case fmap (map parseRawEntry . go xs) (IntMap.lookup x trie) of Just [] -> Nothing other -> other where go _ (CCTrieEntryEnd e) = [e] go [] (CCTrieNoEntry _) = [] go [] (CCTrieEntry e _) = [e] go (x:xs) (CCTrieNoEntry m) = maybe [] (go xs) (IntMap.lookup x m) go (x:xs) (CCTrieEntry e m) = e : maybe [] (go xs) (IntMap.lookup x m) -- 点出发 -- [[点,出发],[点出,发]] -- 出发点 -- [[出发]] -- 穿上外套 -- This can be broken up in two ways: 穿 上外 套 and 穿上 外套 -- We want the second, more greedy tokenization. lookupNonDet :: Text -> CCDict -> Maybe [[Entry]] lookupNonDet key trie = do entries <- lookupMatches key trie let longest = maximumBy (comparing (T.length . entrySimplified)) entries if length entries == 1 then return [entries] else do return $ whenEmpty [[longest]] $ maybe [] beGreedy $ sequence $ do entry1 <- entries case lookupMatches (T.drop (T.length (entrySimplified entry1)) key) trie of Nothing -> return Nothing Just entries2 -> do entry2 <- entries2 return $ Just (entry1, entry2) where filterCompact :: [[Entry]] -> [[Entry]] filterCompact lst = let mostCompact = minimum (map length lst) in [ entries | entries <- lst, length entries == mostCompact ] filterLongest :: [[Entry]] -> [[Entry]] filterLongest lst = let len = sum . map (T.length . entrySimplified) longest = maximum (map len lst) in [ entries | entries <- lst, len entries == longest ] beGreedy :: [(Entry,Entry)] -> [[Entry]] beGreedy lst = let longestFirst = maximum (map (T.length . entrySimplified . fst) lst) longest = maximum [ T.length (entrySimplified e1) + T.length (entrySimplified e2) | (e1,e2) <- lst , T.length (entrySimplified e1) < longestFirst ] in filterCompact $ filterLongest $ nub $ [ [e1,e2] | (e1,e2) <- lst , T.length (entrySimplified e1) < longest , T.length (entrySimplified e1) + T.length (entrySimplified e2) /= 2 ] ++ [ [e1] | (e1,_) <- lst , T.length (entrySimplified e1) == longest ] whenEmpty lst [] = lst whenEmpty _ lst = lst -- step Nothing _ = [] -- step (Just [x]) _ = return [x] -- step (Just lst) fn = lst >>= fn -- beGreedy lst = -- let len = sum . map (T.length . entrySimplified) -- longest = maximum (map len lst') -- mostCompact = minimum (map length lst) -- lst' = filter (\x -> length x == mostCompact) lst -- in filter (\x -> len x == longest) lst' -- toMaybe [] = Nothing -- toMaybe lst = Just lst -------------------------------------------------- -- Tokenizer -- Interesting case: 他的话 tokenizes to [他,的话] by both google translate and -- MDGB. The correct tokenization is [他,的,话]. Not sure if it can be fixed without -- adding an entry for 他的 in the dictionary. -- TODO: Mark text inclosed in curly brackets as unknown words. -- FIXME: 不想 should tokenize to [不,想] -- FIXME: 那是 should tokenize to [那,是] -------------------------------------------------- -- Dictionary trie -- union :: CCDict -> CCDict -> CCDict -- union = IntMap.unionWith joinTrie -- joinTrie newValue oldValue joinTrie :: CCTrieEntry -> CCTrieEntry -> CCTrieEntry joinTrie (CCTrieNoEntry t1) (CCTrieNoEntry t2) = CCTrieNoEntry (IntMap.unionWith joinTrie t1 t2) joinTrie (CCTrieNoEntry t1) (CCTrieEntry e t2) = CCTrieEntry e (IntMap.unionWith joinTrie t1 t2) joinTrie (CCTrieNoEntry t1) (CCTrieEntryEnd e) = CCTrieEntry e t1 joinTrie (CCTrieEntry e t1) (CCTrieNoEntry t2) = CCTrieEntry e (IntMap.unionWith joinTrie t1 t2) joinTrie (CCTrieEntry e1 t1) (CCTrieEntry e2 t2) = CCTrieEntry (joinRawEntry e1 e2) (IntMap.unionWith joinTrie t1 t2) joinTrie (CCTrieEntry e1 t2) (CCTrieEntryEnd e2) = CCTrieEntry (joinRawEntry e1 e2) t2 joinTrie (CCTrieEntryEnd e) (CCTrieNoEntry t) = CCTrieEntry e t joinTrie (CCTrieEntryEnd e1) (CCTrieEntry e2 t) = CCTrieEntry (joinRawEntry e1 e2) t joinTrie (CCTrieEntryEnd e1) (CCTrieEntryEnd e2) = CCTrieEntryEnd (joinRawEntry e1 e2) joinRawEntry :: RawEntry -> RawEntry -> RawEntry joinRawEntry e1 e2 = T.concat [e1, "\n", e2] -- joinEntry newValue oldValue joinEntry :: Entry -> Entry -> Entry joinEntry e1 e2 = Entry { -- The simplified characters must be identical entrySimplified = entrySimplified e1 -- 了 maps to two traditional characters: 了 and 瞭. -- In these cases, choose the same as the simplified. , entryTraditional = if entryTraditional e1 == entrySimplified e1 || entryTraditional e2 == entrySimplified e1 then entrySimplified e1 else entryTraditional e1 , entryPinyin = entryPinyin e2 ++ entryPinyin e1 , entryDefinition = entryDefinition e2 ++ entryDefinition e1 } -- unions :: [CCDict] -> CCDict -- unions = foldl' union IntMap.empty fromList :: [(Text, RawEntry)] -> CCDict -- fromList = unions . map singleton fromList = foldl' (flip insert) IntMap.empty insert :: (Text, RawEntry) -> CCDict -> CCDict insert (key, entry) = go (T.unpack key) where go :: [Char] -> CCDict -> CCDict go [] _ = error "insert: Invalid entry." go [x] t = IntMap.insertWith joinTrie (ord x) (CCTrieEntryEnd entry) t go (x:xs) t = IntMap.alter (go' xs) (ord x) t go' xs Nothing = Just $ CCTrieNoEntry (go xs IntMap.empty) go' xs (Just trie) = Just $ case trie of CCTrieNoEntry t -> CCTrieNoEntry $ go xs t CCTrieEntry e t -> CCTrieEntry e $ go xs t CCTrieEntryEnd e -> CCTrieEntry e $ go xs IntMap.empty -- singleton :: Entry -> CCDict -- singleton entry = go (T.unpack (entryChinese entry)) -- where -- go [] = error "singleton: Invalid entry." -- go [x] = IntMap.singleton (ord x) (CCTrieEntryEnd entry) -- go (x:xs) = IntMap.singleton (ord x) (CCTrieNoEntry (go xs)) parseRawEntry :: Text -> Entry parseRawEntry = foldr1 joinEntry . mapMaybe parseLine . T.lines parseLine :: Text -> Maybe Entry parseLine line | "#" `T.isPrefixOf` line = Nothing parseLine line = Just Entry { entrySimplified = simplified , entryTraditional = traditional , entryPinyin = [T.unwords $ map toToneMarks $ T.words pinyin] , entryDefinition = [splitDefinition english] } -- , entryPinyin = V.singleton $ T.unwords $ map toToneMarks $ T.words $ T.tail $ -- T.init $ T.unwords (pinyin ++ [pin]) -- , entryDefinition = V.singleton $ splitDefinition (T.unwords english) } where (traditional, line') = T.breakOn " " line (simplified, line'') = T.breakOn " " (T.drop 1 line') (pinyin_, english_) = T.breakOn "/" (T.drop 1 line'') !english = english_ !pinyin = T.dropAround (\c -> isSpace c || c == '[' || c == ']') pinyin_ -- firstSep = breakOn " ", breakOn " ", breakOn "/" -- (_traditional : chinese : rest) = T.words (T.copy line) -- (pinyin, (pin : english)) = break (\word -> T.count "]" word > 0) rest -- /first/second/third/ -> [first, second, third] splitDefinition :: Text -> [Text] splitDefinition = filter (not . T.null) . T.splitOn "/" . T.dropAround isSpace -------------------------------------------------- -- Embedded dictionary remove :: Text -> CCDict -> CCDict remove = worker . map ord . T.unpack where worker [] dict = dict worker (x:xs) dict = IntMap.update (fn xs) x dict fn xs (CCTrieNoEntry rest) = Just $ CCTrieNoEntry (worker xs rest) fn [] CCTrieEntryEnd{} = Nothing fn _ (CCTrieEntryEnd entry) = Just $ CCTrieEntryEnd entry fn [] (CCTrieEntry _ rest) = Just $ CCTrieNoEntry rest fn xs (CCTrieEntry e rest) = Just $ CCTrieEntry e (worker xs rest) -- | Embedded dictionary. ccDict :: CCDict ccDict = remove "得很" $ remove "那是" $ remove "到了" $ remove "里人" $ remove "多事" $ remove "你我" $ remove "家的" $ parse $ T.decodeUtf8 raw where -- raw = $(embedFile "data/cedict_1_0_ts_utf-8_mdbg.txt") raw = unsafePerformIO $ do path <- getDataFileName "data/cedict_1_0_ts_utf-8_mdbg.txt" B.readFile path -- ccDict' :: CCDict -- ccDict' = decode (BL.fromStrict raw) -- where -- raw = $(embedFile "data/cedict_1_0_ts_utf-8_mdbg.txt.binary")