{-| Parses a CEDICT dictionary, assuming that it

  * has been translated from UTF-8 (or whatever) into character data;

  * uses UNIX line-endings.
-}

module Data.Char.CEDICT.Reader.Parser
  ( dictParse
  , parseFile
  , parseLine
  , parseLines
  ) where

import System.IO.UTF8
import Text.ParserCombinators.Parsec
import Prelude hiding
  ( appendFile
  , getContents
  , getLine
  , print
  , putStr
  , putStrLn
  , readFile
  , readLn
  , writeFile
  )

-- | One dictionary entry:
-- @(traditional, simplified, pinyin, definitions)@.
type Entry = (String, String, String, [String])

-- | Read the file at @path@ (with the 'readFile' imported from
-- "System.IO.UTF8"; the Prelude one is hidden) and run 'dictParse' on its
-- contents, using @path@ as the source name for error messages.
parseFile :: FilePath -> IO (Either ParseError (String, [Entry]))
parseFile path = do
  text <- readFile path
  return (dictParse path text)

-- | Run 'parseLines' over the given text.  The first argument is the source
-- name reported in parse errors.
dictParse :: String -> String -> Either ParseError (String, [Entry])
dictParse = parse parseLines

-- | A whole dictionary: one header comment line followed by one or more
-- entry lines.  Yields the header tag and the entries.
parseLines :: GenParser Char st (String, [Entry])
parseLines = do
  tag      <- comment
  contents <- parseNoComment
  return (tag, contents)

-- | The header line: @#@, optional whitespace, a tag running up to (and not
-- including) the first @;@, then at least one further character before the
-- terminating newline.  Only the tag is returned.
comment :: GenParser Char st String
comment = do
  _   <- char '#'
  spaces
  tag <- manyTill anyChar (char ';')
  _   <- many1 (noneOf "\n")
  _   <- newline
  return tag

-- | One or more entry lines, separated (and optionally ended) by newlines.
parseNoComment :: GenParser Char st [Entry]
parseNoComment = sepEndBy1 parseLine newline

-- | A single entry line of the shape
-- @traditional simplified [pinyin] \/def\/def\/...\/@.
-- The terminating newline is not consumed here.
parseLine :: GenParser Char st Entry
parseLine = do
  traditional <- field
  spaces
  simplified  <- field
  spaces
  pinyin      <- pins
  spaces
  definitions <- defs
  return (traditional, simplified, pinyin, definitions)
  where
    -- Everything up to the next space; the space itself is consumed.
    field = manyTill anyChar (char ' ')

-- | The bracketed pinyin field, with every @u:@ rewritten by 'teutonize'.
pins :: GenParser Char st String
pins = do
  _    <- char '['
  text <- manyTill anyChar (char ']')
  return (teutonize text)

-- | Replace each occurrence of @u:@ with a u-umlaut (U+00FC).
--
-- The character is written as a numeric escape so that the literal does not
-- depend on the encoding the source file happens to be saved or read in.
teutonize :: String -> String
teutonize ('u':':':rest) = '\252' : teutonize rest
teutonize (c:rest)       = c : teutonize rest
teutonize []             = []

-- | The definitions field: a leading @\/@ followed by one or more
-- @\/@-terminated definitions.
defs :: GenParser Char st [String]
defs = do
  _ <- char '/'
  many1 oneDef

-- | A single definition: any characters other than newline, up to the next
-- @\/@ (which is consumed and dropped).
oneDef :: GenParser Char st String
oneDef = manyTill (noneOf "\n") (char '/')

-- | One pinyin syllable: one or more letters or colons followed by a single
-- tone digit.  The digit is kept at the end of the result.
onePin :: GenParser Char st String
onePin = do
  phone <- many1 (choice [letter, char ':'])
  tone  <- digit
  return (phone ++ [tone])

-- | A single Han character, i.e. one whose code point lies in any of the
-- ranges listed below.
--
-- An earlier version passed the fully enumerated code points to 'oneOf',
-- which made the parser slow; comparing against the range bounds accepts
-- exactly the same characters without scanning that list for every input
-- character.
oneHan :: GenParser Char st Char
oneHan = satisfy isHan
  where
    isHan ch = any (contains (fromEnum ch)) hanRanges

    contains n (lo, hi) = lo <= n && n <= hi

    hanRanges :: [(Int, Int)]
    hanRanges =
      [ (0x3400,  0x4DB5)   -- CJK Unified Ideographs Extension A   3.0
      , (0x4E00,  0x9FA5)   -- CJK Unified Ideographs               1.1
      , (0x9FA6,  0x9FBB)   -- CJK Unified Ideographs               4.1
      , (0xF900,  0xFA2D)   -- CJK Compatibility Ideographs         1.1
      , (0xFA30,  0xFA6A)   -- CJK Compatibility Ideographs         3.2
      , (0xFA70,  0xFAD9)   -- CJK Compatibility Ideographs         4.1
      , (0x20000, 0x2A6D6)  -- CJK Unified Ideographs Extension B   3.1
      , (0x2F800, 0x2FA1D)  -- CJK Compatibility Supplement         3.1
      ]