#!/usr/bin/env runhaskell {- Feed this command a stream of UTF8 chars, and it will look up the - ones it can and just print the ones it can't. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -} import Control.Monad import Data.Char import Data.List import qualified Data.Map as Map import qualified System.IO.UTF8 as UTF8 import qualified Data.Char.CEDICT as CEDICT main = do s <- UTF8.getContents mapM_ (UTF8.putStrLn . handleCodePoint) $ cleanUpLines s cleanUpLines = foldr1 (++) . words nullLookup = [("\NUL", "\NUL")] handleCodePoint char = (++ "\n") $ together $ charLine:datLines where together = concat . intersperse "\n" charLine = (replicate 77 ' ') ++ [char] datLines = if isSpace char then [] else if pinsdefs == nullLookup then [] else format pinsdefs where pinsdefs = if isUnihan char then lookItUp char else nullLookup format pinsdefs = format' pinsdefs [] where format' [] res = reverse res format' ((p, d):t) res = format' t (formed:res) where formed = " " ++ paddedDef ++ paddedPin where paddedPin = padding ++ trunc where (trunc, padding) = trunPad 10 p paddedDef = trunc ++ padding where (trunc, padding) = trunPad 64 d trunPad width str = (trunc, padding) where trunc = take width str padding = replicate (width - length trunc) ' ' lookItUp char = case Map.lookup char CEDICT.singles of Just val -> val Nothing -> nullLookup {- Per - http://www.unicode.org/Public/UNIDATA/Unihan.html#File_Structure - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -} isUnihan char = any (uncurry $ check cp) ranges where cp = (fromEnum char) :: Int check val a b = (a val) && (b val) ranges = [ ((>= 0x3400), (<= 0x4DB5)) -- CJK Unified Ideographs Extension A 3.0 , ((>= 0x4E00), (<= 0x9FA5)) -- CJK Unified Ideographs 1.1 , ((>= 0x9FA6), (<= 0x9FBB)) -- CJK Unified Ideographs 4.1 , ((>= 0xF900), (<= 0xFA2D)) -- CJK Compatibility Ideographs 1.1 , ((>= 0xFA30), (<= 0xFA6A)) -- CJK Compatibility Ideographs 3.2 , ((>= 0xFA70), (<= 0xFAD9)) -- CJK Compatibility Ideographs 4.1 , ((>= 0x20000), (<= 0x2A6D6)) -- CJK Unified Ideographs Extension B 3.1 , ((>= 0x2F800), (<= 0x2FA1D)) -- CJK Compatibility Supplement 3.1 ]