{- }
  
  Parses a CEDICT dictionary, assuming that it

 +  has been decoded from UTF-8 (or whatever encoding) into character data.
 +  uses UNIX line endings.

 { -}


module Data.Char.CEDICT.Reader.Parser
  ( dictParse
  , parseFile
  , parseLine
  , parseLines
  )
where

import System.IO.UTF8

import Text.ParserCombinators.Parsec

import Prelude hiding
  ( appendFile
  , getContents
  , getLine
  , print
  , putStr
  , putStrLn
  , readFile
  , readLn
  , writeFile
  ) 


-- | Read the file at @path@ with the @readFile@ this module imports from
--   "System.IO.UTF8", then run 'dictParse' over its contents, passing @path@
--   along as the source name.
parseFile                   ::  FilePath
                            ->  IO (Either ParseError (String, [(String, String, String, [String])]))
parseFile path               =  fmap (dictParse path) (readFile path)

-- | Run the whole-dictionary parser 'parseLines' over @text@. @source@ is the
--   source name handed to Parsec's @parse@.
dictParse                   ::  SourceName
                            ->  String
                            ->  Either ParseError (String, [(String, String, String, [String])])
dictParse source text        =  parse parseLines source text

-- | Parse a complete dictionary: the leading comment line (see 'comment')
--   followed by the entries (see 'parseNoComment'). The result pairs the tag
--   extracted from the comment with the list of parsed entries.
parseLines                  ::  CharParser st (String, [(String, String, String, [String])])
parseLines                   =
  comment        >>= \header  ->
  parseNoComment >>= \entries ->
  return (header, entries)

-- | Parse the leading comment line: a @#@, any amount of whitespace, then a
--   tag made of every character up to the first @;@ (the @;@ is consumed but
--   not returned). After the @;@ at least one non-newline character must
--   follow; those characters and the terminating newline are discarded.
--   Returns the tag.
comment                     ::  CharParser st String
comment                      =  do
  _                         <-  char '#'
  spaces
  heading                   <-  anyChar `manyTill` char ';'
  _                         <-  many1 (noneOf "\n")
  _                         <-  newline
  return heading

-- | Parse one or more dictionary entries, each handled by 'parseLine', which
--   are separated -- and optionally terminated -- by newlines.
parseNoComment              ::  CharParser st [(String, String, String, [String])]
parseNoComment               =  parseLine `sepEndBy1` newline

-- | Parse a single dictionary entry and return it as
--   @(traditional, simplified, pinyin, definitions)@.
--
--   The traditional and simplified fields are each read as every character up
--   to the next space; the pinyin field is read by 'pins' and the definitions
--   by 'defs'. Whitespace between fields is skipped.
parseLine                   ::  CharParser st (String, String, String, [String])
parseLine                    =  do
  trad                      <-  field
  simp                      <-  field
  reading                   <-  pins
  spaces
  glosses                   <-  defs
  return (trad, simp, reading, glosses)
 where
  -- Everything up to the next space (which is consumed), followed by any
  -- further run of whitespace.
  field                      =  do
    chars                   <-  manyTill anyChar (char ' ')
    spaces
    return chars


-- | Parse the bracketed pinyin field: a @[@, then every character up to the
--   closing @]@ (which is consumed but not returned). The captured text is
--   passed through 'teutonize' before being returned.
pins                        ::  CharParser st String
pins                         =  fmap teutonize bracketed
 where
  bracketed                  =  char '[' >> manyTill anyChar (char ']')

-- | Replace every occurrence of the two-character sequence @u:@ with a single
--   @ü@, scanning left to right; all other characters are copied unchanged.
teutonize                   ::  String -> String
teutonize pinyin             =  case pinyin of
  'u' : ':' : rest          ->  'ü' : teutonize rest
  c : rest                  ->  c : teutonize rest
  []                        ->  []

-- | Parse the definitions field: a leading @/@ followed by one or more
--   definitions, each handled by 'oneDef'.
defs                        ::  CharParser st [String]
defs                         =  char '/' >> many1 oneDef

-- | Parse a single definition: characters other than newline, up to the next
--   @/@, which is consumed but not included in the result.
oneDef                      ::  CharParser st String
oneDef                       =  noneOf "\n" `manyTill` char '/'

-- | Parse one pinyin syllable: one or more letters or colons followed by a
--   single digit. The digit is appended to the letters in the result.
--
--   Not referenced by any other definition in the visible part of this module.
onePin                      ::  CharParser st String
onePin                       =
  many1 (choice [letter, char ':']) >>= \syllable ->
  digit                             >>= \toneDigit ->
  return (syllable ++ [toneDigit])


 -- | Parse a single Han ideograph, i.e. one character whose code point lies in
 --   one of the CJK ranges listed below.
 --
 --   Membership is decided with range comparisons (at most two per range)
 --   instead of searching a list that enumerates every candidate code point;
 --   the enumerated list made the parser slow. The accepted set of characters
 --   is the same as with the enumerated list.
 --
 --   Not referenced by any other definition in the visible part of this module.
oneHan                      ::  CharParser st Char
oneHan                       =  satisfy isHan
 where
  isHan ch                   =  any (within ch) hanRanges
  within ch (lo, hi)         =  lo <= ch && ch <= hi
  -- Inclusive (first, last) code point of each block; the trailing number is
  -- the Unicode version noted for that block.
  hanRanges                  =
    [ ('\x3400',  '\x4DB5')   --  CJK Unified Ideographs Extension A  3.0
    , ('\x4E00',  '\x9FA5')   --  CJK Unified Ideographs  1.1
    , ('\x9FA6',  '\x9FBB')   --  CJK Unified Ideographs  4.1
    , ('\xF900',  '\xFA2D')   --  CJK Compatibility Ideographs  1.1
    , ('\xFA30',  '\xFA6A')   --  CJK Compatibility Ideographs  3.2
    , ('\xFA70',  '\xFAD9')   --  CJK Compatibility Ideographs  4.1
    , ('\x20000', '\x2A6D6')  --  CJK Unified Ideographs Extension B  3.1
    , ('\x2F800', '\x2FA1D')  --  CJK Compatibility Supplement  3.1
    ]