module CJK.Data.Unihan.Readings (
CharDefinition, definition,
OccurrenceCount, IsHDZSubstitution,
mandarinBestEffort, mandarin, hanyuPinlu, hanyuPinyin, xhc1983,
cantonese,
CommonTangCharacter, tang,
hangul, korean,
japaneseKun, japaneseOn,
vietnamese
) where
import qualified CJK.Data.Hangul as Hangul
import qualified CJK.Data.Jyutping as Jyutping
import qualified CJK.Data.KoreanYale as KoreanYale
import qualified CJK.Data.Pinyin as Pinyin
import qualified CJK.Data.QuocNgu as QuocNgu
import CJK.Data.Internal
import CJK.Data.Types
import CJK.Utilities
import Control.Applicative
import qualified Data.Text as Text
import qualified Data.Text.Lazy as TextL
import Data.Attoparsec.Text
import Data.Char
import Data.Maybe
import qualified Data.Map as M
import Data.List
import System.IO.Unsafe
type CharDefinition = Text.Text
type OccurrenceCount = Int
type CommonTangCharacter = Bool
type IsHDZSubstitution = Bool
mandarinBestEffort :: Char -> [Pinyin.Phone]
mandarinBestEffort c = nubBy eq $ map fst (hanyuPinlu c) ++
concatMap snd (xhc1983 c) ++
concatMap snd (hanyuPinyin c) ++
(case mandarin c of Nothing -> []; Just (simp, trad) -> [simp, trad])
where yin1 `eq` yin2 = Text.toLower (Pinyin.sound yin1) == Text.toLower (Pinyin.sound yin2) && Pinyin.tone yin1 == Pinyin.tone yin2
cantonese :: Char -> [Jyutping.Phone]
cantonese c = M.findWithDefault [] c (kCantonese readings)
definition :: Char -> [CharDefinition]
definition c = M.findWithDefault [] c (kDefinition readings)
hangul :: Char -> [Hangul.Phone]
hangul c = M.findWithDefault [] c (kHangul readings)
hanyuPinlu :: Char -> [(Pinyin.Phone, OccurrenceCount)]
hanyuPinlu c = M.findWithDefault [] c (kHanyuPinlu readings)
hanyuPinyin :: Char -> [([HDZEntry], [Pinyin.Phone])]
hanyuPinyin c = M.findWithDefault [] c (kHanyuPinyin readings)
japaneseKun :: Char -> [Text.Text]
japaneseKun c = M.findWithDefault [] c (kJapaneseKun readings)
japaneseOn :: Char -> [Text.Text]
japaneseOn c = M.findWithDefault [] c (kJapaneseOn readings)
korean :: Char -> [KoreanYale.Phone]
korean c = M.findWithDefault [] c (kKorean readings)
mandarin :: Char -> Maybe (Pinyin.Phone, Pinyin.Phone)
mandarin c = M.lookup c (kMandarin readings)
tang :: Char -> [(CommonTangCharacter, Text.Text)]
tang c = M.findWithDefault [] c (kTang readings)
vietnamese :: Char -> [QuocNgu.Phone]
vietnamese c = M.findWithDefault [] c (kVietnamese readings)
xhc1983 :: Char -> [([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone])]
xhc1983 c = M.findWithDefault [] c (kXHC1983 readings)
data ReadingsMap = RMS {
kCantonese :: !(M.Map Char [Jyutping.Phone]),
kDefinition :: !(M.Map Char [CharDefinition]),
kHangul :: !(M.Map Char [Hangul.Phone]),
kHanyuPinlu :: !(M.Map Char [(Pinyin.Phone, OccurrenceCount)]),
kHanyuPinyin :: !(M.Map Char [([HDZEntry], [Pinyin.Phone])]),
kJapaneseKun :: !(M.Map Char [Text.Text]),
kJapaneseOn :: !(M.Map Char [Text.Text]),
kKorean :: !(M.Map Char [KoreanYale.Phone]),
kMandarin :: !(M.Map Char (Pinyin.Phone, Pinyin.Phone)),
kTang :: !(M.Map Char [(CommonTangCharacter, Text.Text)]),
kVietnamese :: !(M.Map Char [QuocNgu.Phone]),
kXHC1983 :: !(M.Map Char [([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone])])
} deriving (Show)
emptyReadingsMap :: ReadingsMap
emptyReadingsMap = RMS M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty
unionReadingsMap :: ReadingsMap -> ReadingsMap -> ReadingsMap
unionReadingsMap (RMS a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12) (RMS b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12)
= RMS (plus a1 b1) (plus a2 b2) (plus a3 b3) (plus a4 b4) (plus a5 b5) (plus a6 b6)
(plus a7 b7) (plus a8 b8) (plus a9 b9) (plus a10 b10) (plus a11 b11) (plus a12 b12)
where plus = M.unionWith (error "unionReadingsMap: impossible")
contents :: TextL.Text
contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_Readings.txt")
readings :: ReadingsMap
readings = parseLazy fileP contents
fileP :: Parser ReadingsMap
fileP = fmap (foldl' unionReadingsMap emptyReadingsMap) (lineP `manyTill` endOfInput)
lineP :: Parser ReadingsMap
lineP = do { c <- charP <* skipSpace; dataP <- readingP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator }
<|> char '#' *> manyTill anyChar lineTerminator *> pure emptyReadingsMap
<|> manyTill skipTrueSpace lineTerminator *> pure emptyReadingsMap
<?> "line"
readingP :: Char -> Parser (Parser ReadingsMap)
readingP c = string "kCantonese" *> pure (liftA (\x -> emptyReadingsMap { kCantonese = mk x }) (jyutpingP `sepBy1` skipTrueSpace))
<|> string "kDefinition" *> pure (liftA (\x -> emptyReadingsMap { kDefinition = mk x }) definitionsP)
<|> string "kHangul" *> pure (liftA (\x -> emptyReadingsMap { kHangul = mk x }) (hangulP `sepBy1` skipTrueSpace))
<|> string "kHanyuPinlu" *> pure (liftA (\x -> emptyReadingsMap { kHanyuPinlu = mk x }) (hanyuPinluP `sepBy1` skipTrueSpace))
<|> string "kHanyuPinyin" *> pure (liftA (\x -> emptyReadingsMap { kHanyuPinyin = mk x }) (hanyuPinyinP `sepBy1` skipTrueSpace))
<|> string "kJapaneseKun" *> pure (liftA (\x -> emptyReadingsMap { kJapaneseKun = mk x }) (takeWhile1 isAsciiUpper `sepBy1` skipTrueSpace))
<|> string "kJapaneseOn" *> pure (liftA (\x -> emptyReadingsMap { kJapaneseOn = mk x }) (takeWhile1 isAsciiUpper `sepBy1` skipTrueSpace))
<|> string "kKorean" *> pure (liftA (\x -> emptyReadingsMap { kKorean = mk x }) (yaleP `sepBy1` skipTrueSpace))
<|> string "kMandarin" *> pure (liftA (\x -> emptyReadingsMap { kMandarin = mk x }) mandarinP)
<|> string "kTang" *> pure (liftA (\x -> emptyReadingsMap { kTang = mk x }) (tangP `sepBy1` skipTrueSpace))
<|> string "kVietnamese" *> pure (liftA (\x -> emptyReadingsMap { kVietnamese = mk x }) (quocNguP `sepBy1` skipTrueSpace))
<|> string "kXHC1983" *> pure (liftA (\x -> emptyReadingsMap { kXHC1983 = mk x }) (xhc1983P `sepBy1` skipTrueSpace))
where mk x = M.singleton c x
definitionsP :: Parser [CharDefinition]
definitionsP = takeWhile1 (\c -> c /= '\r' && c /= '\n' && c /= ';') `sepBy1` (takeWhile1 (== ';') <* skipTrueSpace)
hangulP :: Parser Hangul.Phone
hangulP = liftA Hangul.fromJamos (takeWhile1 (not . isSpace))
hanyuPinluP :: Parser (Pinyin.Phone, OccurrenceCount)
hanyuPinluP = liftA2 (,) tonedPinyinP (char '(' *> decimal <* char ')')
mandarinP :: Parser (Pinyin.Phone, Pinyin.Phone)
mandarinP = liftA2 (\simp mb_trad -> (simp, fromMaybe simp mb_trad)) accentedPinyinP (optional (skipTrueSpace *> accentedPinyinP))
accentedPinyinP :: Parser Pinyin.Phone
accentedPinyinP = liftA Pinyin.fromAccented (takeWhile1 (\c -> not (isSpace c) && c /= ','))
hanyuPinyinP :: Parser ([HDZEntry], [Pinyin.Phone])
hanyuPinyinP = liftA2 (,) (hdzEntryP `sepBy1` char ',') (char ':' *> (accentedPinyinP `sepBy1` char ','))
yaleP :: Parser KoreanYale.Phone
yaleP = takeWhile1 isAsciiUpper
tangP :: Parser (CommonTangCharacter, Text.Text)
tangP = liftA2 (,) (canParse (char '*')) (takeWhile1 (not . isSpace))
quocNguP :: Parser QuocNgu.Phone
quocNguP = takeWhile1 (not . isSpace)
xhc1983P :: Parser ([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone])
xhc1983P = liftA2 (,) (locP `sepBy1` char ',') (char ':' *> (accentedPinyinP `sepBy1` char ','))
where locP = liftA2 (,) hdzEntryP (canParse (char '*'))