{-# LANGUAGE OverloadedStrings #-} module CJK.Data.Unihan.Variants ( SemanticVariantType(..), VariantSource, VariantCitation, Variant, compatibilityVariants, zVariants, semanticVariants, specializedSemanticVariants, simplifiedVariants, traditionalVariants ) where import CJK.Utilities import Control.Applicative import qualified Data.Text as Text import qualified Data.Text.Lazy as TextL import Data.Attoparsec.Text import Data.Char import Data.Maybe import qualified Data.Map as M import Data.List import System.IO.Unsafe data SemanticVariantType = T -- T for tòng, U+540C 同. The indicated source explicitly indicates the two are the same (e.g., by saying that the one character is “the same as” the other). | B -- T for bù, U+4E0D 不. The source explicitly indicates that the two are used improperly one for the other. | Z -- T for zhèng, U+6B63 正. The source explicitly indicates that the given character is the preferred form | F -- T for fán, U+7E41 繁. The source explicitly indicates that the given character is the traditional form. | J -- T for jiǎn U+7C21 簡/U+7B80 简. The source explicitly indicates that the given character is the simplified form. deriving (Eq, Ord, Show) type VariantSource = Text.Text type VariantCitation = [(VariantSource, [SemanticVariantType])] type Variant = (Char, VariantCitation) -- | The compatibility decomposition for this ideograph compatibilityVariants :: Char -> [Char] compatibilityVariants c = case variants of VMS mp _ _ _ _ _ -> M.findWithDefault [] c mp -- | A semantic variant is an x- or y-variant with similar or identical meaning which can generally be used in place of the indicated character semanticVariants :: Char -> [Variant] semanticVariants c = case variants of VMS _ mp _ _ _ _ -> M.findWithDefault [] c mp -- | Simplified Chinese variant(s) for this character simplifiedVariants :: Char -> [Char] simplifiedVariants c = case variants of VMS _ _ mp _ _ _ -> M.findWithDefault [] c mp -- | A specialized semantic variant is an x- or y-variant with similar or identical meaning only in certain contexts (such as accountants’ numerals) specializedSemanticVariants :: Char -> [Variant] specializedSemanticVariants c = case variants of VMS _ _ _ mp _ _ -> M.findWithDefault [] c mp -- | Traditional Chinese variant(s) for this character traditionalVariants :: Char -> [Char] traditionalVariants c = case variants of VMS _ _ _ _ mp _ -> M.findWithDefault [] c mp -- | The z-variant(s) for this character zVariants :: Char -> [Variant] zVariants c = case variants of VMS _ _ _ _ _ mp -> M.findWithDefault [] c mp type VariantMap = M.Map Char [Char] type CitedVariantMap = M.Map Char [Variant] data VariantsMap = VMS !VariantMap !CitedVariantMap !VariantMap !CitedVariantMap !VariantMap !CitedVariantMap deriving (Show) -- Useful for debugging in GHCi emptyVariantsMap :: VariantsMap emptyVariantsMap = VMS M.empty M.empty M.empty M.empty M.empty M.empty unionVariantsMap :: VariantsMap -> VariantsMap -> VariantsMap unionVariantsMap (VMS a1 a2 a3 a4 a5 a6) (VMS b1 b2 b3 b4 b5 b6) = VMS (M.unionWith (++) a1 b1) (M.unionWith (++) a2 b2) (M.unionWith (++) a3 b3) (M.unionWith (++) a4 b4) (M.unionWith (++) a5 b5) (M.unionWith (++) a6 b6) {-# NOINLINE contents #-} contents :: TextL.Text contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_Variants.txt") variants :: VariantsMap variants = parseLazy fileP contents fileP :: Parser VariantsMap fileP = fmap (foldl' unionVariantsMap emptyVariantsMap) (lineP `manyTill` endOfInput) lineP :: Parser VariantsMap lineP = do { c <- charP <* skipSpace; dataP <- variantP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator } <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyVariantsMap <|> manyTill skipTrueSpace lineTerminator *> pure emptyVariantsMap "line" variantP :: Char -> Parser (Parser VariantsMap) variantP c = string "kCompatibilityVariant" *> pure (liftA (\x -> VMS (mk x) M.empty M.empty M.empty M.empty M.empty) charsP) <|> string "kSemanticVariant" *> pure (liftA (\x -> VMS M.empty (mk x) M.empty M.empty M.empty M.empty) variantsP) <|> string "kSimplifiedVariant" *> pure (liftA (\x -> VMS M.empty M.empty (mk x) M.empty M.empty M.empty) charsP) <|> string "kSpecializedSemanticVariant" *> pure (liftA (\x -> VMS M.empty M.empty M.empty (mk x) M.empty M.empty) variantsP) <|> string "kTraditionalVariant" *> pure (liftA (\x -> VMS M.empty M.empty M.empty M.empty (mk x) M.empty) charsP) <|> string "kZVariant" *> pure (liftA (\x -> VMS M.empty M.empty M.empty M.empty M.empty (mk x)) variantsP) "variant" where mk x = M.singleton c x charsP :: Parser [Char] charsP = charP `sepBy1` skipTrueSpace variantsP :: Parser [Variant] variantsP = liftA2 (,) charP variantCitationP `sepBy1` skipTrueSpace semanticVariantTypeP :: Parser SemanticVariantType semanticVariantTypeP = char 'T' *> pure T <|> char 'B' *> pure B <|> char 'Z' *> pure Z <|> char 'F' *> pure F <|> char 'J' *> pure J "semantic variant type" variantCitationP :: Parser VariantCitation variantCitationP = char '<' *> (entryP `sepBy1` char ',') <|> pure [] -- Z-variants are commonly uncited "variant citation" where entryP = liftA2 (\which mb_xs -> (which, fromMaybe [] mb_xs)) sourceP (optional (char ':' *> many1 semanticVariantTypeP)) sourceP = takeWhile1 isAlphaNum