{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
-- | This module deals with Chinese characters and Sino-Korean words written in
-- hanja.
module Text.Seonbi.Hanja
    ( -- * Korean mixed-script (國漢文混用) transformation
      HanjaPhoneticization (..)
    , def
    , phoneticizeHanja
      -- * Single character phoneticization
    , phoneticizeHanjaChar
      -- * Word phoneticization
    , HanjaDictionary
    , HanjaWordPhoneticizer
    , phoneticizeHanjaWord
    , phoneticizeHanjaWordWithInitialSoundLaw
    , withDictionary
      -- * Word rendering
    , HanjaWordRenderer
    , hangulOnly
    , hanjaInParentheses
    , hanjaInRuby
      -- * Initial sound law (頭音法則)
    , convertInitialSoundLaw
    , initialSoundLawTable
    , initialSoundLawTable'
    , revertInitialSoundLaw
    ) where

import Prelude hiding (lookup)

import Control.Applicative
import Control.Monad
import Data.Char
import Data.List hiding (lookup)
import Data.Maybe
import Data.Ord (comparing)

import Data.Attoparsec.Text
import Data.Default
import Data.Map.Strict
import Data.Set
import Data.Text hiding (concatMap)

import Text.Seonbi.Hangul
import Text.Seonbi.Html
import Text.Seonbi.Html.Lang
import Text.Seonbi.Html.Preservation
import Text.Seonbi.Html.TagStack (push)
import qualified Text.Seonbi.Trie as Trie
import Text.Seonbi.Unihan.KHangul

-- $setup
-- >>> import qualified Text.Show.Unicode
-- >>> :set -interactive-print=Text.Show.Unicode.uprint

-- | Settings to transform Sino-Korean words written in hanja into hangul
-- letters.
data HanjaPhoneticization = HanjaPhoneticization
    { -- | A function to phoneticize a hanja word.
      -- Use 'phoneticizeHanjaWordWithInitialSoundLaw' for South Korean
      -- orthography, or 'phoneticizeHanjaWord' for North Korean orthography.
      HanjaPhoneticization -> HanjaWordPhoneticizer
phoneticizer :: HanjaWordPhoneticizer
      -- | A function to render a hanja word.  See also 'HanjaWordRenderer'.
    , HanjaPhoneticization -> HanjaWordRenderer
wordRenderer :: HanjaWordRenderer
      -- | A function to render a hanja word which should be disambiguated.
      -- It's used instead of 'wordRenderer' when two or more words in
      -- a text have the same hangul reading but actually are dictinct
      -- each other in hanja characters, e.g., 小數\/素數 (소수).
    , HanjaPhoneticization -> HanjaWordRenderer
homophoneRenderer :: HanjaWordRenderer
      -- | Whether to insert some HTML comments that contain useful information
      -- for debugging into the result.  This does not affect the rendering
      -- of the result HTML, but only the HTML code.
    , HanjaPhoneticization -> Bool
debugComment :: Bool
    }

-- | A function to phoneticize a Sino-Korean (i.e., hanja) word (漢字語)
-- into hangul letters.
-- See also 'phoneticizeHanjaWord', 'phoneticizeHanjaWordWithInitialSoundLaw',
-- and 'withDictionary'.
type HanjaWordPhoneticizer
    = Text  -- ^ A Sino-Korean (i.e., hanja) word (漢字語) to phoneticize.
    -> Text -- ^ Hangul letters that phoneticize the given Sino-Korean word.

-- | A function to render a Sino-Korean (i.e., hanja) word (漢字語).
-- Choose one in 'hangulOnly', 'hanjaInParentheses', and 'hanjaInRuby'.
type HanjaWordRenderer
    = HtmlTagStack
    -- ^ Where rendered HTML entities get interleaved into.
    -> Text
    -- ^ A Sino-Korean (i.e., hanja) word (漢字語) to render.
    -> Text
    -- ^ Hangul letters that phoneticized the Sino-Korean word.
    -> [HtmlEntity]
    -- ^ Rendered HTML entities.

-- | Renders a word in hangul-only, no hanja at all (e.g., @안녕히@).
hangulOnly :: HanjaWordRenderer
hangulOnly :: HanjaWordRenderer
hangulOnly HtmlTagStack
stack Text
_ Text
hangul = [HtmlTagStack -> Text -> HtmlEntity
HtmlCdata HtmlTagStack
stack Text
hangul]

-- | Renders a word in hangul followed by hanja in parentheses
-- (e.g., @안녕(安寧)히@).
hanjaInParentheses :: HanjaWordRenderer
hanjaInParentheses :: HanjaWordRenderer
hanjaInParentheses HtmlTagStack
stack Text
hanja Text
hangul =
    [HtmlTagStack -> Text -> HtmlEntity
HtmlCdata HtmlTagStack
stack (Text -> HtmlEntity) -> Text -> HtmlEntity
forall a b. (a -> b) -> a -> b
$ [Text] -> Text
Data.Text.concat [Text
Item [Text]
hangul, Item [Text]
"(", Text
Item [Text]
hanja, Item [Text]
")"]]

-- | Renders a word in @<ruby>@ tag (e.g.,
-- @\<ruby\>安寧\<rp\>(\<\/rp\>\<rt\>안녕\<\/rt\>\<rp\>)\<\/rp\>\<\/ruby\>히@).
--
-- Please read [Use Cases & Exploratory Approaches for Ruby
-- Markup](https://www.w3.org/TR/ruby-use-cases/) as well for more information.
hanjaInRuby :: HanjaWordRenderer
hanjaInRuby :: HanjaWordRenderer
hanjaInRuby HtmlTagStack
stack Text
hanja Text
hangul =
    [ HtmlTagStack -> HtmlTag -> Text -> HtmlEntity
HtmlStartTag HtmlTagStack
stack HtmlTag
Ruby Text
""
    , HtmlTagStack -> Text -> HtmlEntity
HtmlCdata HtmlTagStack
rubyStack Text
hanja
    , HtmlTagStack -> HtmlTag -> Text -> HtmlEntity
HtmlStartTag HtmlTagStack
rubyStack HtmlTag
RP Text
""
    , HtmlTagStack -> Text -> HtmlEntity
HtmlText (HtmlTag -> HtmlTagStack -> HtmlTagStack
push HtmlTag
RP HtmlTagStack
rubyStack) Text
"("
    , HtmlTagStack -> HtmlTag -> HtmlEntity
HtmlEndTag HtmlTagStack
rubyStack HtmlTag
RP
    , HtmlTagStack -> HtmlTag -> Text -> HtmlEntity
HtmlStartTag HtmlTagStack
rubyStack HtmlTag
RT Text
""
    , HtmlTagStack -> Text -> HtmlEntity
HtmlCdata (HtmlTag -> HtmlTagStack -> HtmlTagStack
push HtmlTag
RT HtmlTagStack
rubyStack) Text
hangul
    , HtmlTagStack -> HtmlTag -> HtmlEntity
HtmlEndTag HtmlTagStack
rubyStack HtmlTag
RT
    , HtmlTagStack -> HtmlTag -> Text -> HtmlEntity
HtmlStartTag HtmlTagStack
rubyStack HtmlTag
RP Text
""
    , HtmlTagStack -> Text -> HtmlEntity
HtmlText (HtmlTag -> HtmlTagStack -> HtmlTagStack
push HtmlTag
RP HtmlTagStack
rubyStack) Text
")"
    , HtmlTagStack -> HtmlTag -> HtmlEntity
HtmlEndTag HtmlTagStack
rubyStack HtmlTag
RP
    , HtmlTagStack -> HtmlTag -> HtmlEntity
HtmlEndTag HtmlTagStack
stack HtmlTag
Ruby
    ]
  where
    rubyStack :: HtmlTagStack
    rubyStack :: HtmlTagStack
rubyStack = HtmlTag -> HtmlTagStack -> HtmlTagStack
push HtmlTag
Ruby HtmlTagStack
stack

instance Default HanjaPhoneticization where
    def :: HanjaPhoneticization
def = HanjaPhoneticization :: HanjaWordPhoneticizer
-> HanjaWordRenderer
-> HanjaWordRenderer
-> Bool
-> HanjaPhoneticization
HanjaPhoneticization
        { phoneticizer :: HanjaWordPhoneticizer
phoneticizer = HanjaWordPhoneticizer
phoneticizeHanjaWordWithInitialSoundLaw
        , wordRenderer :: HanjaWordRenderer
wordRenderer = HanjaWordRenderer
hangulOnly
        , homophoneRenderer :: HanjaWordRenderer
homophoneRenderer = HanjaWordRenderer
hanjaInParentheses
        , debugComment :: Bool
debugComment = Bool
False
        }

-- | Transforms hanja words in the given HTML entities into corresponding
-- hangul words.
phoneticizeHanja
    :: HanjaPhoneticization
    -- ^ Configures the phoneticization details.
    -> [HtmlEntity]
    -- ^ HTML entities (that may contain some hanja words) to phoneticize
    -- all hanja words into corresponding hangul-only words.
    -> [HtmlEntity]
    -- ^ HTML entities that have no hanja words but hangul-only words instead.
phoneticizeHanja :: HanjaPhoneticization -> [HtmlEntity] -> [HtmlEntity]
phoneticizeHanja HanjaPhoneticization { HanjaWordPhoneticizer
phoneticizer :: HanjaWordPhoneticizer
phoneticizer :: HanjaPhoneticization -> HanjaWordPhoneticizer
phoneticizer
                                      , HanjaWordRenderer
wordRenderer :: HanjaWordRenderer
wordRenderer :: HanjaPhoneticization -> HanjaWordRenderer
wordRenderer
                                      , HanjaWordRenderer
homophoneRenderer :: HanjaWordRenderer
homophoneRenderer :: HanjaPhoneticization -> HanjaWordRenderer
homophoneRenderer
                                      , Bool
debugComment :: Bool
debugComment :: HanjaPhoneticization -> Bool
debugComment
                                      }
                 [HtmlEntity]
entities =
    ((Either HtmlEntity (HtmlTagStack, Text, Text) -> [HtmlEntity])
-> [Either HtmlEntity (HtmlTagStack, Text, Text)] -> [HtmlEntity]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
`concatMap` [Either HtmlEntity (HtmlTagStack, Text, Text)]
normalized) ((Either HtmlEntity (HtmlTagStack, Text, Text) -> [HtmlEntity])
 -> [HtmlEntity])
-> (Either HtmlEntity (HtmlTagStack, Text, Text) -> [HtmlEntity])
-> [HtmlEntity]
forall a b. (a -> b) -> a -> b
$ \ case
        Left HtmlEntity
e' ->
            [Item [HtmlEntity]
HtmlEntity
e']
        Right (HtmlTagStack
stack, Text
hanja, Text
hangul) ->
            if Set Text -> Int
forall a. Set a -> Int
Data.Set.size (Set Text -> Text -> Map Text (Set Text) -> Set Text
forall k a. Ord k => a -> k -> Map k a -> a
findWithDefault [] Text
hangul Map Text (Set Text)
frequencyDict) Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1
            then HanjaWordRenderer
homophoneRenderer' HtmlTagStack
stack Text
hanja Text
hangul
            else HanjaWordRenderer
wordRenderer' HtmlTagStack
stack Text
hanja Text
hangul
  where
    frequencyDict :: Map Text (Set Text)
    frequencyDict :: Map Text (Set Text)
frequencyDict = (Set Text -> Set Text -> Set Text)
-> [(Text, Set Text)] -> Map Text (Set Text)
forall k a. Ord k => (a -> a -> a) -> [(k, a)] -> Map k a
Data.Map.Strict.fromListWith
        Set Text -> Set Text -> Set Text
forall a. Ord a => Set a -> Set a -> Set a
Data.Set.union
        [(Text
hangul, [Text
Item (Set Text)
hanja]) | Right (HtmlTagStack
_, Text
hanja, Text
hangul) <- [Either HtmlEntity (HtmlTagStack, Text, Text)]
normalized]
    normalized :: [Either HtmlEntity (HtmlTagStack, Text, Text)]
    normalized :: [Either HtmlEntity (HtmlTagStack, Text, Text)]
normalized = [[Either HtmlEntity (HtmlTagStack, Text, Text)]]
-> [Either HtmlEntity (HtmlTagStack, Text, Text)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
Data.List.concat
        [ case Either HtmlEntity (HtmlTagStack, Text, Text)
e of
            Left HtmlEntity
_ ->
                [Either HtmlEntity (HtmlTagStack, Text, Text)
Item [Either HtmlEntity (HtmlTagStack, Text, Text)]
e]
            Right (HtmlTagStack
stack, Text
hanja, Text
hangul) ->
                let hanjaWords :: [Text]
hanjaWords = Text -> [Text]
splitByDigits Text
hanja
                    hangulWords :: [Text]
hangulWords = Text -> [Text]
splitByDigits Text
hangul
                    hanjaWordsLen :: Int
hanjaWordsLen = [Text] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
Prelude.length [Text]
hanjaWords
                    hangulWordsLen :: Int
hangulWordsLen = [Text] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
Prelude.length [Text]
hangulWords
                in
                    if Int
hanjaWordsLen Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= Int
hangulWordsLen
                    then [Either HtmlEntity (HtmlTagStack, Text, Text)
Item [Either HtmlEntity (HtmlTagStack, Text, Text)]
e]
                    else
                        [ if (Char -> Bool) -> Text -> Bool
Data.Text.any Char -> Bool
isDigit Text
hanj
                          then HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. a -> Either a b
Left (HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text))
-> HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. (a -> b) -> a -> b
$ HtmlTagStack -> Text -> HtmlEntity
HtmlText HtmlTagStack
stack Text
hanj
                          else (HtmlTagStack, Text, Text)
-> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. b -> Either a b
Right (HtmlTagStack
stack, Text
hanj, Text
hang)
                        | (Text
hanj, Text
hang) <- [Text] -> [Text] -> [(Text, Text)]
forall a b. [a] -> [b] -> [(a, b)]
Prelude.zip [Text]
hanjaWords [Text]
hangulWords
                        ]
        | Either HtmlEntity (HtmlTagStack, Text, Text)
e <- (LangHtmlEntity -> [Either HtmlEntity (HtmlTagStack, Text, Text)])
-> [LangHtmlEntity]
-> [Either HtmlEntity (HtmlTagStack, Text, Text)]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap LangHtmlEntity -> [Either HtmlEntity (HtmlTagStack, Text, Text)]
transform ([HtmlEntity] -> [LangHtmlEntity]
annotateWithLang [HtmlEntity]
entities)
        ]
    splitByDigits :: Text -> [Text]
    splitByDigits :: Text -> [Text]
splitByDigits = (Char -> Char -> Bool) -> Text -> [Text]
Data.Text.groupBy (\ Char
a Char
b -> Char -> Bool
isDigit Char
a Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Bool
isDigit Char
b)
    transform :: LangHtmlEntity
              -> [Either HtmlEntity (HtmlTagStack, Text, Text)]
    transform :: LangHtmlEntity -> [Either HtmlEntity (HtmlTagStack, Text, Text)]
transform LangHtmlEntity
                { lang :: LangHtmlEntity -> Maybe Text
lang = Maybe Text
lang
                , entity :: LangHtmlEntity -> HtmlEntity
entity = entity :: HtmlEntity
entity@HtmlText
                    { tagStack :: HtmlEntity -> HtmlTagStack
tagStack = HtmlTagStack
tagStack'
                    , rawText :: HtmlEntity -> Text
rawText = Text
rawText'
                    }
                }
      | HtmlTagStack -> Bool
isPreservedTagStack HtmlTagStack
tagStack' Bool -> Bool -> Bool
|| Maybe Text -> Bool
isNeverKorean Maybe Text
lang =
            [HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. a -> Either a b
Left HtmlEntity
entity]
      | Bool
otherwise =
            case Text -> Maybe [(Bool, Text)]
analyzeHanjaText Text
rawText' of
                Maybe [(Bool, Text)]
Nothing -> [HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. a -> Either a b
Left (HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text))
-> HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. (a -> b) -> a -> b
$ HtmlEntity
entity { rawText :: Text
rawText = Text
rawText' }]
                Just [(Bool, Text)]
pairs ->
                    [ if Bool
trueIfHanja
                      then (HtmlTagStack, Text, Text)
-> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. b -> Either a b
Right (HtmlTagStack
tagStack', Text
htmlText, HanjaWordPhoneticizer
phoneticizer Text
htmlText)
                      else HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. a -> Either a b
Left (HtmlEntity
entity { rawText :: Text
rawText = Text
htmlText })
                    -- Note that htmlText here can have HTML entities.
                    | (Bool
trueIfHanja, Text
htmlText) <- [(Bool, Text)]
pairs
                    ]
    transform LangHtmlEntity { HtmlEntity
entity :: HtmlEntity
entity :: LangHtmlEntity -> HtmlEntity
entity } =
        [HtmlEntity -> Either HtmlEntity (HtmlTagStack, Text, Text)
forall a b. a -> Either a b
Left HtmlEntity
entity]
    -- FIXME: This should be public:
    debugRenderer :: HanjaWordRenderer -> HanjaWordRenderer
    debugRenderer :: HanjaWordRenderer -> HanjaWordRenderer
debugRenderer HanjaWordRenderer
render HtmlTagStack
stack Text
hanja Text
hangul =
        HtmlTagStack -> Text -> HtmlEntity
HtmlComment HtmlTagStack
stack (Text
" Hanja: " Text -> HanjaWordPhoneticizer
`append` Text
hanja)
            HtmlEntity -> [HtmlEntity] -> [HtmlEntity]
forall a. a -> [a] -> [a]
: HanjaWordRenderer
render HtmlTagStack
stack Text
hanja Text
hangul [HtmlEntity] -> [HtmlEntity] -> [HtmlEntity]
forall a. [a] -> [a] -> [a]
++ [HtmlTagStack -> Text -> HtmlEntity
HtmlComment HtmlTagStack
stack Text
" /Hanja "]
    wordRenderer' :: HanjaWordRenderer
    wordRenderer' :: HanjaWordRenderer
wordRenderer'
      | Bool
debugComment = HanjaWordRenderer -> HanjaWordRenderer
debugRenderer HanjaWordRenderer
wordRenderer
      | Bool
otherwise = HanjaWordRenderer
wordRenderer
    homophoneRenderer' :: HanjaWordRenderer
    homophoneRenderer' :: HanjaWordRenderer
homophoneRenderer'
      | Bool
debugComment = HanjaWordRenderer -> HanjaWordRenderer
debugRenderer HanjaWordRenderer
homophoneRenderer
      | Bool
otherwise = HanjaWordRenderer
homophoneRenderer

analyzeHanjaText :: Text -> Maybe [(Bool, Text)]
analyzeHanjaText :: Text -> Maybe [(Bool, Text)]
analyzeHanjaText Text
text' =
    case Parser [(Bool, Text)] -> Text -> Either String [(Bool, Text)]
forall a. Parser a -> Text -> Either String a
parseOnly (Parser [(Bool, Text)]
textParser Parser [(Bool, Text)] -> Parser Text () -> Parser [(Bool, Text)]
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput) Text
text' of
        Left String
_ -> Maybe [(Bool, Text)]
forall a. Maybe a
Nothing
        Right [(Bool, Text)]
pairs -> [(Bool, Text)] -> Maybe [(Bool, Text)]
forall a. a -> Maybe a
Just
            [ (Bool
trueIfHanja, Text
text)
            | (Bool
trueIfHanja, Text
text) <- [(Bool, Text)]
pairs
            , Bool -> Bool
not (Text -> Bool
Data.Text.null Text
text)
            ]

-- | Reads a hanja word and returns a corresponding hangul word.
--
-- >>> :set -XOverloadedStrings
-- >>> phoneticizeHanjaWord "漢字"
-- "한자"
--
-- Note that it does not apply Initial Sound Law (頭音法則):
--
-- >>> phoneticizeHanjaWord  "來日"
-- "래일"
phoneticizeHanjaWord :: HanjaWordPhoneticizer
phoneticizeHanjaWord :: HanjaWordPhoneticizer
phoneticizeHanjaWord =
    (Char -> Char) -> HanjaWordPhoneticizer
Data.Text.map Char -> Char
phoneticizeHanjaChar

-- | It is like 'phoneticizeHanjaWord', but it also applies
-- Initial Sound Law (頭音法則).
--
-- >>> :set -XOverloadedStrings
-- >>> phoneticizeHanjaWordWithInitialSoundLaw  "來日"
-- "내일"
-- >>> phoneticizeHanjaWordWithInitialSoundLaw  "未來"
-- "미래"
phoneticizeHanjaWordWithInitialSoundLaw :: HanjaWordPhoneticizer
phoneticizeHanjaWordWithInitialSoundLaw :: HanjaWordPhoneticizer
phoneticizeHanjaWordWithInitialSoundLaw Text
word =
    case Parser Text -> Text -> Either String Text
forall a. Parser a -> Text -> Either String a
parseOnly (Parser Text
parser Parser Text -> Parser Text () -> Parser Text
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Text ()
forall t. Chunk t => Parser t ()
endOfInput) Text
word of
        Left String
_ -> Text
word
        Right Text
"" -> Text
word
        Right Text
hangulWord -> Text
hangulWord
  where
    parser :: Parser Text
    parser :: Parser Text
parser = do
        [Text]
chars <- Parser Text -> Parser Text [Text]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many'
            ( Parser Text -> Parser Text
forall i a. Parser i a -> Parser i a
try Parser Text
yeolYul
            Parser Text -> Parser Text -> Parser Text
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text -> Parser Text
forall i a. Parser i a -> Parser i a
try Parser Text
prefixedNumber
            Parser Text -> Parser Text -> Parser Text
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text -> Parser Text
forall i a. Parser i a -> Parser i a
try Parser Text
hanNumber
            Parser Text -> Parser Text -> Parser Text
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text -> Parser Text
forall i a. Parser i a -> Parser i a
try (Char -> Text
Data.Text.singleton (Char -> Text) -> (Char -> Char) -> Char -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
phoneticize (Char -> Text) -> Parser Text Char -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text Char
anyChar)
            )
        let hangulWord :: Text
hangulWord = [Text] -> Text
Data.Text.concat [Text]
chars
        Text -> Parser Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> Parser Text) -> Text -> Parser Text
forall a b. (a -> b) -> a -> b
$ [Text] -> Text
Data.Text.concat
            [ (Char -> Char) -> HanjaWordPhoneticizer
Data.Text.map Char -> Char
convertInitialSoundLaw HanjaWordPhoneticizer -> HanjaWordPhoneticizer
forall a b. (a -> b) -> a -> b
$ Int -> HanjaWordPhoneticizer
Data.Text.take Int
1 Text
hangulWord
            , Int -> HanjaWordPhoneticizer
Data.Text.drop Int
1 Text
hangulWord
            ]
    yeolYul :: Parser Text
    yeolYul :: Parser Text
yeolYul = do
        Char
former <- (Char -> Bool) -> Parser Text Char
satisfy ((Char -> Bool) -> Parser Text Char)
-> (Char -> Bool) -> Parser Text Char
forall a b. (a -> b) -> a -> b
$ \ Char
c ->
            Char
c Char -> Maybe Char -> Bool
`hasBatchim` Char -> Maybe Char
forall a. a -> Maybe a
Just Char
'\x11ab' Bool -> Bool -> Bool
|| Char
c Char -> Maybe Char -> Bool
`hasBatchim` Maybe Char
forall a. Maybe a
Nothing
        Char
later <- Char -> Parser Text Char
phone Char
'렬' Parser Text Char -> Parser Text Char -> Parser Text Char
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Text Char
phone Char
'률'
        Text -> Parser Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> Parser Text) -> Text -> Parser Text
forall a b. (a -> b) -> a -> b
$ String -> Text
pack
            [ Char -> Char
phoneticize Char
former
            , Char -> Char
convert Char
later
            ]
    prefixedNumber :: Parser Text
    prefixedNumber :: Parser Text
prefixedNumber = do
        Char
prefix <- Char -> Parser Text Char
char Char
'第'
        Text
digits <- (Char -> Bool) -> Parser Text
takeWhile1 Char -> Bool
isHanDigit
        Text -> Parser Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> Parser Text) -> Text -> Parser Text
forall a b. (a -> b) -> a -> b
$ Char -> HanjaWordPhoneticizer
Data.Text.cons
            (Char -> Char
phoneticize Char
prefix)
            ((Char -> Char) -> HanjaWordPhoneticizer
Data.Text.map Char -> Char
convertDigit Text
digits)
    hanNumber :: Parser Text
    hanNumber :: Parser Text
hanNumber = do
        Char
first <- Parser Text Char
hanDigit
        Text
rest <- (Char -> Bool) -> Parser Text
takeWhile1 Char -> Bool
isHanDigit
        Text -> Parser Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> Parser Text) -> Text -> Parser Text
forall a b. (a -> b) -> a -> b
$ (Char -> Char) -> HanjaWordPhoneticizer
Data.Text.map Char -> Char
convertDigit HanjaWordPhoneticizer -> HanjaWordPhoneticizer
forall a b. (a -> b) -> a -> b
$ Char -> HanjaWordPhoneticizer
Data.Text.cons Char
first Text
rest
    hanDigit :: Parser Char
    hanDigit :: Parser Text Char
hanDigit = (Char -> Bool) -> Parser Text Char
satisfy Char -> Bool
isHanDigit
    phone :: Char -> Parser Char
    phone :: Char -> Parser Text Char
phone Char
hangul = (Char -> Bool) -> Parser Text Char
satisfy ((Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
hangul) (Char -> Bool) -> (Char -> Char) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
phoneticize)
    convertDigit :: Char -> Char
    convertDigit :: Char -> Char
convertDigit = Char -> Char
convertInitialSoundLaw (Char -> Char) -> (Char -> Char) -> Char -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
phoneticizeDigit
    convert :: Char -> Char
    convert :: Char -> Char
convert = Char -> Char
convertInitialSoundLaw (Char -> Char) -> (Char -> Char) -> Char -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
phoneticize
    phoneticizeDigit :: Char -> Char
    phoneticizeDigit :: Char -> Char
phoneticizeDigit Char
'參' = Char
'삼'
    phoneticizeDigit Char
'叁' = Char
'삼'
    phoneticizeDigit Char
'参' = Char
'삼'
    phoneticizeDigit Char
'叄' = Char
'삼'
    phoneticizeDigit Char
'拾' = Char
'십'
    phoneticizeDigit Char
c = Char -> Char
phoneticize Char
c
    phoneticize :: Char -> Char
    phoneticize :: Char -> Char
phoneticize = Char -> Char
phoneticizeHanjaChar
    hasBatchim :: Char -> Maybe Char -> Bool
    hasBatchim :: Char -> Maybe Char -> Bool
hasBatchim Char
c Maybe Char
batchim =
        case Char -> Maybe JamoTriple
toJamoTriple (Char -> Char
phoneticize Char
c) of
            Just (Char
_, Char
_, Maybe Char
final) -> Maybe Char
final Maybe Char -> Maybe Char -> Bool
forall a. Eq a => a -> a -> Bool
== Maybe Char
batchim
            Maybe JamoTriple
_ -> Bool
False
    isHanDigit :: Char -> Bool
    isHanDigit :: Char -> Bool
isHanDigit = String -> Char -> Bool
inClass (String -> Char -> Bool) -> String -> Char -> Bool
forall a b. (a -> b) -> a -> b
$
        String
"零一壹壱弌夁二貳贰弐弍貮三參叁参弎叄四肆䦉五伍六陸陆陸七柒漆八捌" String -> String -> String
forall a. [a] -> [a] -> [a]
++
        String
"九玖十拾百佰陌千仟阡萬万億兆京垓秭穰溝澗"

-- | Represents a dictionary that has hanja keys and values of their
-- corresponding hangul readings, e.g., @[("敗北", "패배")]@.
type HanjaDictionary = Trie.Trie Text

-- | Reads a hanja word according to the given dictionary, or falls back to
-- the other phoneticizer if there is no such word in the dictionary.
--
-- It's basically replace one with one:
--
-- >>> :set -XOverloadedLists -XOverloadedStrings
-- >>> let phone = withDictionary [("自轉車", "자전거")] phoneticizeHanjaWord
-- >>> phone "自轉車"
-- "자전거"
--
-- But, if it faces any words or characters that are not registered in
-- the dictionary, it does the best to interpolate prefixes\/infixes\/suffixes
-- using the fallback phoneticizer:
--
-- >>> phone "自轉車道路"
-- "자전거도로"
-- >>> phone "二輪自轉車"
-- "이륜자전거"
withDictionary
    :: HanjaDictionary
    -- ^ Hangul readings of Sino-Korean words.
    -> HanjaWordPhoneticizer
    -- ^ A fallback phoneticize for unregistered words.
    -- E.g., 'phoneticizeHanjaWordWithInitialSoundLaw'.
    -> HanjaWordPhoneticizer
    -- ^ A combined phoneticizer.
withDictionary :: HanjaDictionary -> HanjaWordPhoneticizer -> HanjaWordPhoneticizer
withDictionary HanjaDictionary
_ HanjaWordPhoneticizer
_ Text
"" = Text
""
withDictionary HanjaDictionary
dic HanjaWordPhoneticizer
fallback Text
word =
    case [(Text, Text)]
matches of
        [] ->
            HanjaWordPhoneticizer
fallback Text
word
        (Text
replaced, Text
rest) : [(Text, Text)]
_ ->
            if Text -> Bool
Data.Text.null Text
rest
            then Text
replaced
            else Text
replaced Text -> HanjaWordPhoneticizer
`append` HanjaDictionary -> HanjaWordPhoneticizer -> HanjaWordPhoneticizer
withDictionary HanjaDictionary
dic HanjaWordPhoneticizer
fallback Text
rest
  where
    lookupDic :: Text -> Maybe Text
    lookupDic :: Text -> Maybe Text
lookupDic = (Text -> HanjaDictionary -> Maybe Text
forall a. Text -> Trie a -> Maybe a
`Trie.lookup` HanjaDictionary
dic)
    tries :: [(Text, Text)]
    tries :: [(Text, Text)]
tries =
        [Int -> Text -> (Text, Text)
Data.Text.splitAt Int
pos Text
word | Int
pos <- [Item [Int]
0..Text -> Int
Data.Text.length Text
word]]
    patterns :: Text -> [Text]
    patterns :: Text -> [Text]
patterns Text
word' =
        Text
word' Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: case Text -> Maybe (Text, Char)
unsnoc Text
word' of
            Just (Text
next, Char
_) -> Text -> [Text]
patterns Text
next
            Maybe (Text, Char)
Nothing -> []
    matchTries :: [Maybe (Text, Text, Text)]
    matchTries :: [Maybe (Text, Text, Text)]
matchTries =
        (((Text, Text) -> Maybe (Text, Text, Text))
-> [(Text, Text)] -> [Maybe (Text, Text, Text)]
forall a b. (a -> b) -> [a] -> [b]
`Prelude.map` [(Text, Text)]
tries) (((Text, Text) -> Maybe (Text, Text, Text))
 -> [Maybe (Text, Text, Text)])
-> ((Text, Text) -> Maybe (Text, Text, Text))
-> [Maybe (Text, Text, Text)]
forall a b. (a -> b) -> a -> b
$ \ (Text
unmatched, Text
wd) ->
            case [(Text
p, Text
m) | Text
p <- Text -> [Text]
patterns Text
wd, Text
m <- Maybe Text -> [Text]
forall a. Maybe a -> [a]
maybeToList (Text -> Maybe Text
lookupDic Text
p)] of
                [] -> Maybe (Text, Text, Text)
forall a. Maybe a
Nothing
                (Text, Text)
pair : [(Text, Text)]
_ -> (Text, Text, Text) -> Maybe (Text, Text, Text)
forall a. a -> Maybe a
Just
                    ( Text
unmatched
                    , (Text, Text) -> Text
forall a b. (a, b) -> b
snd (Text, Text)
pair
                    , Int -> HanjaWordPhoneticizer
Data.Text.drop (Text -> Int
Data.Text.length (Text -> Int) -> Text -> Int
forall a b. (a -> b) -> a -> b
$ (Text, Text) -> Text
forall a b. (a, b) -> a
fst (Text, Text)
pair) Text
wd
                    )
    matches :: [(Text, Text)]
    matches :: [(Text, Text)]
matches =
        [ (HanjaWordPhoneticizer
fallback Text
unmatched Text -> HanjaWordPhoneticizer
`append` Text
matched, Text
rest)
        | Just (Text
unmatched, Text
matched, Text
rest) <- [Maybe (Text, Text, Text)]
matchTries
        ]

-- | Reads a hanja character as a hangul character.
--
-- >>> phoneticizeHanjaChar '漢'
-- '한'
--
-- Note that it does not follow Initial Sound Law (頭音法則):
--
-- >>> phoneticizeHanjaChar '六'
-- '륙'
phoneticizeHanjaChar :: Char -> Char
phoneticizeHanjaChar :: Char -> Char
phoneticizeHanjaChar Char
c = Char -> Maybe Char -> Char
forall a. a -> Maybe a -> a
fromMaybe Char
c (Maybe Char -> Char) -> Maybe Char -> Char
forall a b. (a -> b) -> a -> b
$ do
    HanjaReadings
readings <- Char -> Map Char HanjaReadings -> Maybe HanjaReadings
forall k a. Ord k => k -> Map k a -> Maybe a
lookup Char
c Map Char HanjaReadings
kHangulData
    let readings' :: [(Char, HanjaReadingCitation)]
readings' = HanjaReadings -> [(Char, HanjaReadingCitation)]
forall k a. Map k a -> [(k, a)]
Data.Map.Strict.toList HanjaReadings
readings
    let (Char
sound, HanjaReadingCitation
_) = ((Char, HanjaReadingCitation)
 -> (Char, HanjaReadingCitation) -> Ordering)
-> [(Char, HanjaReadingCitation)] -> (Char, HanjaReadingCitation)
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
minimumBy (((Char, HanjaReadingCitation) -> HanjaReadingCitation)
-> (Char, HanjaReadingCitation)
-> (Char, HanjaReadingCitation)
-> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Char, HanjaReadingCitation) -> HanjaReadingCitation
forall a b. (a, b) -> b
snd) [(Char, HanjaReadingCitation)]
readings'
    let initialLawReverted :: Set Char
initialLawReverted = (Char -> Bool) -> Set Char -> Set Char
forall a. (a -> Bool) -> Set a -> Set a
Data.Set.filter
            (Char -> HanjaReadings -> Bool
forall k a. Ord k => k -> Map k a -> Bool
`Data.Map.Strict.member` HanjaReadings
readings)
            (Char -> Set Char
revertInitialSoundLaw Char
sound)
    Char -> Maybe Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> Maybe Char) -> Char -> Maybe Char
forall a b. (a -> b) -> a -> b
$ case Set Char -> String
forall a. Set a -> [a]
Data.Set.toList Set Char
initialLawReverted of
        [] -> Char
sound
        Char
reverted : String
_ -> Char
reverted

withoutBatchim :: Char -> Maybe (Char, Maybe Char)
withoutBatchim :: Char -> Maybe (Char, Maybe Char)
withoutBatchim Char
hangul = do
    (Char
initial, Char
vowel, Maybe Char
final) <- Char -> Maybe JamoTriple
toJamoTriple Char
hangul
    Char
noBatchim <- JamoTriple -> Maybe Char
fromJamoTriple (Char
initial, Char
vowel, Maybe Char
forall a. Maybe a
Nothing)
    (Char, Maybe Char) -> Maybe (Char, Maybe Char)
forall (m :: * -> *) a. Monad m => a -> m a
return (Char
noBatchim, Maybe Char
final)

withBatchim :: Char -> Maybe Char -> Maybe Char
withBatchim :: Char -> Maybe Char -> Maybe Char
withBatchim Char
hangul Maybe Char
final = do
    (Char
initial, Char
vowel, Maybe Char
_) <- Char -> Maybe JamoTriple
toJamoTriple Char
hangul
    JamoTriple -> Maybe Char
fromJamoTriple (Char
initial, Char
vowel, Maybe Char
final)

-- | Converts a hangul character according to Initial Sound Law (頭音法則).
--
-- >>> convertInitialSoundLaw '념'
-- '염'
--
-- If an input is not a hangul syllable or a syllable is not applicable to
-- the law it returns the given input without change:
--
-- >>> convertInitialSoundLaw 'A'
-- 'A'
-- >>> convertInitialSoundLaw '가'
-- '가'
convertInitialSoundLaw :: Char -> Char
convertInitialSoundLaw :: Char -> Char
convertInitialSoundLaw Char
sound = Char -> Maybe Char -> Char
forall a. a -> Maybe a -> a
fromMaybe Char
sound (Maybe Char -> Char) -> Maybe Char -> Char
forall a b. (a -> b) -> a -> b
$ do
    (Char
pattern', Maybe Char
final) <- Char -> Maybe (Char, Maybe Char)
withoutBatchim Char
sound
    let converted :: Char
converted = Char -> Char -> Map Char Char -> Char
forall k a. Ord k => a -> k -> Map k a -> a
findWithDefault Char
pattern' Char
pattern' Map Char Char
initialSoundLawTable
    Char -> Maybe Char -> Maybe Char
withBatchim Char
converted Maybe Char
final

-- | It's a kind of inverse function of 'convertInitialSoundLaw',
-- except it returns a set of candidates instead of a single canonical answer
-- because Initial Sound Law (頭音法則) is not a bijective function.
--
-- >>> revertInitialSoundLaw '예'
-- fromList "례"
-- >>> revertInitialSoundLaw '염'
-- fromList "념렴"
--
-- It returns an empty set if an input is not applicable to the law:
--
-- >>> revertInitialSoundLaw '가'
-- fromList ""
revertInitialSoundLaw :: Char -> Set Char
revertInitialSoundLaw :: Char -> Set Char
revertInitialSoundLaw Char
sound = Set Char -> Maybe (Set Char) -> Set Char
forall a. a -> Maybe a -> a
fromMaybe Set Char
forall a. Set a
Data.Set.empty (Maybe (Set Char) -> Set Char) -> Maybe (Set Char) -> Set Char
forall a b. (a -> b) -> a -> b
$ do
    (Char
pattern', Maybe Char
final) <- Char -> Maybe (Char, Maybe Char)
withoutBatchim Char
sound
    let candidates :: String
candidates = Set Char -> String
forall a. Set a -> [a]
Data.Set.toList (Set Char -> String) -> Set Char -> String
forall a b. (a -> b) -> a -> b
$
            Set Char -> Char -> Map Char (Set Char) -> Set Char
forall k a. Ord k => a -> k -> Map k a -> a
findWithDefault Set Char
forall a. Set a
Data.Set.empty Char
pattern' Map Char (Set Char)
initialSoundLawTable'
    Set Char -> Maybe (Set Char)
forall a. a -> Maybe a
Just (Set Char -> Maybe (Set Char)) -> Set Char -> Maybe (Set Char)
forall a b. (a -> b) -> a -> b
$ String -> Set Char
forall a. Ord a => [a] -> Set a
Data.Set.fromList (String -> Set Char) -> String -> Set Char
forall a b. (a -> b) -> a -> b
$ [Maybe Char] -> String
forall a. [Maybe a] -> [a]
catMaybes ([Maybe Char] -> String) -> [Maybe Char] -> String
forall a b. (a -> b) -> a -> b
$ String
candidates String -> (Char -> Maybe Char) -> [Maybe Char]
forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> (Char -> Maybe Char -> Maybe Char
`withBatchim` Maybe Char
final)
  where
    (<&>) :: Functor f => f a -> (a -> b) -> f b
    <&> :: f a -> (a -> b) -> f b
(<&>) = ((a -> b) -> f a -> f b) -> f a -> (a -> b) -> f b
forall a b c. (a -> b -> c) -> b -> a -> c
flip (a -> b) -> f a -> f b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap

textParser :: Parser [(Bool, Text)]
textParser :: Parser [(Bool, Text)]
textParser = ([[(Bool, Text)]] -> [(Bool, Text)])
-> Parser Text [[(Bool, Text)]] -> Parser [(Bool, Text)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [[(Bool, Text)]] -> [(Bool, Text)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
Data.List.concat (Parser Text [[(Bool, Text)]] -> Parser [(Bool, Text)])
-> Parser Text [[(Bool, Text)]] -> Parser [(Bool, Text)]
forall a b. (a -> b) -> a -> b
$ Parser [(Bool, Text)] -> Parser Text [[(Bool, Text)]]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' (Parser [(Bool, Text)] -> Parser Text [[(Bool, Text)]])
-> Parser [(Bool, Text)] -> Parser Text [[(Bool, Text)]]
forall a b. (a -> b) -> a -> b
$ do
    -- We have 3 passes to optimize by utilizing takeWhile instead of many'
    String
hanjaEntities <- Parser Text Char -> Parser Text String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' (Parser Text Char -> Parser Text String)
-> Parser Text Char -> Parser Text String
forall a b. (a -> b) -> a -> b
$ Parser Text Char -> Parser Text Char
forall i a. Parser i a -> Parser i a
try (Parser Text Char -> Parser Text Char)
-> Parser Text Char -> Parser Text Char
forall a b. (a -> b) -> a -> b
$ do
        Char
c <- Parser Text Char
unnamedCharRef
        Bool -> Parser Text () -> Parser Text ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Char -> Bool
isHanjaOrDigit Char
c) (String -> Parser Text ()
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"not a hanja")
        Char -> Parser Text Char
forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
    Text
hanjaCharsText <- (Char -> Bool) -> Parser Text
Data.Attoparsec.Text.takeWhile Char -> Bool
isHanjaOrDigit
    String
hanjaChars <- Parser Text Char -> Parser Text String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' (Parser Text Char -> Parser Text String)
-> Parser Text Char -> Parser Text String
forall a b. (a -> b) -> a -> b
$ Parser Text Char -> Parser Text Char
forall i a. Parser i a -> Parser i a
try (Parser Text Char -> Parser Text Char)
-> Parser Text Char -> Parser Text Char
forall a b. (a -> b) -> a -> b
$ do
        Char
c <- Parser Text Char
unnamedCharRef Parser Text Char -> Parser Text Char -> Parser Text Char
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text Char
anyChar
        Bool -> Parser Text () -> Parser Text ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Char -> Bool
isHanjaOrDigit Char
c) (String -> Parser Text ()
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"not a hanja")
        Char -> Parser Text Char
forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
    -- Note that the parsed result can still have HTML entities; these
    -- are never touched.
    String
entities <- Parser Text Char -> Parser Text String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' (Parser Text Char -> Parser Text String)
-> Parser Text Char -> Parser Text String
forall a b. (a -> b) -> a -> b
$ Parser Text Char -> Parser Text Char
forall i a. Parser i a -> Parser i a
try (Parser Text Char -> Parser Text Char)
-> Parser Text Char -> Parser Text Char
forall a b. (a -> b) -> a -> b
$ do
        Char
c <- Parser Text Char
unnamedCharRef
        Bool -> Parser Text () -> Parser Text ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Char -> Bool
isHanjaOrDigit Char
c) (String -> Parser Text ()
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"a hanja")
        Char -> Parser Text Char
forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
    Text
charsText <- (Char -> Bool) -> Parser Text
takeTill Char -> Bool
isHanjaOrDigit
    String
chars <- Parser Text Char -> Parser Text String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' (Parser Text Char -> Parser Text String)
-> Parser Text Char -> Parser Text String
forall a b. (a -> b) -> a -> b
$ Parser Text Char -> Parser Text Char
forall i a. Parser i a -> Parser i a
try (Parser Text Char -> Parser Text Char)
-> Parser Text Char -> Parser Text Char
forall a b. (a -> b) -> a -> b
$ do
        Char
c <- Parser Text Char
unnamedCharRef Parser Text Char -> Parser Text Char -> Parser Text Char
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text Char
anyChar
        Bool -> Parser Text () -> Parser Text ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Char -> Bool
isHanjaOrDigit Char
c) (String -> Parser Text ()
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"a hanja")
        Char -> Parser Text Char
forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
    let hanjaText :: Text
hanjaText = [Text] -> Text
Data.Text.concat
            [String -> Text
pack String
hanjaEntities, Text
Item [Text]
hanjaCharsText, String -> Text
pack String
hanjaChars]
    let text' :: Text
text' = [Text] -> Text
Data.Text.concat [String -> Text
pack String
entities, Text
Item [Text]
charsText, String -> Text
pack String
chars]
    Bool -> Parser Text () -> Parser Text ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Text -> Bool
Data.Text.null (Text -> Bool) -> Text -> Bool
forall a b. (a -> b) -> a -> b
$ Text
hanjaText Text -> HanjaWordPhoneticizer
`append` Text
text') (String -> Parser Text ()
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"parsed nothing")
    [(Bool, Text)] -> Parser [(Bool, Text)]
forall (m :: * -> *) a. Monad m => a -> m a
return [(Bool
True, Text
hanjaText), (Bool
False, Text
text')]
  where
    isHanjaOrDigit :: Char -> Bool
    isHanjaOrDigit :: Char -> Bool
isHanjaOrDigit Char
c =
        Char -> Bool
isDigit Char
c Bool -> Bool -> Bool
|| Char -> Bool
isHanja Char
c
    isHanja :: Char -> Bool
    isHanja :: Char -> Bool
isHanja Char
c =
        -- Ideographic Description Character
        Char
'\x2f00'  Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2fff' Bool -> Bool -> Bool
||
        -- U+3007 IDEOGRAPHIC NUMBER ZERO (〇)
        Char
'\x3007'  Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension A
        Char
'\x3400'  Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x4dbf' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs
        Char
'\x4e00'  Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x9fcc' Bool -> Bool -> Bool
||
        -- CJK Compatibility Ideographs
        Char
'\xf900'  Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xfaff' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension B
        Char
'\x20000' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2a6d6' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension C
        Char
'\x2a700' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2b734' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension D
        Char
'\x2b740' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2b81d' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension E
        Char
'\x2b820' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2cea1' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension F
        Char
'\x2ceb0' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2ebe0' Bool -> Bool -> Bool
||
        -- CJK Compatibility Ideographs Supplement
        Char
'\x2f800' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2fa1f'
    unnamedCharRef :: Parser Char
    unnamedCharRef :: Parser Text Char
unnamedCharRef = do
        Char
_ <- Char -> Parser Text Char
char Char
'&'
        Char
_ <- Char -> Parser Text Char
char Char
'#'
        Bool
hex <- Bool -> Parser Text Bool -> Parser Text Bool
forall (f :: * -> *) a. Alternative f => a -> f a -> f a
option Bool
False ((Char -> Parser Text Char
char Char
'x' Parser Text Char -> Parser Text Char -> Parser Text Char
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Text Char
char Char
'X') Parser Text Char -> Parser Text Bool -> Parser Text Bool
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Bool -> Parser Text Bool
forall (m :: * -> *) a. Monad m => a -> m a
return Bool
True)
        Int
codepoint <- if Bool
hex then Parser Int
forall a. (Integral a, Bits a) => Parser a
hexadecimal else Parser Int
forall a. Integral a => Parser a
decimal
        Char
_ <- Char -> Parser Text Char
char Char
';'
        Char -> Parser Text Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> Parser Text Char) -> Char -> Parser Text Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr Int
codepoint

-- | The Initial Sound Law (頭音法則) table according to South Korean
-- /Hangul Orthography/ (한글 맞춤법) Clause 5, Section 52, Chapter 6
-- (第6章52項5節).  Keys are an original Sino-Korean sound and values
-- are a converted sound according to the law.
initialSoundLawTable :: Map Char Char
initialSoundLawTable :: Map Char Char
initialSoundLawTable =
    [ (Char
'녀', Char
'여')
    , (Char
'뇨', Char
'요')
    , (Char
'뉴', Char
'유')
    , (Char
'니', Char
'이')
    , (Char
'랴', Char
'야')
    , (Char
'려', Char
'여')
    , (Char
'례', Char
'예')
    , (Char
'료', Char
'요')
    , (Char
'류', Char
'유')
    , (Char
'리', Char
'이')
    , (Char
'라', Char
'나')
    , (Char
'래', Char
'내')
    , (Char
'로', Char
'노')
    , (Char
'뢰', Char
'뇌')
    , (Char
'루', Char
'누')
    , (Char
'르', Char
'느')
    ]

-- | Contains the same contents to 'initialSoundLawTable' except that
-- keys and values are crossed: keys are a converted sound and values are
-- possible original sounds.
initialSoundLawTable' :: Map Char (Set Char)
initialSoundLawTable' :: Map Char (Set Char)
initialSoundLawTable' =
    (Char -> Char -> Map Char (Set Char) -> Map Char (Set Char))
-> Map Char (Set Char) -> Map Char Char -> Map Char (Set Char)
forall k a b. (k -> a -> b -> b) -> b -> Map k a -> b
foldrWithKey Char -> Char -> Map Char (Set Char) -> Map Char (Set Char)
f Map Char (Set Char)
forall k a. Map k a
Data.Map.Strict.empty Map Char Char
initialSoundLawTable
  where
    f :: Char -> Char -> Map Char (Set Char) -> Map Char (Set Char)
    f :: Char -> Char -> Map Char (Set Char) -> Map Char (Set Char)
f Char
original Char
converted =
        (Set Char -> Set Char -> Set Char)
-> Char -> Set Char -> Map Char (Set Char) -> Map Char (Set Char)
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
insertWith Set Char -> Set Char -> Set Char
forall a. Ord a => Set a -> Set a -> Set a
Data.Set.union Char
converted (Char -> Set Char
forall a. a -> Set a
Data.Set.singleton Char
original)