{-# LANGUAGE OverloadedStrings #-}
module Text.Seonbi.Html.Lang
    ( LangHtmlEntity (..)
    , LanguageTag
    , annotateWithLang
    , extractLang
    , isKorean
    , isNeverKorean
    ) where

import Control.Applicative
import Data.Char (isSpace)
import Data.Maybe

import Data.Attoparsec.Text
import Data.Text

import Text.Seonbi.Html.Entity
import Text.Seonbi.Html.Tag (HtmlTag)

-- | Represents a language tag.  Although it is defined as an alias for 'Text',
-- it can be structured in the future.  Do not use its contents directly.
type LanguageTag = Text

-- | Extracts the language tag from the given raw HTML attributes if it has
-- @lang@ attribute.
--
-- >>> extractLang ""
-- Nothing
-- >>> extractLang "lang=en"
-- Just "en"
-- >>> extractLang "lang=\"ko-KR\""
-- Just "ko-kr"
-- >>> extractLang " lang='ko-Hang'"
-- Just "ko-hang"
extractLang
    :: HtmlRawAttrs
    -- ^ A raw HTML attributes to extract the language tag from.
    -> Maybe LanguageTag
    -- ^ A language tag extracted from the given raw HTML attributes.
    -- If the given raw HTML attributes does not have @lang@ attribute or
    -- its value is invalid, 'Nothing' is returned.
extractLang :: HtmlRawAttrs -> Maybe HtmlRawAttrs
extractLang HtmlRawAttrs
attrs =
    case Parser (Maybe HtmlRawAttrs)
-> HtmlRawAttrs -> Either String (Maybe HtmlRawAttrs)
forall a. Parser a -> HtmlRawAttrs -> Either String a
parseOnly Parser (Maybe HtmlRawAttrs)
parser' HtmlRawAttrs
attrs of
        Right (Just HtmlRawAttrs
lang') ->
            let lt :: HtmlRawAttrs
lt = HtmlRawAttrs -> HtmlRawAttrs
toLower (HtmlRawAttrs -> HtmlRawAttrs)
-> (HtmlRawAttrs -> HtmlRawAttrs) -> HtmlRawAttrs -> HtmlRawAttrs
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HtmlRawAttrs -> HtmlRawAttrs
strip (HtmlRawAttrs -> HtmlRawAttrs)
-> (HtmlRawAttrs -> HtmlRawAttrs) -> HtmlRawAttrs -> HtmlRawAttrs
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HtmlRawAttrs -> HtmlRawAttrs
normalizeEntities (HtmlRawAttrs -> HtmlRawAttrs) -> HtmlRawAttrs -> HtmlRawAttrs
forall a b. (a -> b) -> a -> b
$ HtmlRawAttrs
lang'
            in if HtmlRawAttrs -> Bool
Data.Text.null HtmlRawAttrs
lt then Maybe HtmlRawAttrs
forall a. Maybe a
Nothing else HtmlRawAttrs -> Maybe HtmlRawAttrs
forall a. a -> Maybe a
Just HtmlRawAttrs
lt
        Either String (Maybe HtmlRawAttrs)
_ -> Maybe HtmlRawAttrs
forall a. Maybe a
Nothing
  where
    parser' :: Parser (Maybe Text)
    parser' :: Parser (Maybe HtmlRawAttrs)
parser' = do
        Parser ()
skipSpace
        [Maybe HtmlRawAttrs]
attrs' <- Parser (Maybe HtmlRawAttrs)
langAttr Parser (Maybe HtmlRawAttrs)
-> Parser HtmlRawAttrs Char
-> Parser HtmlRawAttrs [Maybe HtmlRawAttrs]
forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`sepBy` Parser HtmlRawAttrs Char
space
        Parser ()
skipSpace
        Maybe HtmlRawAttrs -> Parser (Maybe HtmlRawAttrs)
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe HtmlRawAttrs -> Parser (Maybe HtmlRawAttrs))
-> Maybe HtmlRawAttrs -> Parser (Maybe HtmlRawAttrs)
forall a b. (a -> b) -> a -> b
$ [HtmlRawAttrs] -> Maybe HtmlRawAttrs
forall a. [a] -> Maybe a
listToMaybe ([HtmlRawAttrs] -> Maybe HtmlRawAttrs)
-> [HtmlRawAttrs] -> Maybe HtmlRawAttrs
forall a b. (a -> b) -> a -> b
$ [Maybe HtmlRawAttrs] -> [HtmlRawAttrs]
forall a. [Maybe a] -> [a]
catMaybes [Maybe HtmlRawAttrs]
attrs'
    langAttr :: Parser (Maybe Text)
    langAttr :: Parser (Maybe HtmlRawAttrs)
langAttr = do
        (Bool
isLang, Bool
cont) <- Parser (Bool, Bool)
attrIsLang
        HtmlRawAttrs
value <- if Bool
cont then Parser HtmlRawAttrs
attrValue else HtmlRawAttrs -> Parser HtmlRawAttrs
forall (m :: * -> *) a. Monad m => a -> m a
return HtmlRawAttrs
""
        Maybe HtmlRawAttrs -> Parser (Maybe HtmlRawAttrs)
forall (m :: * -> *) a. Monad m => a -> m a
return (if Bool
isLang then HtmlRawAttrs -> Maybe HtmlRawAttrs
forall a. a -> Maybe a
Just HtmlRawAttrs
value else Maybe HtmlRawAttrs
forall a. Maybe a
Nothing)
    attrIsLang :: Parser (Bool, Bool)
    attrIsLang :: Parser (Bool, Bool)
attrIsLang = [Parser (Bool, Bool)] -> Parser (Bool, Bool)
forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
        [ HtmlRawAttrs -> Parser HtmlRawAttrs
asciiCI HtmlRawAttrs
"lang=" Parser HtmlRawAttrs -> Parser (Bool, Bool) -> Parser (Bool, Bool)
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Bool, Bool) -> Parser (Bool, Bool)
forall (m :: * -> *) a. Monad m => a -> m a
return (Bool
True, Bool
True)
        , do { HtmlRawAttrs
_ <- (Char -> Bool) -> Parser HtmlRawAttrs
takeWhile1 (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'=')
             ; Maybe Char
eq <- Parser HtmlRawAttrs Char -> Parser HtmlRawAttrs (Maybe Char)
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional (Char -> Parser HtmlRawAttrs Char
char Char
'=')
             ; (Bool, Bool) -> Parser (Bool, Bool)
forall (m :: * -> *) a. Monad m => a -> m a
return (Bool
False, Maybe Char -> Bool
forall a. Maybe a -> Bool
isJust Maybe Char
eq)
             }
        ]
    attrValue :: Parser Text
    attrValue :: Parser HtmlRawAttrs
attrValue = [Parser HtmlRawAttrs] -> Parser HtmlRawAttrs
forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
        [ do { (Char -> Bool) -> Parser ()
skip (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'"'); HtmlRawAttrs
v <- (Char -> Bool) -> Parser HtmlRawAttrs
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'"'); (Char -> Bool) -> Parser ()
skip (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'"'); HtmlRawAttrs -> Parser HtmlRawAttrs
forall (m :: * -> *) a. Monad m => a -> m a
return HtmlRawAttrs
v }
        , do { (Char -> Bool) -> Parser ()
skip (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\'')
             ; HtmlRawAttrs
v <- (Char -> Bool) -> Parser HtmlRawAttrs
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\'')
             ; (Char -> Bool) -> Parser ()
skip (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\''); HtmlRawAttrs -> Parser HtmlRawAttrs
forall (m :: * -> *) a. Monad m => a -> m a
return HtmlRawAttrs
v
             }
        , (Char -> Bool) -> Parser HtmlRawAttrs
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace)
        ]
    normalizeEntities :: Text -> Text
    normalizeEntities :: HtmlRawAttrs -> HtmlRawAttrs
normalizeEntities
        = HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs
Data.Text.replace HtmlRawAttrs
"&hyphen;" HtmlRawAttrs
"-"
        (HtmlRawAttrs -> HtmlRawAttrs)
-> (HtmlRawAttrs -> HtmlRawAttrs) -> HtmlRawAttrs -> HtmlRawAttrs
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs
Data.Text.replace HtmlRawAttrs
"&dash;" HtmlRawAttrs
"-"
        (HtmlRawAttrs -> HtmlRawAttrs)
-> (HtmlRawAttrs -> HtmlRawAttrs) -> HtmlRawAttrs -> HtmlRawAttrs
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs
Data.Text.replace HtmlRawAttrs
"&#8208;" HtmlRawAttrs
"-"
        (HtmlRawAttrs -> HtmlRawAttrs)
-> (HtmlRawAttrs -> HtmlRawAttrs) -> HtmlRawAttrs -> HtmlRawAttrs
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs
Data.Text.replace HtmlRawAttrs
"&#x2010;" HtmlRawAttrs
"-"
        (HtmlRawAttrs -> HtmlRawAttrs)
-> (HtmlRawAttrs -> HtmlRawAttrs) -> HtmlRawAttrs -> HtmlRawAttrs
forall b c a. (b -> c) -> (a -> b) -> a -> c
. HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs -> HtmlRawAttrs
Data.Text.replace HtmlRawAttrs
"&#X2010;" HtmlRawAttrs
"-"

-- | Annotates 'HtmlEntity' with the 'lang' tag extracted from it or its
-- ancestors.
data LangHtmlEntity = LangHtmlEntity
    { -- | The @lang@ tag extracted from the HTML 'entity' or its ancestors.
      LangHtmlEntity -> Maybe HtmlRawAttrs
lang :: Maybe LanguageTag
      -- | The annotated HTML 'entity'.
    , LangHtmlEntity -> HtmlEntity
entity :: HtmlEntity
    } deriving (Int -> LangHtmlEntity -> ShowS
[LangHtmlEntity] -> ShowS
LangHtmlEntity -> String
(Int -> LangHtmlEntity -> ShowS)
-> (LangHtmlEntity -> String)
-> ([LangHtmlEntity] -> ShowS)
-> Show LangHtmlEntity
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [LangHtmlEntity] -> ShowS
$cshowList :: [LangHtmlEntity] -> ShowS
show :: LangHtmlEntity -> String
$cshow :: LangHtmlEntity -> String
showsPrec :: Int -> LangHtmlEntity -> ShowS
$cshowsPrec :: Int -> LangHtmlEntity -> ShowS
Show, LangHtmlEntity -> LangHtmlEntity -> Bool
(LangHtmlEntity -> LangHtmlEntity -> Bool)
-> (LangHtmlEntity -> LangHtmlEntity -> Bool) -> Eq LangHtmlEntity
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: LangHtmlEntity -> LangHtmlEntity -> Bool
$c/= :: LangHtmlEntity -> LangHtmlEntity -> Bool
== :: LangHtmlEntity -> LangHtmlEntity -> Bool
$c== :: LangHtmlEntity -> LangHtmlEntity -> Bool
Eq)

-- | Annotates the given HTML entities with the language tag extracted from
-- their @lang@ attributes.  If a parent entity has @lang@ attribute, its
-- all descendants are annotated with the same language tag.
annotateWithLang :: [HtmlEntity] -> [LangHtmlEntity]
annotateWithLang :: [HtmlEntity] -> [LangHtmlEntity]
annotateWithLang =
    [(HtmlTag, Maybe HtmlRawAttrs)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate []
  where
    annotate :: [(HtmlTag, Maybe LanguageTag)]
             -> [HtmlEntity]
             -> [LangHtmlEntity]
    annotate :: [(HtmlTag, Maybe HtmlRawAttrs)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate [(HtmlTag, Maybe HtmlRawAttrs)]
_ [] = []
    annotate [(HtmlTag, Maybe HtmlRawAttrs)]
stack (x :: HtmlEntity
x@HtmlStartTag { tag :: HtmlEntity -> HtmlTag
tag = HtmlTag
tag', rawAttributes :: HtmlEntity -> HtmlRawAttrs
rawAttributes = HtmlRawAttrs
attrs } : [HtmlEntity]
xs) =
        Maybe HtmlRawAttrs -> HtmlEntity -> LangHtmlEntity
LangHtmlEntity Maybe HtmlRawAttrs
thisLang HtmlEntity
x LangHtmlEntity -> [LangHtmlEntity] -> [LangHtmlEntity]
forall a. a -> [a] -> [a]
: [(HtmlTag, Maybe HtmlRawAttrs)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate [(HtmlTag, Maybe HtmlRawAttrs)]
nextStack [HtmlEntity]
xs
      where
        parentLang :: Maybe LanguageTag
        parentLang :: Maybe HtmlRawAttrs
parentLang = case [(HtmlTag, Maybe HtmlRawAttrs)]
stack of
            (HtmlTag
_, Maybe HtmlRawAttrs
l):[(HtmlTag, Maybe HtmlRawAttrs)]
_ -> Maybe HtmlRawAttrs
l
            [(HtmlTag, Maybe HtmlRawAttrs)]
_ -> Maybe HtmlRawAttrs
forall a. Maybe a
Nothing
        thisLang :: Maybe LanguageTag
        thisLang :: Maybe HtmlRawAttrs
thisLang = HtmlRawAttrs -> Maybe HtmlRawAttrs
extractLang HtmlRawAttrs
attrs Maybe HtmlRawAttrs -> Maybe HtmlRawAttrs -> Maybe HtmlRawAttrs
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Maybe HtmlRawAttrs
parentLang
        nextStack :: [(HtmlTag, Maybe LanguageTag)]
        nextStack :: [(HtmlTag, Maybe HtmlRawAttrs)]
nextStack = (HtmlTag
tag', Maybe HtmlRawAttrs
thisLang) (HtmlTag, Maybe HtmlRawAttrs)
-> [(HtmlTag, Maybe HtmlRawAttrs)]
-> [(HtmlTag, Maybe HtmlRawAttrs)]
forall a. a -> [a] -> [a]
: [(HtmlTag, Maybe HtmlRawAttrs)]
stack
    annotate [(HtmlTag, Maybe HtmlRawAttrs)]
stack (x :: HtmlEntity
x@HtmlEndTag { tag :: HtmlEntity -> HtmlTag
tag = HtmlTag
tag' } : [HtmlEntity]
xs) =
        Maybe HtmlRawAttrs -> HtmlEntity -> LangHtmlEntity
LangHtmlEntity Maybe HtmlRawAttrs
thisLang HtmlEntity
x LangHtmlEntity -> [LangHtmlEntity] -> [LangHtmlEntity]
forall a. a -> [a] -> [a]
: [(HtmlTag, Maybe HtmlRawAttrs)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate [(HtmlTag, Maybe HtmlRawAttrs)]
nextStack [HtmlEntity]
xs
      where
        ([(HtmlTag, Maybe HtmlRawAttrs)]
nextStack, Maybe HtmlRawAttrs
thisLang) = case [(HtmlTag, Maybe HtmlRawAttrs)]
stack of
            [] -> ([], Maybe HtmlRawAttrs
forall a. Maybe a
Nothing)
            s :: [(HtmlTag, Maybe HtmlRawAttrs)]
s@((HtmlTag
t, Maybe HtmlRawAttrs
lang'):[(HtmlTag, Maybe HtmlRawAttrs)]
ys) ->
                (if HtmlTag
t HtmlTag -> HtmlTag -> Bool
forall a. Eq a => a -> a -> Bool
== HtmlTag
tag' then [(HtmlTag, Maybe HtmlRawAttrs)]
ys else [(HtmlTag, Maybe HtmlRawAttrs)]
s, Maybe HtmlRawAttrs
lang')
    annotate [(HtmlTag, Maybe HtmlRawAttrs)]
stack (HtmlEntity
x : [HtmlEntity]
xs) =
        Maybe HtmlRawAttrs -> HtmlEntity -> LangHtmlEntity
LangHtmlEntity Maybe HtmlRawAttrs
parentLang HtmlEntity
x LangHtmlEntity -> [LangHtmlEntity] -> [LangHtmlEntity]
forall a. a -> [a] -> [a]
: [(HtmlTag, Maybe HtmlRawAttrs)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate [(HtmlTag, Maybe HtmlRawAttrs)]
stack [HtmlEntity]
xs
      where
        parentLang :: Maybe LanguageTag
        parentLang :: Maybe HtmlRawAttrs
parentLang = case [(HtmlTag, Maybe HtmlRawAttrs)]
stack of
            (HtmlTag
_, Maybe HtmlRawAttrs
l):[(HtmlTag, Maybe HtmlRawAttrs)]
_ -> Maybe HtmlRawAttrs
l
            [(HtmlTag, Maybe HtmlRawAttrs)]
_ -> Maybe HtmlRawAttrs
forall a. Maybe a
Nothing

-- | Determines whether the given language tag refers to any kind of Korean.
--
-- >>> isKorean "ko"
-- True
-- >>> isKorean "ko-KR"
-- True
-- >>> isKorean "kor-Hang"
-- True
-- >>> isKorean "en"
-- False
-- >>> isKorean "en-KR"
-- False
isKorean :: LanguageTag -> Bool
isKorean :: HtmlRawAttrs -> Bool
isKorean HtmlRawAttrs
lang' =
    HtmlRawAttrs
l HtmlRawAttrs -> HtmlRawAttrs -> Bool
forall a. Eq a => a -> a -> Bool
== HtmlRawAttrs
"ko" Bool -> Bool -> Bool
|| HtmlRawAttrs
l HtmlRawAttrs -> HtmlRawAttrs -> Bool
forall a. Eq a => a -> a -> Bool
== HtmlRawAttrs
"kor" Bool -> Bool -> Bool
||
    HtmlRawAttrs
"ko-" HtmlRawAttrs -> HtmlRawAttrs -> Bool
`isPrefixOf` HtmlRawAttrs
l Bool -> Bool -> Bool
||
    HtmlRawAttrs
"kor-" HtmlRawAttrs -> HtmlRawAttrs -> Bool
`isPrefixOf` HtmlRawAttrs
l
  where
    l :: Text
    l :: HtmlRawAttrs
l = HtmlRawAttrs -> HtmlRawAttrs
toLower HtmlRawAttrs
lang'

-- | Determines whether the given language tag undoubtedly does not refer
-- to any kind of Korean.
--
-- >>> isNeverKorean $ Just "ko"
-- False
-- >>> isNeverKorean $ Just "ko-KR"
-- False
-- >>> isNeverKorean Nothing
-- False
-- >>> isNeverKorean $ Just "en"
-- True
isNeverKorean :: Maybe LanguageTag -> Bool
isNeverKorean :: Maybe HtmlRawAttrs -> Bool
isNeverKorean Maybe HtmlRawAttrs
Nothing = Bool
False
isNeverKorean (Just HtmlRawAttrs
lang') = Bool -> Bool
not (HtmlRawAttrs -> Bool
isKorean HtmlRawAttrs
lang')