{-|
Description:    Tokenization rules for characters within @\<script\>@ ... @\</script\>@
                sections in HTML comments in script data.

Copyright:      (c) 2020-2021 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      stable
Portability:    portable
-}
module Web.Mangrove.Parse.Tokenize.ScriptDataDoubleEscaped
    ( tokenScriptDataDoubleEscaped
    , tokenScriptDataDoubleEscapeStart
    ) where


import qualified Control.Applicative as A

import qualified Data.Maybe as Y

import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Tokenize.Common
import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Parser

import {-# SOURCE #-} Web.Mangrove.Parse.Tokenize.Dispatcher


-- | __HTML:__
--      @[script data escape start state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state)@
-- 
-- The parsing instructions for after reading @"\<"@ when the next character is
-- a letter in the 'ScriptDataEscapedState' section of the state machine.
tokenScriptDataDoubleEscapeStart :: Tokenizer [TokenizerOutput Token]
tokenScriptDataDoubleEscapeStart = tokenScriptDataDoubleEscapeStart' >>= \t' ->
    case map toAsciiLower $ tokenizedOut t' of
        "script" -> do
            recovery <- map unpackToken <$> output t'
            tokenizers (Just $ recovery ++ [([EOFInScriptHtmlCommentLikeText], EndOfStream)])
                [ ifsChar isAsciiWhitespace $ escape t'
                , ifsChar (== '/') $ escape t'
                , ifsChar (== '>') $ escape t'
                , elsePush_ $ output t'
                ]
        _ -> output t'
  where escape t' c = do
            changeState ScriptDataDoubleEscapedState
            cs <- finalStateList Nothing <$> output t'
            c' <- emit' ([], Character c)
            cs' <- tokenScriptDataDoubleEscaped
            return $ cs ++ c' ++ cs'
        output t' = case tokenizedOut t' of
            [] -> case tokenizedErrs t' of
                [] -> return []
                errs -> consTokenErrorsList errs <$> dispatcher
            (c:cs) -> emits (tokenizedState t') $
                (tokenizedErrs t', Character c) : [([], Character c') | c' <- cs]
        emits state ts = finalStateList state <$> mapM emit ts
        unpackToken t' = (tokenizedErrs t', tokenizedOut t')

-- | Loop within the __HTML__ @[script data double escape start state]
-- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state)@
-- to read the name of the tag.
tokenScriptDataDoubleEscapeStart' :: Tokenizer (TokenizerOutput String)
tokenScriptDataDoubleEscapeStart' = tokenizer (Just ([], ""))
    [ ifChar isAsciiAlpha $ \c -> consOut c <$> tokenScriptDataDoubleEscapeStart'
    , elsePush_ $ packToken ([], "")
    ]

-- | __HTML:__
--      @[script data double escaped state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state)@
-- 
-- The parsing instructions rooted in the 'ScriptDataDoubleEscapedState'
-- section of the state machine.
tokenScriptDataDoubleEscaped :: Tokenizer [TokenizerOutput Token]
tokenScriptDataDoubleEscaped = tokenizers (Just [([EOFInScriptHtmlCommentLikeText], EndOfStream)])
    [ ifs_ (== '-') $ consEmit ([], Character '-') tokenScriptDataDoubleEscapedDash
    , ifs_ (== '<') $ consEmit ([], Character '<') tokenScriptDataDoubleEscapedLessThanSign
    , ifs_ (== '\NUL') $ emit' ([UnexpectedNullCharacter], Character replacementChar)
    , elsesChar $ \c -> do
        e <- A.optional end
        consEmit ([], Character c) $ if Y.isJust e
            then emit' ([EOFInScriptHtmlCommentLikeText], EndOfStream)
            else return []
    ]

-- | __HTML:__
--      @[script data double escaped dash state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-double-dash-state)@
-- 
-- The parsing instructions for after reading @"-"@ in the
-- 'ScriptDataDoubleEscapedState' section of the state machine.
tokenScriptDataDoubleEscapedDash :: Tokenizer [TokenizerOutput Token]
tokenScriptDataDoubleEscapedDash = tokenizers (Just [([EOFInScriptHtmlCommentLikeText], EndOfStream)])
    [ ifs_ (== '-') $ consEmit ([], Character '-') tokenScriptDataDoubleEscapedDashDash
    , ifs_ (== '<') $ consEmit ([], Character '<') tokenScriptDataDoubleEscapedLessThanSign
    , ifs_ (== '\NUL') $ emit' ([UnexpectedNullCharacter], Character replacementChar)
    , elsesChar $ \c -> emit' ([], Character c)
    ]

-- | __HTML:__
--      @[script data double escaped dash dash state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-double-dash-dash-state)@
-- 
-- The parsing instructions for after reading @"--"@ in the
-- 'ScriptDataDoubleEscapedState' section of the state machine.
tokenScriptDataDoubleEscapedDashDash :: Tokenizer [TokenizerOutput Token]
tokenScriptDataDoubleEscapedDashDash = tokenizers (Just [([EOFInScriptHtmlCommentLikeText], EndOfStream)])
    [ ifs_ (== '-') $ consEmit ([], Character '-') tokenScriptDataDoubleEscapedDashDash
    , ifs_ (== '<') $ consEmit ([], Character '<') tokenScriptDataDoubleEscapedLessThanSign
    , ifs_ (== '>') $ changeState ScriptDataState *> emit' ([], Character '>')
    , ifs_ (== '\NUL') $ emit' ([UnexpectedNullCharacter], Character replacementChar)
    , elsesChar $ \c -> emit' ([], Character c)
    ]

-- | __HTML:__
--      @[script data double escaped less-than state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-state)@
-- 
-- The parsing instructions for after reading @"\<"@ in the
-- 'ScriptDataDoubleEscapedState' section of the state machine.
tokenScriptDataDoubleEscapedLessThanSign :: Tokenizer [TokenizerOutput Token]
tokenScriptDataDoubleEscapedLessThanSign = tokenizers (Just [([EOFInScriptHtmlCommentLikeText], EndOfStream)])
    [ ifs_ (== '/') $ consEmit ([], Character '/') tokenScriptDataDoubleEscapeEnd
    , elsePush_ $ return []
    ]

-- | __HTML:__
--      @[script data double escape end state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state)@
-- 
-- The parsing instructions for after reading @"\</"@ in the
-- 'ScriptDataDoubleEscapedState' section of the state machine.
tokenScriptDataDoubleEscapeEnd :: Tokenizer [TokenizerOutput Token]
tokenScriptDataDoubleEscapeEnd = tokenScriptDataDoubleEscapeEnd' >>= \t' -> do
    case map toAsciiLower $ tokenizedOut t' of
        "script" -> do
            recovery <- map unpackToken <$> output t'
            tokenizers (Just $ recovery ++ [([EOFInScriptHtmlCommentLikeText], EndOfStream)])
                [ ifsChar isAsciiWhitespace $ escape t'
                , ifsChar (== '/') $ escape t'
                , ifsChar (== '>') $ escape t'
                , elsePush_ $ output t'
                ]
        _ -> output t'
  where escape t' c = do
            changeState ScriptDataEscapedState
            cs <- finalStateList Nothing <$> output t'
            c' <- emit' ([], Character c)
            return $ cs ++ c'
        output t' = do
            buffer <- emits (tokenizedState t') [([], Character c) | c <- tokenizedOut t']
            case tokenizedErrs t' of
                [] -> return buffer
                errs -> consTokenErrorsList errs <$> case buffer of
                    [] -> dispatcher
                    _ -> return buffer
        emits state ts = finalStateList state <$> mapM emit ts
        unpackToken t' = (tokenizedErrs t', tokenizedOut t')


-- | Loop within the __HTML__ @[script data double escape end state]
-- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state)@
-- to read the name of the tag.
tokenScriptDataDoubleEscapeEnd' :: Tokenizer (TokenizerOutput String)
tokenScriptDataDoubleEscapeEnd' = tokenizer (Just ([], ""))
    [ ifChar isAsciiAlpha $ \c -> consOut c <$> tokenScriptDataDoubleEscapeEnd'
    , elsePush_ $ packToken ([], "")
    ]