{-| Description: Tokenization rules for characters within @\@ ... @\@ sections in HTML comments in script data. Copyright: (c) 2020-2021 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: stable Portability: portable -} module Web.Mangrove.Parse.Tokenize.ScriptDataDoubleEscaped ( tokenScriptDataDoubleEscaped , tokenScriptDataDoubleEscapeStart ) where import qualified Control.Applicative as A import qualified Data.Maybe as Y import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Tokenize.Common import Web.Willow.Common.Encoding.Character import Web.Willow.Common.Parser import {-# SOURCE #-} Web.Mangrove.Parse.Tokenize.Dispatcher -- | __HTML:__ -- @[script data escape start state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state)@ -- -- The parsing instructions for after reading @"\<"@ when the next character is -- a letter in the 'ScriptDataEscapedState' section of the state machine. tokenScriptDataDoubleEscapeStart :: Tokenizer [TokenizerOutput Token] tokenScriptDataDoubleEscapeStart = tokenScriptDataDoubleEscapeStart' >>= \t' -> case map toAsciiLower $ tokenizedOut t' of "script" -> do recovery <- map unpackToken <$> output t' tokenizers (Just $ recovery ++ [([EOFInScriptHtmlCommentLikeText], EndOfStream)]) [ ifsChar isAsciiWhitespace $ escape t' , ifsChar (== '/') $ escape t' , ifsChar (== '>') $ escape t' , elsePush_ $ output t' ] _ -> output t' where escape t' c = do changeState ScriptDataDoubleEscapedState cs <- finalStateList Nothing <$> output t' c' <- emit' ([], Character c) cs' <- tokenScriptDataDoubleEscaped return $ cs ++ c' ++ cs' output t' = case tokenizedOut t' of [] -> case tokenizedErrs t' of [] -> return [] errs -> consTokenErrorsList errs <$> dispatcher (c:cs) -> emits (tokenizedState t') $ (tokenizedErrs t', Character c) : [([], Character c') | c' <- cs] emits state ts = finalStateList state <$> mapM emit ts unpackToken t' = (tokenizedErrs t', tokenizedOut t') -- | Loop within the __HTML__ @[script data double escape start state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state)@ -- to read the name of the tag. tokenScriptDataDoubleEscapeStart' :: Tokenizer (TokenizerOutput String) tokenScriptDataDoubleEscapeStart' = tokenizer (Just ([], "")) [ ifChar isAsciiAlpha $ \c -> consOut c <$> tokenScriptDataDoubleEscapeStart' , elsePush_ $ packToken ([], "") ] -- | __HTML:__ -- @[script data double escaped state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state)@ -- -- The parsing instructions rooted in the 'ScriptDataDoubleEscapedState' -- section of the state machine. tokenScriptDataDoubleEscaped :: Tokenizer [TokenizerOutput Token] tokenScriptDataDoubleEscaped = tokenizers (Just [([EOFInScriptHtmlCommentLikeText], EndOfStream)]) [ ifs_ (== '-') $ consEmit ([], Character '-') tokenScriptDataDoubleEscapedDash , ifs_ (== '<') $ consEmit ([], Character '<') tokenScriptDataDoubleEscapedLessThanSign , ifs_ (== '\NUL') $ emit' ([UnexpectedNullCharacter], Character replacementChar) , elsesChar $ \c -> do e <- A.optional end consEmit ([], Character c) $ if Y.isJust e then emit' ([EOFInScriptHtmlCommentLikeText], EndOfStream) else return [] ] -- | __HTML:__ -- @[script data double escaped dash state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-double-dash-state)@ -- -- The parsing instructions for after reading @"-"@ in the -- 'ScriptDataDoubleEscapedState' section of the state machine. tokenScriptDataDoubleEscapedDash :: Tokenizer [TokenizerOutput Token] tokenScriptDataDoubleEscapedDash = tokenizers (Just [([EOFInScriptHtmlCommentLikeText], EndOfStream)]) [ ifs_ (== '-') $ consEmit ([], Character '-') tokenScriptDataDoubleEscapedDashDash , ifs_ (== '<') $ consEmit ([], Character '<') tokenScriptDataDoubleEscapedLessThanSign , ifs_ (== '\NUL') $ emit' ([UnexpectedNullCharacter], Character replacementChar) , elsesChar $ \c -> emit' ([], Character c) ] -- | __HTML:__ -- @[script data double escaped dash dash state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-double-dash-dash-state)@ -- -- The parsing instructions for after reading @"--"@ in the -- 'ScriptDataDoubleEscapedState' section of the state machine. tokenScriptDataDoubleEscapedDashDash :: Tokenizer [TokenizerOutput Token] tokenScriptDataDoubleEscapedDashDash = tokenizers (Just [([EOFInScriptHtmlCommentLikeText], EndOfStream)]) [ ifs_ (== '-') $ consEmit ([], Character '-') tokenScriptDataDoubleEscapedDashDash , ifs_ (== '<') $ consEmit ([], Character '<') tokenScriptDataDoubleEscapedLessThanSign , ifs_ (== '>') $ changeState ScriptDataState *> emit' ([], Character '>') , ifs_ (== '\NUL') $ emit' ([UnexpectedNullCharacter], Character replacementChar) , elsesChar $ \c -> emit' ([], Character c) ] -- | __HTML:__ -- @[script data double escaped less-than state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-state)@ -- -- The parsing instructions for after reading @"\<"@ in the -- 'ScriptDataDoubleEscapedState' section of the state machine. tokenScriptDataDoubleEscapedLessThanSign :: Tokenizer [TokenizerOutput Token] tokenScriptDataDoubleEscapedLessThanSign = tokenizers (Just [([EOFInScriptHtmlCommentLikeText], EndOfStream)]) [ ifs_ (== '/') $ consEmit ([], Character '/') tokenScriptDataDoubleEscapeEnd , elsePush_ $ return [] ] -- | __HTML:__ -- @[script data double escape end state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state)@ -- -- The parsing instructions for after reading @"\>= \t' -> do case map toAsciiLower $ tokenizedOut t' of "script" -> do recovery <- map unpackToken <$> output t' tokenizers (Just $ recovery ++ [([EOFInScriptHtmlCommentLikeText], EndOfStream)]) [ ifsChar isAsciiWhitespace $ escape t' , ifsChar (== '/') $ escape t' , ifsChar (== '>') $ escape t' , elsePush_ $ output t' ] _ -> output t' where escape t' c = do changeState ScriptDataEscapedState cs <- finalStateList Nothing <$> output t' c' <- emit' ([], Character c) return $ cs ++ c' output t' = do buffer <- emits (tokenizedState t') [([], Character c) | c <- tokenizedOut t'] case tokenizedErrs t' of [] -> return buffer errs -> consTokenErrorsList errs <$> case buffer of [] -> dispatcher _ -> return buffer emits state ts = finalStateList state <$> mapM emit ts unpackToken t' = (tokenizedErrs t', tokenizedOut t') -- | Loop within the __HTML__ @[script data double escape end state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state)@ -- to read the name of the tag. tokenScriptDataDoubleEscapeEnd' :: Tokenizer (TokenizerOutput String) tokenScriptDataDoubleEscapeEnd' = tokenizer (Just ([], "")) [ ifChar isAsciiAlpha $ \c -> consOut c <$> tokenScriptDataDoubleEscapeEnd' , elsePush_ $ packToken ([], "") ]