{-| Description: Tokenization rules for characters comprising a markup element. Copyright: (c) 2020-2021 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: stable Portability: portable -} module Web.Mangrove.Parse.Tokenize.Tag ( tokenTagOpen , tokenBeforeAttributeName , tokenSelfClosingStartTag , tokenAppropriateEndTagLessThanSign , tokenAppropriateEndTagOpen -- * Types , TagParams' ( .. ) , emptyTagData , packTagToken ) where import qualified Control.Applicative as A import qualified Control.Monad.Trans.State as N.S import qualified Data.Bifunctor as F.B import qualified Data.List as L import qualified Data.HashMap.Strict as M import qualified Data.Maybe as Y import qualified Data.Text as T import Data.Functor ( ($>) ) import Web.Willow.DOM import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Tokenize.Common import Web.Mangrove.Parse.Tokenize.Character import Web.Mangrove.Parse.Tokenize.Comment import Web.Mangrove.Parse.Tokenize.Doctype import Web.Willow.Common.Encoding.Character import Web.Willow.Common.Parser.Util import {-# SOURCE #-} Web.Mangrove.Parse.Tokenize.Dispatcher -- | An intermediate form of 'TagParams' with unpacked parameter types to -- facilitate construction. Specifically, 'Char'-by-'Char' construction of a -- 'String' followed by a single 'T.pack' is much faster than repeated @O(n)@ -- calls to 'T.cons'. Values may be easily instantiated as updates to -- @emptyTagData@. data TagParams' = TagParams' { tagName' :: String -- ^ See 'tagName'. , tagIsSelfClosing' :: Bool -- ^ See 'tagIsSelfClosing'. , tagAttributes' :: [(String, String)] -- ^ See 'tagAttributes'. } deriving ( Eq, Show, Read ) -- | A sane default collection for easy record initialization; namely, -- 'null's and 'False'. emptyTagData :: TagParams' emptyTagData = TagParams' { tagName' = "" , tagIsSelfClosing' = False , tagAttributes' = [] } -- | Wrap a given collection of data into the payload of a 'StartTag' or -- 'EndTag', according to the given constructor. packTagToken :: (TagParams -> Token) -> TokenizerOutput (Maybe TagParams') -> Tokenizer [TokenizerOutput Token] packTagToken toToken d' = case tokenizedOut d' of Just tagData -> do let tok = toToken $ emptyTagParams { tagName = T.pack $ tagName' tagData , tagIsSelfClosing = tagIsSelfClosing' tagData , tagAttributes = attrs } (attrErrs, attrs) = packAttributes $ tagAttributes' tagData tok' <- emit (tokenizedErrs d' ++ attrErrs, tok) return [tok' { tokenizedState = tokenizedState d' }] Nothing -> case tokenizedErrs d' of [] -> return [] errs -> do atEOS <- atEndOfStream <$> N.S.get consTokenErrorsList errs <$> if atEOS then Y.fromMaybe [] <$> A.optional dispatcher else dispatcher -- | Wrap a loose collection of unpacked 'String's into a form suitable for -- representing the 'tagAttributes' of a markup element, removing all but the -- first occurrence of any single name. -- -- Note that any nameless attributes at the head of the list are dropped, to -- work around the edge case where @newAttr@ is called immediately following -- the tag name; this shouldn't cause any issue as the specification doesn't -- seem to output any such attributes in the first place. packAttributes :: [(String, String)] -> ([ParseError], M.HashMap T.Text T.Text) packAttributes = foldr filterAttr ([], M.empty) . L.sortOn fst . dropWhile (null . fst) -- | Add a single attribute to the processed collection, potentially throwing a -- 'DuplicateAttribute' warning (with the old value) if one with the same name -- already exists. filterAttr :: (String, String) -> ([ParseError], M.HashMap T.Text T.Text) -> ([ParseError], M.HashMap T.Text T.Text) filterAttr (n, v) (errs, as) | M.null as = (errs, M.singleton name value) | otherwise = case M.lookup name as of Nothing -> (errs, M.insert name value as) Just value' -> (DuplicateAttribute (name, value') : errs, M.insert name value as) where name = T.pack n value = T.pack v -- | Finalize the currently "active" attribute, so that any further calls to, -- e.g., @consAttrValue@ will be applied to a different (initially empty) pair -- instead. newAttr :: TokenizerOutput (Maybe TagParams') -> TokenizerOutput (Maybe TagParams') newAttr = fmap . fmap $ \d -> d { tagAttributes' = case tagAttributes' d of [] -> [] as@(("", ""):_) -> as as -> ("", "") : as } -- | Prepend a character to the unpacked key of the currently "active" metadata -- pair in a markup element. consAttrName :: Char -> TokenizerOutput (Maybe TagParams') -> TokenizerOutput (Maybe TagParams') consAttrName c = fmap . fmap $ \d -> d { tagAttributes' = consAttrName' $ tagAttributes' d } where consAttrName' [] = [([c], "")] consAttrName' (a:as) = F.B.first (c :) a : as -- | Prepend a character to the unpacked value of the currently "active" -- metadata pair in a markup element. consAttrValue :: Char -> TokenizerOutput (Maybe TagParams') -> TokenizerOutput (Maybe TagParams') consAttrValue c = consAttrValueString [c] -- | Prepend a character sequence to the unpacked value of the currently -- "active" metadata pair in a markup element. consAttrValueString :: String -> TokenizerOutput (Maybe TagParams') -> TokenizerOutput (Maybe TagParams') consAttrValueString str = fmap . fmap $ \d -> d { tagAttributes' = consAttrValueString' $ tagAttributes' d } where consAttrValueString' [] = [("", str)] consAttrValueString' (a:as) = fmap (str ++) a : as -- | __HTML:__ -- @[tag open state] -- (https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state)@ -- -- The parsing instructions for after reading @"\<"@ in a section of the state -- machine which allows markup declarations. tokenTagOpen :: Tokenizer [TokenizerOutput Token] tokenTagOpen = tokenizers (Just [([EOFBeforeTagName], Character '<')]) [ ifs_ (== '!') tokenMarkupDeclarationOpen , ifs_ (== '/') tokenEndTagOpen , ifPush_ isAsciiAlpha $ tokenTagName >>= packTagToken StartTag , ifPush_ (== '?') $ (: []) . packCommentToken . consTokenError UnexpectedQuestionMarkInsteadOfTagName <$> tokenBogusComment , elsePush_ $ changeState DataState *> emit' ([InvalidFirstCharacterOfTagName], Character '<') ] -- | __HTML:__ -- @[end tag open state] -- (https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state)@ -- -- The parsing instructions for after reading @"\>= packTagToken EndTag , ifs_ (== '>') $ do changeState DataState consTokenErrorsList [MissingEndTagName] <$> dispatcher , elsePush_ $ (: []) . packCommentToken . consTokenError InvalidFirstCharacterOfTagName <$> tokenBogusComment ] where recovery = [ ([], Character '<') , ([], Character '/') , ([EOFBeforeTagName], EndOfStream) ] -- | __HTML:__ -- @[tag name state] -- (https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state)@ -- -- The parsing instructions for after reading @"\<"@ followed by a letter in a -- section of the state machine which allows markup declarations. tokenTagName :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenTagName = tokenizer (Just ([EOFInTag], Nothing)) [ if_ isAsciiWhitespace tokenBeforeAttributeName , if_ (== '/') tokenSelfClosingStartTag , if_ (== '>') $ packToken ([], Just emptyTagData) , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consTagName replacementChar <$> tokenTagName , elseChar $ \c -> consTagName (toAsciiLower c) <$> tokenTagName ] where consTagName char = fmap . fmap $ \d -> d { tagName' = char : tagName' d } -- | __HTML:__ -- @[before attribute name state] -- (https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state)@ -- -- The parsing instructions for reading whitespace interspersed with key-value -- metadata pairs in the markup tag section of the state machine. tokenBeforeAttributeName :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenBeforeAttributeName = tokenizer (Just ([EOFInTag], Nothing)) [ if_ isAsciiWhitespace tokenBeforeAttributeName , ifPush_ (`elem` "/>") tokenAfterAttributeName , ifChar (== '=') $ \c -> consTokenError UnexpectedEqualsSignBeforeAttributeName . consAttrName c <$> tokenAttributeName , elsePush_ tokenAttributeName ] -- | __HTML:__ -- @[attribute name state] -- (https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state)@ -- -- The parsing instructions for reading the initial (key) part of a key-value -- metadata pair in the markup tag section of the state machine. tokenAttributeName :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenAttributeName = tokenizer (Just ([EOFInTag], Nothing)) [ ifPush_ (`elem` "\t\n\f />") tokenAfterAttributeName , if_ (== '=') tokenBeforeAttributeValue , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consAttrName replacementChar <$> tokenAttributeName , ifChar (`elem` "\"'<") $ \c -> consTokenError UnexpectedCharacterInAttributeName . consAttrName c <$> tokenAttributeName , elseChar $ \c -> consAttrName (toAsciiLower c) <$> tokenAttributeName ] -- | __HTML:__ -- @[after attribute name state] -- (https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state)@ -- -- The parsing instructions for reading whitespace after the initial (key) part -- of a key-value metadata pair in the markup tag section of the state machine. tokenAfterAttributeName :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenAfterAttributeName = tokenizer (Just ([EOFInTag], Nothing)) [ if_ isAsciiWhitespace $ fmap newAttr tokenAfterAttributeName , if_ (== '/') $ fmap newAttr tokenSelfClosingStartTag , if_ (== '=') tokenBeforeAttributeValue , if_ (== '>') $ changeState DataState *> packToken ([], Just emptyTagData) , elsePush_ $ fmap newAttr tokenAttributeName ] -- | __HTML:__ -- @[before attribute value state] -- (https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state)@ -- -- The parsing instructions for reading whitespace after @"="@ following the -- initial (key) part of a key-value metadata pair in the markup tag section of -- the state machine. tokenBeforeAttributeValue :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenBeforeAttributeValue = tokenizer (Just ([EOFInTag], Nothing)) [ if_ isAsciiWhitespace tokenBeforeAttributeValue , if_ (== '"') tokenAttributeValueDoubleQuoted , if_ (== '\'') tokenAttributeValueSingleQuoted , if_ (== '>') $ changeState DataState *> packToken ([MissingAttributeValue], Just emptyTagData) , elsePush_ tokenAttributeValueUnquoted ] -- | __HTML:__ -- @[attribute value (double-quoted) state] -- (https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted\)-state)@ -- -- The parsing instructions for reading the second (value) part of a key-value -- metadata pair following a @'"'@ in the markup tag section of the state -- machine. tokenAttributeValueDoubleQuoted :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenAttributeValueDoubleQuoted = tokenizer (Just ([EOFInTag], Nothing)) [ if_ (== '"') tokenAfterAttributeValueQuoted , if_ (== '&') $ tokenAttributeCharacterReference tokenAttributeValueDoubleQuoted , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consAttrValue replacementChar <$> tokenAttributeValueDoubleQuoted , elseChar $ \c -> consAttrValue c <$> tokenAttributeValueDoubleQuoted ] -- | __HTML:__ -- @[attribute value (single-quoted) state] -- (https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted\)-state)@ -- -- The parsing instructions for reading the second (value) part of a key-value -- metadata pair following a @'\''@ in the markup tag section of the state -- machine. tokenAttributeValueSingleQuoted :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenAttributeValueSingleQuoted = tokenizer (Just ([EOFInTag], Nothing)) [ if_ (== '\'') tokenAfterAttributeValueQuoted , if_ (== '&') $ tokenAttributeCharacterReference tokenAttributeValueSingleQuoted , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consAttrValue replacementChar <$> tokenAttributeValueSingleQuoted , elseChar $ \c -> consAttrValue c <$> tokenAttributeValueSingleQuoted ] -- | __HTML:__ -- @[attribute value (unquoted) state] -- (https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted\)-state)@ -- -- The parsing instructions for reading the second (value) part of a key-value -- metadata pair directly following a @'='@ in the markup tag section of the -- state machine. tokenAttributeValueUnquoted :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenAttributeValueUnquoted = tokenizer (Just ([EOFInTag], Nothing)) [ if_ isAsciiWhitespace $ fmap newAttr tokenBeforeAttributeName , if_ (== '&') $ tokenAttributeCharacterReference tokenAttributeValueUnquoted , if_ (== '>') $ changeState DataState *> packToken ([], Just emptyTagData) , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consAttrValue replacementChar <$> tokenAttributeValueUnquoted , ifChar (`elem` "\"'<=`") $ \c -> consTokenError UnexpectedCharacterInUnquotedAttributeValue . consAttrValue c <$> tokenAttributeValueUnquoted , elseChar $ \c -> consAttrValue c <$> tokenAttributeValueUnquoted ] -- | Resolve a character reference occurring within the second (value) part of -- a key-value metadata pair, and continue parsing using the given tokenizer. tokenAttributeCharacterReference :: Tokenizer (TokenizerOutput (Maybe TagParams')) -> Tokenizer (TokenizerOutput (Maybe TagParams')) tokenAttributeCharacterReference resume = do ref <- tokenCharacterReference True tag <- resume return . consTokenErrors (tokenizedErrs ref) $ consAttrValueString (tokenizedOut ref) tag -- | __HTML:__ -- @[after attribute value (quoted) state] -- (https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted\)-state)@ -- -- The parsing instructions for after reading a @'"'@ or @'\''@ matching the -- opening mark of the second (value) part of a key-value metadata pair in the -- markup tag section of the state machine. tokenAfterAttributeValueQuoted :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenAfterAttributeValueQuoted = tokenizer (Just ([EOFInTag], Nothing)) [ if_ isAsciiWhitespace $ fmap newAttr tokenBeforeAttributeName , if_ (== '/') tokenSelfClosingStartTag , if_ (== '>') $ changeState DataState *> packToken ([], Just emptyTagData) , elsePush_ $ consTokenError MissingWhitespaceBetweenAttributes . newAttr <$> tokenBeforeAttributeName ] -- | __HTML:__ -- @[self closing start tag state] -- (https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state)@ -- -- The parsing instructions for after reading @"/"@ in the markup tag section -- of the state machine. tokenSelfClosingStartTag :: Tokenizer (TokenizerOutput (Maybe TagParams')) tokenSelfClosingStartTag = tokenizer (Just ([EOFInTag], Nothing)) [ if_ (== '>') $ changeState DataState *> packToken ([], Just $ emptyTagData { tagIsSelfClosing' = True }) , elsePush_ $ consTokenError UnexpectedSolidusInTag <$> tokenBeforeAttributeName ] -- | __HTML:__ -- @[bogus comment state] -- (https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state)@ -- -- The parsing instructions for after reading @"\') $ changeState DataState *> packToken ([], "") , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consOut replacementChar <$> tokenBogusComment , elseChar $ \c -> consOut c <$> tokenBogusComment ] -- | __HTML:__ -- @[markup declaration open state] -- (https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state)@ -- -- The parsing instructions for after reading @"\ return commentParser , chunk' toAsciiUpper "DOCTYPE" *> return doctypeParser , chunk' id "[CDATA[" *> return cdataParser , return failureParser ] parserChoice where commentParser = fmap (pure . packCommentToken) tokenCommentStart doctypeParser = fmap pure tokenDoctype cdataParser = do currentNamespace <- currentNodeNamespace <$> N.S.get case currentNamespace of Just ns | ns /= htmlNamespace -> changeState CDataState $> [] _ -> pure . packCommentToken . consTokenError CDataInHtmlContent . consOuts "[CDATA[" <$> tokenBogusComment failureParser = pure . packCommentToken . consTokenError IncorrectlyOpenedComment <$> tokenBogusComment -- | __HTML:__ -- -- * [@RAWTEXT less-than sign state@] -- (https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state) -- * [@RCDATA less-than sign state@] -- (https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state) -- -- The parsing instructions for after reading @"\<"@ in sections of the state -- machine which may be closed by tag, and which emit only 'Character' tokens. tokenAppropriateEndTagLessThanSign :: Tokenizer [TokenizerOutput Token] tokenAppropriateEndTagLessThanSign = tokenizers (Just [([], Character '<')]) [ ifs_ (== '/') tokenAppropriateEndTagOpen , elsePush_ $ emit' ([], Character '<') ] -- | __HTML:__ -- -- * [@RAWTEXT end tag open state@] -- (https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state) -- * [@RCDATA end tag open state@] -- (https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state) -- * [@script data end tag open state@] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state) -- * [@script data escaped end tag open state@] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state) -- -- The parsing instructions for after reading @"\>= \t' -> do close <- appropriateEndTag (map toAsciiLower $ tokenizedOut t') if close then tokenizers (Just $ fallback t') [ ifs_ isAsciiWhitespace $ tokenBeforeAttributeName >>= packName (tokenizedOut t') , ifs_ (== '/') $ tokenSelfClosingStartTag >>= packName (tokenizedOut t') , ifs_ (== '>') $ do changeState DataState out <- packName (tokenizedOut t') $ t' { tokenizedOut = Just emptyTagData } -- 'packName' keeps the state of the input (i.e. /before/ the -- closing @>@), and so that needs to be cleared so 'ifs_' will -- assign the state of the input token (/after/ the closing -- @>@). return $ map continueState out , elsePush_ $ emits (tokenizedState t') (anythingElse (tokenizedErrs t') (tokenizedOut t')) ] else emits (tokenizedState t') (fallback t') where fallback t' = anythingElse (tokenizedErrs t') (tokenizedOut t') packName buffer = packTagToken $ EndTag . packName' (T.pack buffer) packName' buffer tagData = tagData { tagName = T.map toAsciiLower buffer } anythingElse errs buffer = [ ([], Character '<') , ([], Character '/') ] ++ case buffer of [] -> [] (c:cs) -> (errs, Character c) : [([], Character c') | c' <- cs] emits state ts = finalStateList state <$> mapM emit ts tokenAppropriateEndTagName = tokenizer (Just ([], "")) [ ifChar isAsciiAlpha $ \c -> consOut c <$> tokenAppropriateEndTagName , elsePush_ $ packToken ([], "") ]