{-# LANGUAGE OverloadedStrings #-} {-| Description: Functions and objects used to build the tree construction parser. Copyright: (c) 2020-2021 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: provisional Portability: portable This module provides the data structures and common functions used in the first half of this implementation's split __[HTML](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction)__ tree construction algorithm; for more discussion on the structure, see the documentation on "Web.Mangrove.Parse.Tree.Patch". In particular, these objects handle the state transitions and token processing which form the foundation of that parser on which the 'Web.Mangrove.Parse.Tree.Patch.Patch' generation will be able to build. -} module Web.Mangrove.Parse.Tree.Common ( -- * Types -- ** Tree construction -- *** Parser TreeBuilder , TreeState ( .. ) , TreeParserState ( .. ) , defaultTreeState , NodeIndex , InsertionMode ( .. ) -- *** Data -- **** Elements , nodeIsElement , nodeIsSpecial , ElementParams ( .. ) , emptyElementParams , packNodeData , scopeElements , specialElements -- **** Attributes , AttributeParams ( .. ) , emptyAttributeParams , adjustForeignAttributes , adjustMathMLAttributes , adjustSvgAttributes -- **** Document type declarations , DocumentTypeParams ( .. ) , emptyDocumentTypeParams , QuirksMode ( .. ) -- ** Tokenization -- *** Parser , TokenizerState ( .. ) , CurrentTokenizerState ( .. ) -- *** Data , Token ( .. ) , TreeInput ( .. ) , TokenizerOutputState , tokenRemainder , dummyToken , dummyStateToken , mapTokenState , mapTokenState' , TagParams ( .. ) , emptyTagParams -- *** Extraction , tokenCharacter , tokenDoctype , tokenDocumentType , tokenTag , tokenElement -- * Parser building -- ** Token matching , isEOF , isCharacter , isNull , isWhitespace , isComment , isDoctype , isAnyStartTag , isAnyEndTag , isStartTag , isEndTag -- ** State modification , switchMode , resetMode , setFramesetNotOk , insertFormattingMarker , clearFormattingElements , pushTemplateMode , popTemplateMode , resetInsertionMode , resetInsertionMode' -- ** Current node -- *** Retrieval , currentNode , currentNodeIndex , adjustedCurrentNode -- *** Namespace boundaries , atHtmlIntegration , atMathMLIntegration , isMathMLAnnotationXml -- ** Ancestor testing , inFragment , inIFrameSrcDoc , hasOpenElement , hasOpenElementExcept -- *** Scope , hasInScope , hasIndexInScope , hasInButtonScope , hasInListItemScope , hasInTableScope , hasInSelectScope ) where import qualified Control.Monad.Trans.State as N.S import qualified Data.Bifunctor as F.B import qualified Data.ByteString as BS import qualified Data.HashMap.Strict as M import qualified Data.Maybe as Y import qualified Data.Text as T import qualified Numeric.Natural as Z import Web.Willow.DOM hiding ( Tree ( .. ) , Node ( .. ) ) import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Tokenize.Common import Web.Willow.Common.Encoding.Character import Web.Willow.Common.Parser -- | Parser combinators written over the output of the -- "Web.Mangrove.Parse.Tokenize" stage, transforming it into a sequence of -- folding instruction 'Web.Mangrove.Parse.Tree.Patch.Patch'es. type TreeBuilder out = StateParser TreeParserState [TreeInput] out -- | The collection of data returned by the "Web.Mangrove.Parse.Tokenize" -- stage, and so comprising the input to the tree construction parser. -- Values may be easily instantiated through 'dummyToken' or dummyStateToken'. data TreeInput = TreeInput { tokenErrs :: [ParseError] -- ^ Any authoring errors detected during decoding and tokenization. , tokenOut :: Token -- ^ The token itself. , tokenState :: TokenizerOutputState -- ^ The data required to resume decoding immediately following the -- value, if possible. See also 'decodedRemainder'. } deriving ( Eq, Show, Read ) -- | The unparsed portion of the binary stream, /after/ parsing the associated -- token. See also 'tokenState'. tokenRemainder :: TreeInput -> Maybe BS.ByteString tokenRemainder = fmap snd . tokenState -- | All data with which to re-initialize the tokenizer, to resume as if the -- state machine transition hadn't been interrupted. If 'Nothing', the -- associated 'Token' was emitted in the 'init' of several at once; in this -- case, the tokenizer can't be re-entered in exactly the same place with the -- data wrapped in this type, and so the stream must continue to be processed -- until the first 'Just' value. type TokenizerOutputState = Maybe (TokenizerState, BS.ByteString) -- | Generate a token /a priori/ in the tree construction stage to pass to a -- function expecting the raw tokenizer output. See 'dummyStateToken' if a -- known 'TokenizerOutputState' must be associated as well. dummyToken :: [ParseError] -> Token -> TreeInput dummyToken errs t = dummyStateToken errs t Nothing -- | Generate a token /a priori/ in the tree construction stage to pass to a -- function expecting the raw tokenizer output, attaching a specific resume -- state. For most of the standard parsers, 'dummyToken' should be used -- instead. dummyStateToken :: [ParseError] -> Token -> TokenizerOutputState -> TreeInput dummyStateToken errs t state = TreeInput { tokenErrs = errs , tokenOut = t , tokenState = state } -- | Extract a single 'Char' from an input value. Returns Nothing if the inner -- 'Token' is not a 'Character'. tokenCharacter :: TreeInput -> Maybe Char tokenCharacter t' = case tokenOut t' of Character c -> Just c _ -> Nothing -- | Extract the data comprising a DOCTYPE declaration from an input value. -- Returns an empty collection if the inner 'Token' is not a 'Doctype'. tokenDoctype :: TreeInput -> DoctypeParams tokenDoctype t' = case tokenOut t' of Doctype d -> d _ -> emptyDoctypeParams -- | Extract the data comprising a DOCTYPE declaration from an input value, in -- a form suitable for 'ParseError's. Returns an empty collection if the inner -- 'Token' is not a 'Doctype'. tokenDocumentType :: TreeInput -> DocumentTypeParams tokenDocumentType t' = emptyDocumentTypeParams { documentTypeName = Y.fromMaybe T.empty $ doctypeName d , documentTypePublicId = Y.fromMaybe T.empty $ doctypePublicId d , documentTypeSystemId = Y.fromMaybe T.empty $ doctypeSystemId d } where d = tokenDoctype t' -- | Extract the data comprising a markup element from an input value. Returns -- 'emptyTagParams' if the inner 'Token' is neither a 'StartTag' nor an -- 'EndTag'. tokenTag :: TreeInput -> TagParams tokenTag t' = case tokenOut t' of StartTag d -> d EndTag d -> d _ -> emptyTagParams -- | Extract the data comprising a markup element from an input value, in a -- form suitable for 'ParseError's. Returns an empty collection in the -- 'htmlNamespace' if the inner 'Token' is neither a 'StartTag' nor an -- 'EndTag'. tokenElement :: TreeInput -> ElementParams tokenElement = packNodeData (Just htmlNamespace) . tokenTag -- | Modify the state of the tokenizer as given by one of the wrapped tokens -- used as input to the tree construction parsers. mapTokenState :: TreeInput -> (TokenParserState -> TokenParserState) -> TreeInput mapTokenState t' f = t' { tokenState = F.B.first f' <$> tokenState t' } where f' state = state { tokenParserState = f $ tokenParserState state } -- | Modify the semi-opaque wrapped state of the tokenizer, within one of the -- wrapped tokens used as input to the tree construction parsers. mapTokenState' :: TreeInput -> (TokenizerOutputState -> TokenizerOutputState) -> TreeInput mapTokenState' t' f = t' { tokenState = f $ tokenState t' } -- | Lift the tag data collected by the tokenizer into the namespaced datatype -- of the DOM tree. packNodeData :: Maybe Namespace -> TagParams -> ElementParams packNodeData ns d = emptyElementParams { elementName = tagName d , elementNamespace = ns , elementAttributes = M.fromList . map packAttr . M.toList $ tagAttributes d } where packAttr (n, v) = ((Nothing, n), (Nothing, v)) -- | Type-level clarification for an identifier uniquely identifying each -- element in the document tree, assigned in a rough tree order. -- -- The specification assumes a reference-based and mutable memory model, where -- each element in, e.g., the stack of open elements not only describes the -- shape of the node, but also is distinct from all other nodes with the same -- data. Haskell's memory is much less accessible and more likely to be -- shared, so an extra datapoint needs to be carried along. type NodeIndex = Z.Natural -- | The collection of data required to extract a list of semantic atoms from a -- binary document stream. Values may be easily instantiated as updates to -- 'defaultTreeState'. data TreeState = TreeState { treeParserState :: TreeParserState -- ^ The state of the current 'Web.Mangrove.Parse.Tree.tree' stage. , tokenizerState :: TokenizerState -- ^ The state of the previous tokenization stage. Note that the -- high-level conceptual view of the parser stack is of each stage -- moving along the 'BS.ByteString' as a (more or less) unified -- front, rather than each stage independently running over the output -- of the previous. } deriving ( Eq, Show, Read ) -- | All the data which needs to be tracked for correct behaviour in the tree -- construction stage. data TreeParserState = TreeParserState { insertionMode :: InsertionMode -- ^ __HTML:__ -- @[insertion mode] -- (https://html.spec.whatwg.org/multipage/parsing.html#insertion-mode)@ -- -- The set of rules currently active in the state machine. , originalInsertionMode :: Maybe InsertionMode -- ^ __HTML:__ -- @[original insertion mode] -- (https://html.spec.whatwg.org/multipage/parsing.html#original-insertion-mode)@ -- -- Which part of the state machine should be returned to after exiting -- the 'InText' insertion mode. , templateInsertionModes :: [InsertionMode] -- ^ __HTML:__ -- @[stack of template insertion modes] -- (https://html.spec.whatwg.org/multipage/parsing.html#stack-of-template-insertion-modes)@ -- -- Which part(s) of the state machine should be returned to after -- exiting the 'InTemplate' insertion mode. , openElements :: [(NodeIndex, ElementParams)] -- ^ __HTML:__ -- @[stack of open elements] -- (https://html.spec.whatwg.org/multipage/parsing.html#stack-of-open-elements)@ -- -- The ancestors of/path to the currently "active" node, closest parent -- first. , activeFormattingElements :: [[(NodeIndex, TagParams)]] -- ^ __HTML:__ -- @[list of active formatting elements] -- (https://html.spec.whatwg.org/multipage/parsing.html#list-of-active-formatting-elements)@ -- -- The elements which should be reconstructed at the beginning of a tag -- closed unexpectedly early (overlapping tags). These are divided -- into groups at particularly independent sections of the tree. , elementIndex :: NodeIndex -- ^ The unique ID for the /next/ node which will be generated. , isInIFrameSrcDoc :: Bool -- ^ __HTML:__ -- @[an iframe srcdoc document] -- (https://html.spec.whatwg.org/multipage/iframe-embed-object.html#an-iframe-srcdoc-document)@ -- -- Test whether the current document is being loaded via an @\@. , fragmentContext :: Maybe (ElementParams, [(NodeIndex, ElementParams)]) -- ^ __HTML:__ -- @[context] -- (https://html.spec.whatwg.org/multipage/parsing.html#concept-frag-parse-context)@ -- -- The element passed to the [HTML fragment parsing algorithm] -- (https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm) -- to provide its immediate (conceptual) parent. The first element of -- the tuple is a description of the context node itself, while the -- second is a list of its ancestors, most recent parent first. , quirksMode :: QuirksMode -- ^ __DOM:__ -- @[Document mode] -- (https://dom.spec.whatwg.org/#concept-document-mode)@ -- -- The degree to which the document should emulate weirdness in -- historic browsers' rendering. , fosteringEnabled :: Bool -- ^ __HTML:__ -- @[foster parenting] -- (https://html.spec.whatwg.org/multipage/parsing.html#foster-parent)@ -- -- Whether the [appropriate place for inserting a node] -- (https://html.spec.whatwg.org/multipage/parsing.html#appropriate-place-for-inserting-a-node) -- should direct new nodes to be created outside the current @\@. , scriptingEnabled :: Bool -- ^ __HTML:__ -- @[scripting flag] -- (https://html.spec.whatwg.org/multipage/parsing.html#scripting-flag)@ -- -- Whether the document is required to be parsed with JavaScript -- enabled or disabled. Currently a binary choice, but the type may -- change in the future if non-deterministic parsing is implemented. , framesetOk :: Bool -- ^ __HTML:__ -- @[frameset-ok flag] -- (https://html.spec.whatwg.org/multipage/parsing.html#frameset-ok-flag)@ -- -- Whether a particular class of content, which disallows the -- introduction of HTML page inclusion via @\@, has already -- been added to the tree. , headElementPointer :: Maybe NodeIndex -- ^ __HTML:__ -- @[head element pointer] -- (https://html.spec.whatwg.org/multipage/parsing.html#head-element-pointer)@ -- -- The unique ID of the document's @\@ element, if one has been -- added to the tree. , formElementPointer :: Maybe NodeIndex -- ^ __HTML:__ -- @[form element pointer] -- (https://html.spec.whatwg.org/multipage/parsing.html#form-element-pointer)@ -- -- The unique ID of the most recent @\@ element, if one exists. } deriving ( Eq, Show, Read ) -- | The collection of data which results in behaviour according to the -- "initially" instructions in the HTML tree construction algorithm. defaultTreeState :: TreeState defaultTreeState = TreeState { tokenizerState = defaultTokenizerState , treeParserState = TreeParserState { insertionMode = Initial , originalInsertionMode = Nothing , templateInsertionModes = [] , openElements = [] , activeFormattingElements = [] , elementIndex = 0 , isInIFrameSrcDoc = False , fragmentContext = Nothing , quirksMode = NoQuirks , fosteringEnabled = False , scriptingEnabled = False , framesetOk = True , headElementPointer = Nothing , formElementPointer = Nothing } } -- | The various fixed points in the tree construction algorithm, where the -- parser may break and re-enter seamlessly. data InsertionMode = Initial -- ^ __HTML:__ -- @[initial] -- (https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode)@ -- -- Before the doctype declaration, if one exists. | BeforeHtml -- ^ __HTML:__ -- @[before html] -- (https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode)@ -- -- Before the first markup tag in the document. | BeforeHead -- ^ __HTML:__ -- @[before head] -- (https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode)@ -- -- Between the root @\@ tag and the opening @\@. | InHead -- ^ __HTML:__ -- @[in head] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead)@ -- -- Within the @\@ section, describing document metadata. | InHeadNoscript -- ^ __HTML:__ -- @[in head noscript] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript)@ -- -- Within a @\@ section of the @\@, describing -- alternate document metadata to be used if script support has been -- disabled. | AfterHead -- ^ __HTML:__ -- @[after head] -- (https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode)@ -- -- Between the closing @\@ and opening @\@ markup tags. | InBody -- ^ __HTML:__ -- @[in body] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody)@ -- -- Within the @\@ section, describing the primary, renderable -- document content. | InText -- ^ __HTML:__ -- @[text] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata)@ -- -- Raw text which should be inserted into the document tree without -- translation to markup tags. Corresponds to the -- "Web.Mangrove.Parse.Tokenize" state 'RCDataState', and as such it is -- critical that the 'currentState' and 'prevStartTag' items be set -- appropriately to allow the parser to continue to the next -- 'InsertionMode'. | InTable -- ^ __HTML:__ -- @[in table] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable)@ -- -- Within a @\@ markup section, describing content laid out in -- a rectangular grid. | InTableText -- ^ __HTML:__ -- @[in table text] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext)@ -- -- Special processing for content misnested within a @\@ but -- outside any @\@ or @\@. | InCaption -- ^ __HTML:__ -- @[in caption] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption)@ -- -- Within a @\@ markup section, describing the data presented -- by the enclosing @\@. | InColumnGroup -- ^ __HTML:__ -- @[in column group] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup)@ -- -- Within a @\@ markup section, describing how the vertical -- columns in the enclosing @\@ should be rendered. | InTableBody -- ^ __HTML:__ -- @[in table body] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody)@ -- -- Within a @\@ markup section, describing the actual data -- presented by the enclosing @\@. | InRow -- ^ __HTML:__ -- @[in row] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr)@ -- -- Within a @\@ markup section, describing a line of -- mutually-associated data presented by the enclosing @\@. | InCell -- ^ __HTML:__ -- @[in cell] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd)@ -- -- Within a @\@ markup section, describing a single point of data -- presented by the enclosing @\@. | InSelect -- ^ __HTML:__ -- @[in select] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect)@ -- -- Within a @\@ markup section, presenting several predefined -- options for the user's input. | InSelectInTable -- ^ __HTML:__ -- @[in select in table] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable)@ -- -- As 'InSelect', while providing extra cleanup logic for misnested -- table structure elements. | InTemplate -- ^ __HTML:__ -- @[in template] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate)@ -- -- Within a @\@ section, providing a -- 'Web.Mangrove.DOM.DocumentFragment' for simpler script-driven -- dynamic generation. | AfterBody -- ^ __HTML:__ -- @[after body] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody)@ -- -- Between the closing @\@ and @\@ markup tags. | InFrameset -- ^ __HTML:__ -- @[in frameset] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset)@ -- -- Within the @\@ section, listing several external -- documents to display in lieu of local content. | AfterFrameset -- ^ __HTML:__ -- @[after frameset] -- (https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset)@ -- -- Between the closing @\@ and @\@ markup tags. | AfterAfterBody -- ^ __HTML:__ -- @[after after body] -- (https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode)@ -- -- After the final @\@ markup tag, when the primary content was -- described by a @\@ section. | AfterAfterFrameset -- ^ __HTML:__ -- @[after after frameset] -- (https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode)@ -- -- After the final @\@ markup tag, when the primary content was -- linked via a @\@ section. deriving ( Eq, Ord, Bounded, Enum, Show, Read ) -- | "Switch the insertion mode as specified." switchMode :: InsertionMode -> TreeBuilder () switchMode mode = N.S.modify $ \state -> state { insertionMode = mode } -- | "Switch the insertion mode to the original insertion mode." resetMode :: TreeBuilder () resetMode = N.S.modify $ \state -> case originalInsertionMode state of Just mode -> state { insertionMode = mode , originalInsertionMode = Nothing } Nothing -> state -- | Indicate that the content which is or will be added to the document tree, -- disallows the later use of @\@ transclusion. setFramesetNotOk :: TreeBuilder () setFramesetNotOk = N.S.modify $ \state -> state { framesetOk = False } -- | "Insert a marker at the end of the list of active formatting elements." insertFormattingMarker :: TreeBuilder () insertFormattingMarker = N.S.modify $ \state -> state { activeFormattingElements = [] : activeFormattingElements state } -- | "Clear the list of active formatting elements up to the last marker." clearFormattingElements :: TreeBuilder () clearFormattingElements = N.S.modify $ \state -> state { activeFormattingElements = drop 1 $ activeFormattingElements state } -- | "Push the specified insertion mode onto the stack of template insertion -- modes." pushTemplateMode :: InsertionMode -> TreeBuilder () pushTemplateMode mode = N.S.modify $ \state -> state { templateInsertionModes = mode : templateInsertionModes state } -- | "Pop the current template insertion mode off the stack of template -- insertion modes." popTemplateMode :: TreeBuilder () popTemplateMode = N.S.modify $ \state -> state { templateInsertionModes = drop 1 $ templateInsertionModes state } -- | __HTML:__ -- @[an iframe srcdoc document] -- (https://html.spec.whatwg.org/multipage/iframe-embed-object.html#an-iframe-srcdoc-document)@ -- -- Test whether the current document is being loaded via an @\@. inIFrameSrcDoc :: TreeBuilder Bool inIFrameSrcDoc = isInIFrameSrcDoc <$> N.S.get -- | __HTML:__ -- @[current node] -- (https://html.spec.whatwg.org/multipage/parsing.html#current-node)@ -- -- The most-recently opened markup tag which has not yet been closed. currentNode :: TreeBuilder (Maybe ElementParams) currentNode = fmap snd . Y.listToMaybe . openElements <$> N.S.get -- | The unique ID of the element described by 'currentNode'. currentNodeIndex :: TreeBuilder (Maybe NodeIndex) currentNodeIndex = fmap fst . Y.listToMaybe . openElements <$> N.S.get -- | __HTML:__ -- @[adjusted current node] -- (https://html.spec.whatwg.org/multipage/parsing.html#adjusted-current-node)@ -- -- The most-recently opened markup tag which has not yet been closed, or the -- context element if the document is being read as a document fragment and -- everything else has been closed. adjustedCurrentNode :: TreeBuilder (Maybe ElementParams) adjustedCurrentNode = do state <- N.S.get if length (openElements state) <= 1 && Y.isJust (fragmentContext state) then return . fmap fst $ fragmentContext state else currentNode -- | "Whether the parser was created as part of the HTML fragment parsing -- algorithm." inFragment :: TreeBuilder Bool inFragment = do state <- N.S.get return . Y.isJust $ fragmentContext state -- | Map the given function over all attributes in the token data. adjustAttributes :: (AttributeName -> AttributeName) -> TagParams -> TagParams adjustAttributes adjust d = d { tagAttributes = M.fromList . map (F.B.first adjust) . M.toList $ tagAttributes d } -- | __HTML:__ -- @[adjust MathML attributes] -- (https://html.spec.whatwg.org/multipage/parsing.html#adjust-mathml-attributes)@ -- -- Some attributes on [MathML](https://www.w3.org/TR/MathML/) elements are -- defined in mixed case; restore that distinction to the case-folded token -- data. adjustMathMLAttributes :: TagParams -> TagParams adjustMathMLAttributes = adjustAttributes adjust where adjust :: T.Text -> T.Text adjust "definitionurl" = "definitionURL" adjust name = name -- | __HTML:__ -- @[adjust SVG attributes] -- (https://html.spec.whatwg.org/multipage/parsing.html#adjust-svg-attributes)@ -- -- Some attributes on [SVG](https://www.w3.org/TR/SVG/) elements are defined in -- mixed case; restore that distinction to the case-folded token data. adjustSvgAttributes :: TagParams -> TagParams adjustSvgAttributes = adjustAttributes adjust where adjust :: T.Text -> T.Text adjust "attributename" = "attributeName" adjust "attributetype" = "attributeType" adjust "basefrequency" = "baseFrequency" adjust "baseprofile" = "baseProfile" adjust "calcmode" = "calcMode" adjust "clippathunits" = "clipPathUnits" adjust "diffuseconstant" = "diffuseConstant" adjust "edgemode" = "edgeMode" adjust "filterunits" = "filterUnits" adjust "glyphref" = "glyphRef" adjust "gradienttransform" = "gradientTransform" adjust "gradientunits" = "gradientUnits" adjust "kernelmatrix" = "kernelMatrix" adjust "kernelunitlength" = "kernelUnitLength" adjust "keypoints" = "keyPoints" adjust "keysplines" = "keySplines" adjust "keytimes" = "keyTimes" adjust "lengthadjust" = "lengthAdjust" adjust "limitingconeangle" = "limitingConeAngle" adjust "markerheight" = "markerHeight" adjust "markerunits" = "markerUnits" adjust "markerwidth" = "markerWidth" adjust "maskcontentunits" = "maskContentUnits" adjust "maskunits" = "maskUnits" adjust "numoctaves" = "numOctaves" adjust "pathlength" = "pathLength" adjust "patterncontentunits" = "patternContentUnits" adjust "patterntransform" = "patternTransform" adjust "patternunits" = "patternUnits" adjust "pointsatx" = "pointsAtX" adjust "pointsaty" = "pointsAtY" adjust "pointsatz" = "pointsAtZ" adjust "preservealpha" = "preserveAlpha" adjust "preserveaspectratio" = "preserveAspectRatio" adjust "primitiveunits" = "primitiveUnits" adjust "refx" = "refX" adjust "refy" = "refY" adjust "repeatcount" = "repeatCount" adjust "repeatdur" = "repeatDur" adjust "requiredextensions" = "requiredExtensions" adjust "requiredfeatures" = "requiredFeatures" adjust "specularconstant" = "specularConstant" adjust "specularexponent" = "specularExponent" adjust "spreadmethod" = "spreadMethod" adjust "startoffset" = "startOffset" adjust "stddeviation" = "stdDeviation" adjust "stitchtiles" = "stitchTiles" adjust "surfacescale" = "surfaceScale" adjust "systemlanguage" = "systemLanguage" adjust "tablevalues" = "tableValues" adjust "targetx" = "targetX" adjust "targety" = "targetY" adjust "textlength" = "textLength" adjust "viewbox" = "viewBox" adjust "viewtarget" = "viewTarget" adjust "xchannelselector" = "xChannelSelector" adjust "ychannelselector" = "yChannelSelector" adjust "zoomandpan" = "zoomAndPan" adjust name = name -- | __HTML:__ -- @[adjust foreign attributes] -- (https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes)@ -- -- The HTML specification expects most names to not carry XML-style prefixes -- (e.g. @xlink:role@), and handles most which do anyway without issue. -- However, that assumption proves disruptive for a few attributes, and those -- exceptions should therefore be fixed. -- -- Note that this is a very simple algorithm in simply assuming a standard -- prefix<->namespace assignment, and doesn't perform any scope test. As -- perfect representation of the structured data isn't actually the goal, that -- winds up being enough /in this case/. Do not expect actual XML to play -- nicely. adjustForeignAttributes :: ElementParams -> ElementParams adjustForeignAttributes tag = case M.intersection attrs foreignNames of foreignAttrs | M.null foreignAttrs -> tag foreignAttrs -> let foreignAttrs' = fromAttrList . map adjustForeignAttribute $ toAttrList foreignAttrs in tag { elementAttributes = M.union foreignAttrs' $ M.difference attrs foreignNames } where attrs = elementAttributes tag foreignNames = M.fromList [ ((Nothing, n), ()) | n <- [ "xlink:actuate" , "xlink:arcrole" , "xlink:href" , "xlink:role" , "xlink:show" , "xlink:title" , "xlink:type" , "xml:lang" , "xml:space" , "xmlns" , "xmlns:xlink" ] ] -- | Compare the name of a single attribute to the known exceptions described -- in 'adjustForeignAttributes', and update the values if it matches. adjustForeignAttribute :: AttributeParams -> AttributeParams adjustForeignAttribute attr = case T.splitOn ":" $ attrName attr of [p@"xlink", n@"actuate"] -> updateAttr p n xlinkNamespace [p@"xlink", n@"arcrole"] -> updateAttr p n xlinkNamespace [p@"xlink", n@"href"] -> updateAttr p n xlinkNamespace [p@"xlink", n@"role"] -> updateAttr p n xlinkNamespace [p@"xlink", n@"show"] -> updateAttr p n xlinkNamespace [p@"xlink", n@"title"] -> updateAttr p n xlinkNamespace [p@"xlink", n@"type"] -> updateAttr p n xlinkNamespace [p@"xml", n@"lang"] -> updateAttr p n xmlNamespace [p@"xml", n@"space"] -> updateAttr p n xmlNamespace [n@"xmlns"] -> attr { attrPrefix = Nothing , attrName = n , attrNamespace = Just xmlnsNamespace } [p@"xmlns", n@"xlink"] -> updateAttr p n xmlnsNamespace _ -> attr where updateAttr p n ns = attr { attrPrefix = Just p , attrName = n , attrNamespace = Just ns } -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token indicates the end of the stream. isEOF :: TreeInput -> Bool isEOF t' = case tokenOut t' of -- 'EndOfStream' only works as an explicit EOF because of the manipulations in -- the parser runner ('Web.Mangrove.Parse.Tree.repackStream'). EndOfStream -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is a whitespace 'Character' in the ASCII range. isWhitespace :: TreeInput -> Bool isWhitespace t' = case tokenOut t' of Character c | isAsciiWhitespace c -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is any 'Character'. isCharacter :: TreeInput -> Bool isCharacter t' = case tokenOut t' of Character _ -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is specifically a @U+0000@ null 'Character'. isNull :: TreeInput -> Bool isNull t' = case tokenOut t' of Character '\NUL' -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is any 'Comment'. isComment :: TreeInput -> Bool isComment t' = case tokenOut t' of Comment _ -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is any 'Doctype'. isDoctype :: TreeInput -> Bool isDoctype t' = case tokenOut t' of Doctype _ -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is any 'StartTag'. If the behaviour further requires the -- 'StartTag' to have a specific name, use 'isStartTag' instead. isAnyStartTag :: TreeInput -> Bool isAnyStartTag t' = case tokenOut t' of StartTag _ -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is a 'StartTag' with any of the given names; the list should -- be entirely in lower-case. If the exact name doesn't actually matter, use -- 'isAnyStartTag' instead. isStartTag :: [String] -> TreeInput -> Bool isStartTag names t' = case tokenOut t' of StartTag d | elem (T.unpack $ tagName d) names -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is any 'EndTag'. If the behaviour further requires the -- 'EndTag' to have a specific name, use 'isEndTag' instead. isAnyEndTag :: TreeInput -> Bool isAnyEndTag t' = case tokenOut t' of EndTag _ -> True _ -> False -- | 'Web.Mangrove.Parse.Common.Parser.switch' case guard testing whether the -- wrapped token is a 'EndTag' with any of the given names; the list should be -- entirely in lower-case. If the exact name doesn't actually matter, use -- 'isAnyEndTag' instead. isEndTag :: [String] -> TreeInput -> Bool isEndTag names t' = case tokenOut t' of EndTag d | elem (T.unpack $ tagName d) names -> True _ -> False -- | Check if the described node has the given name, and is in the HTML -- namespace. nodeIsElement :: T.Text -> ElementParams -> Bool nodeIsElement name node = elementNamespace node == Just htmlNamespace && elementName node == name -- | Check if the described node is one of the many which have specific behaviour -- governing their addition to the document tree. See 'specialElements' for -- the nodes themselves. nodeIsSpecial :: ElementParams -> Bool nodeIsSpecial node = case elementNamespace node of Just ns -> elem (ns, elementName node) specialElements Nothing -> False -- | Check whether any of the given tags, in the HTML namespace, are in the -- stack of open elements. hasOpenElement :: [ElementName] -> TreeBuilder Bool hasOpenElement names = any isElement . filter isHtml . map snd . openElements <$> N.S.get where isElement tag = elem (elementName tag) names isHtml tag = elementNamespace tag == Just htmlNamespace -- | Check if there are any tags on the stack of open elements, which are not -- one of the given list, or are in a non-HTML namespace. hasOpenElementExcept :: [ElementName] -> TreeBuilder Bool hasOpenElementExcept names = any isNotElement . map snd . openElements <$> N.S.get where isNotElement tag = isNotHtml tag || notElem (elementName tag) names isNotHtml tag = elementNamespace tag /= Just htmlNamespace -- | __HTML:__ -- the elements listed for @[has a particular element in scope] -- (https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope)@ -- -- Several elements provide a breakpoint which limits how far up the tree some -- searches for open tags will propagate; this list defines those. See -- 'hasInScope' for a test using this list. scopeElements :: [(Namespace, ElementName)] scopeElements = [ (htmlNamespace, n) | n <- [ "applet" , "caption" , "html" , "table" , "td" , "th" , "marquee" , "object" , "template" ] ] ++ [ (mathMLNamespace, n) | n <- [ "mi" , "mo" , "mn" , "ms" , "mtext" , "annotation-xml" ] ] ++ [ (svgNamespace, n) | n <- [ "foreignObject" , "desc" , "title" ] ] -- | __HTML:__ -- @[special category] -- (https://html.spec.whatwg.org/multipage/parsing.html#special)@ -- -- The specification describes specific behaviour for many tags, and refers -- back to those same tags at a few points in the parse algorithm. This then -- collects everything in that category; see 'nodeIsSpecial' to test against -- it. specialElements :: [(Namespace, ElementName)] specialElements = [ (htmlNamespace, n) | n <- [ "address" , "area" , "article" , "aside" , "base" , "basefont" , "bgsound" , "blockquote" , "body" , "br" , "button" , "center" , "col" , "colgroup" , "dd" , "details" , "dir" , "div" , "dl" , "dt" , "embed" , "fieldset" , "figcaption" , "figure" , "footer" , "form" , "frame" , "frameset" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "head" , "header" , "hgroup" , "hr" , "iframe" , "img" , "input" , "keygen" , "li" , "link" , "listing" , "main" , "menu" , "meta" , "nav" , "noembed" , "noframes" , "noscript" , "ol" , "p" , "param" , "plaintext" , "pre" , "script" , "section" , "select" , "style" , "source" , "summary" , "tbody" , "textarea" , "tfoot" , "thead" , "title" , "tr" , "track" , "ul" , "wbr" , "xmp" ] ] ++ scopeElements -- | __HTML:__ -- @[has a particular element in a specific scope] -- (https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope)@ -- -- Check if a tag with any of the given names, in the HTML namespace, is in the -- stack of open elements more recently than the closest boundary element. See -- 'scopeElements' for the typical list, but generally 'hasInScope', -- 'hasInButtonScope', 'hasInListItemScope', 'hasInSelectScope', or -- 'hasInTableScope' should be used instead. inScope :: [(Namespace, ElementName)] -> [T.Text] -> TreeBuilder Bool inScope bound names = recurse . openElements <$> N.S.get where recurse [] = False recurse ((_, e):es) | elem (elementName e) names = True | elem (elementNamespace e, elementName e) (map (F.B.first Just) bound) = False | otherwise = recurse es -- | Given a node's unique ID, check if it is in the stack of open elements, -- more recently than the closest boundary node from 'scopeElements'. See -- 'hasInScope' if any node with a given name will suffice. hasIndexInScope :: NodeIndex -> TreeBuilder Bool hasIndexInScope index = any ((== index) . fst) . takeWhile (not . isScopeElement) . openElements <$> N.S.get where isScopeElement (_, e) = case elementNamespace e of Just ns -> elem (ns, elementName e) scopeElements Nothing -> False -- | __HTML:__ -- @[has a particular element in scope] -- (https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope)@ -- -- Check if a tag with any of the given names, in the HTML namespace, is in the -- stack of open elements more recently than the closest boundary node from -- 'scopeElements'. See 'hasIndexInScope' if a specific existing element is -- required. hasInScope :: [T.Text] -> TreeBuilder Bool hasInScope = inScope scopeElements -- | __HTML:__ -- @[has a particular element in list item scope] -- (https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-list-item-scope)@ -- -- Check if a tag with any of the given names, in the HTML namespace, is in the -- stack of open elements more recently than the closest boundary node of -- either @\@, @\@, or any from 'scopeElements'. hasInListItemScope :: [T.Text] -> TreeBuilder Bool hasInListItemScope = inScope $ [ (htmlNamespace, n) | n <- [ "ol" , "ul" ] ] ++ scopeElements -- | __HTML:__ -- @[has a particular element in button scope] -- (https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-button-scope)@ -- -- Check if a tag with any of the given names, in the HTML namespace, is in the -- stack of open elements more recently than the closest boundary node of -- either @\@ or any from 'scopeElements'. hasInButtonScope :: [T.Text] -> TreeBuilder Bool hasInButtonScope = inScope $ (htmlNamespace, "button") : scopeElements -- | __HTML:__ -- @[has a particular element in table scope] -- (https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-table-scope)@ -- -- Check if a tag with any of the given names, in the HTML namespace, is in the -- stack of open elements more recently than the closest boundary node of -- either @\@ or a top-level element. hasInTableScope :: [T.Text] -> TreeBuilder Bool hasInTableScope = inScope [ (htmlNamespace, n) | n <- [ "html" , "table" , "template" ] ] -- | __HTML:__ -- @[has a particular element in select scope] -- (https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-select-scope)@ -- -- Check if a tag with any of the given names, in the HTML namespace, is in the -- stack of open elements more recently than /any/ node except an -- @\@ or @\@ element. hasInSelectScope :: [T.Text] -> TreeBuilder Bool hasInSelectScope names = recurse . openElements <$> N.S.get where recurse [] = False recurse ((_, e):es) | elem (elementName e) names = True | elem (elementName e) ["optgroup", "option"] && elementNamespace e == Just htmlNamespace = recurse es | otherwise = False -- | __HTML:__ -- @[reset the insertion mode appropriately] -- (https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately)@ -- -- Guess what part of the state machine should be active, according to the -- stack of open elements. See 'resetInsertionMode'' for use outside a parser -- monad. resetInsertionMode :: TreeBuilder () resetInsertionMode = N.S.modify resetInsertionMode' -- | __HTML:__ -- @[reset the insertion mode appropriately] -- (https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately)@ -- -- Guess what part of the state machine should be active, according to the -- stack of open elements. See 'resetInsertionMode' if the calculation is -- being made as part of the tree construction parsing algorithm. resetInsertionMode' :: TreeParserState -> TreeParserState resetInsertionMode' state = resetInsertionMode'' $ openElements state where resetInsertionMode'' [] = switchMode' InBody resetInsertionMode'' ((_, e):es) | isLast && nodeIsElement "select" e' = switchMode' InSelect | nodeIsElement "select" e' = loopSelect es | not isLast && nodeIsElement "td" e' = switchMode' InCell | not isLast && nodeIsElement "th" e' = switchMode' InCell | nodeIsElement "tr" e' = switchMode' InRow | nodeIsElement "tbody" e' = switchMode' InTableBody | nodeIsElement "thead" e' = switchMode' InTableBody | nodeIsElement "tfoot" e' = switchMode' InTableBody | nodeIsElement "caption" e' = switchMode' InCaption | nodeIsElement "colgroup" e' = switchMode' InColumnGroup | nodeIsElement "table" e' = switchMode' InTable | nodeIsElement "template" e' = maybe state switchMode' . Y.listToMaybe $ templateInsertionModes state | not isLast && nodeIsElement "head" e' = switchMode' InHead | nodeIsElement "body" e' = switchMode' InBody | nodeIsElement "frameset" e' = switchMode' InFrameset | nodeIsElement "html" e' = switchMode' $ if Y.isNothing $ headElementPointer state then BeforeHead else AfterHead | isLast = switchMode' InBody | otherwise = resetInsertionMode'' es where isLast = null es e' = case fragmentContext state of Nothing -> e Just context -> fst context loopSelect [] = switchMode' InSelect loopSelect [_] = switchMode' InSelect loopSelect ((_, n):ns) | nodeIsElement "template" n = switchMode' InSelect | nodeIsElement "table" n = switchMode' InSelectInTable | otherwise = loopSelect ns switchMode' mode = state { insertionMode = mode } -- | __HTML:__ -- @[MathML text integration point] -- (https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point)@ -- -- Whether the markup tag introduces a section of less-structured text content -- embedded within a MathML object. atMathMLIntegration :: ElementParams -> Bool atMathMLIntegration current = elementNamespace current == Just mathMLNamespace && elem (elementName current) ["mi", "mo", "mn", "ms", "mtext"] -- | Whether the markup tag is specifically an @\@ tag within -- the MathML namespace. isMathMLAnnotationXml :: ElementParams -> Bool isMathMLAnnotationXml current | elementNamespace current == Just mathMLNamespace && elementName current == "annotation-xml" = True | otherwise = False -- | __HTML:__ -- @[HTML integration point] -- (https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point)@ -- -- Whether the markup tag introduces a section of HTML content embedded within -- a MathML or SVG object. atHtmlIntegration :: ElementParams -> Bool atHtmlIntegration current | isMathMLAnnotationXml current && isIntegrationAttribute (M.lookup (Nothing, "encoding") $ elementAttributes current) = True | elementNamespace current == Just svgNamespace && elem (elementName current) ["foreignObject", "desc", "title"] = True | otherwise = False where isIntegrationAttribute (Just (_, value)) = case T.map toAsciiLower value of "text/html" -> True "application/xhtml+xml" -> True _ -> False isIntegrationAttribute _ = False