{-# LANGUAGE OverloadedStrings #-} {-| Description: A stateless reformulation of the components of the tree construction algorithm. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: provisional Portability: portable This module provides the data structures used in the second half of this implementation's split __[HTML](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction)__ tree construction algorithm, and the common functions generating them. While it would certainly be possible to implement the specification more directly, the instructions it provides are not only (very strongly) build around mutable data structures, they're also tailored to a pass-by-reference core library. Haskell idiomatically being neither, it's actually easier (and likely more performant, though that's not been tested) to implement a fourth stage from scratch. In effect, the main parser body handles all stateful computation ---tracking the stack of open elements, managing transitions through the finite state machine, etc.--- but doesn't know the shape of the document tree beyond its very narrow window. Instead, it emits a self-contained sequence of instructions which can be very simply consumed with neither (much) further modification nor external state; this then is the source from which the final document tree is built. Some duplication /is/ admittedly involved in doing so, as the emitted instruction is typically accompanied by a corresponding change to the parser state, but the relative logic simplicity makes doing so worth it. -} module Web.Mangrove.Parse.Tree.Patch ( -- * Types Patch ( .. ) , TreeOutput ( .. ) , treeRemainder , TokenizerOutputState , InsertAt ( .. ) , TargetNode , ReparentDepth -- * Utility functions -- ** Tree types -- $utility-state , packTree , packTree_ , packTreeErrors , packTreeErrors_ , consTreeError , consTreeError_ , (++|) , (|++) , (|++|) -- ** Token types , mapTokenErrs , mapTokenOut -- * Instructions , setDocumentQuirks , restartParsing , stopParsing -- ** Opening nodes , insertCharacter , insertComment , insertComment' , insertDoctype , addAttribute -- *** Elements , createElement , insertElement , insertElement_ , insertNullElement , insertNullElement_ , insertHeadElement , insertHeadElement_ -- **** Formatting , insertForeignElement , insertForeignNullElement , insertFormattingElement , reconstructFormattingElements -- ** Closing nodes -- *** Single , closeCurrentNode , closeCurrentNode_ , dropCurrentNode , softCloseCurrentNode_ , closeAncestorNode_ , closeAncestorNodes_ -- *** Multiple -- **** Exclusive clear , clearToContext , tableContext , tableBodyContext , tableRowContext -- **** Inclusive clear , clearCount , closeElement , closeElements , closePElement , generateEndTags , impliedEndTags , thoroughlyImpliedEndTags ) where import qualified Control.Applicative as A import qualified Control.Monad as N import qualified Control.Monad.Trans.State as N.S import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as BS.L import qualified Data.IntMap.Strict as M.I import qualified Data.List as L import qualified Data.Maybe as Y import qualified Data.Text as T import Data.Function ( (&) ) import Data.Functor ( ($>) ) import Web.Willow.DOM import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Tokenize as Tokenize import Web.Mangrove.Parse.Tokenize.Common import Web.Mangrove.Parse.Tree.Common import Web.Willow.Common.Encoding -- | The atomic, self-contained instruction set describing how to build a final -- document tree. data Patch = ErrorList [ParseError] -- ^ "Ignore the token", or add errors to, e.g., 'SetDocumentQuirks'. | SetDocumentQuirks QuirksMode -- ^ Specify which degree of backwards compatibility should be used in -- rendering the document. | CloseNodes (M.I.IntMap ReparentDepth) -- ^ "Pop the current node off the stack of open elements." -- -- The first 'TargetNode' of every tuple is treated identically to -- 'RelativeLocation', while the second 'ReparentDepth' indicates how -- many ancestors should be closed (i.e., @'CloseNodes' [(1, 1)]@ -- closes the parent node). | SoftCloseCurrentNode -- ^ Float any following non-'Element' nodes (until the next -- 'CloseNodes' or 'DropCurrentNode') to the parent rather than -- inserting them under the current. | DropCurrentNode -- ^ "Remove the current node from its parent node, if it has one. Pop -- it off the stack of open elements." | InsertCharacter [ParseError] Char -- ^ "Insert the character." | InsertComment [ParseError] InsertAt T.Text -- ^ "Insert a comment." | InsertElement [ParseError] ElementParams -- ^ "Insert a foreign element for the token, in the given namespace." | InsertAndSetDocumentType [ParseError] DocumentTypeParams -- ^ "Append a 'DocumentType' node to the @Document@ node. Associate -- the former with the latter so that it is returned as the value of -- the @doctype@ attribute." -- -- While the specification requires the 'DocumentType' be explicitly -- associated as the @doctype@ attribute of the document, the parsing -- rules only allow for a single such node to be produced, so the extra -- processing isn't actually required. | AddAttribute InsertAt AttributeParams -- ^ "Add the attribute and its corresponding value to the top element -- of the stack of open elements." | RestartParsing -- ^ Discard the previous tree and begin a new, empty one. deriving ( Eq, Show, Read ) -- | Some 'Patch'es represent instruction categories which may not always be -- intended for the current location in the tree, and the resulting 'Tree's may -- likewise migrate from where they're generated. These are the "addresses" -- which direct those datatypes through the hierarchy. data InsertAt = RelativeLocation ReparentDepth -- ^ Insert a specific number of nodes /up/ the tree from where the -- 'Patch' or 'Tree' was generated. | InDocument -- ^ Insert as a direct child of the 'Web.Willow.IDL.Document.Document' -- itself. | InHtmlElement -- ^ Insert as a direct child of the top-level @html@ element in the -- document. deriving ( Eq, Show, Read ) -- | Type-level clarification for a count of how many levels a 'Patch' should -- rise in the hierarchy without affecting the intervening nodes; a value of -- @0@ indicates the current node. type TargetNode = Word -- | Type-level clarification for how many levels up the hierarchy the -- associated object should be moved. Note that a value of @0@ won't result in -- any noticable change. type ReparentDepth = Word -- | The standard output of parsers used in the first tree construction stage. -- Specifically, it contains the final state of the tokenization stage in -- addition to the generated instructions, to enable the recursion loop to -- detect the end of multi-token outputs and properly update the resume state. data TreeOutput = TreeOutput { treePatches :: [Patch] -- ^ The instructions generated by the parser. , treeState :: TokenizerOutputState -- ^ The data required to resume tokenization immediately following the -- value, if possible. } deriving ( Eq, Show, Read ) -- | The unparsed portion of the binary stream, after building the -- associated instruction set. treeRemainder :: TreeOutput -> Maybe BS.ByteString treeRemainder = fmap snd . treeState -- $utility-state -- Many functions producing 'TreeOutput' objects have two forms: one taking a -- 'TreeInput' (typically passed via the enclosing -- 'Web.Willow.Common.Parser.Switch.switch') alongside some data to pack, and -- one with only the latter. That 'TreeInput' is the means by which errors -- from earlier stages are propagated forward, and by which the support -- framework identifies breakpoints in the tokenizer. Therefore, most case -- logic must be written to: -- -- * Emit the value returned by a function referencing the 'TreeInput', -- unless that function is a 'Web.Willow.Common.Parser.push' to allow it to -- be reconsumed. -- * Only use the 'TreeInput' with a single function to avoid duplicating -- errors. -- -- In many cases, it may be possible to pass the 'TreeInput' to one of -- several functions. So long as the above rules are followed, it often -- doesn't matter which function performs the wrapping; attaching it in a way -- that makes logical or proximal sense could provide clearer error display -- positioning. -- | Emit a patchset from a tree construction parser, referencing the state -- when the original token was produced. -- -- This produces a stateful output, and should only be used where the token -- would otherwise be discarded; use 'packTree_' elsewhere. packTree :: TreeInput -> [Patch] -> TreeBuilder TreeOutput packTree t' ps = return $ TreeOutput { treePatches = ps , treeState = tokenState t' } -- | Emit a patchset from a tree construction parser without any reference to -- the original token. -- -- This produces a stateless output, and should only be used after the token -- has been 'Web.Willow.Common.Parser.push'ed for reconsumption; use 'packTree' -- elsewhere. packTree_ :: [Patch] -> TreeBuilder TreeOutput packTree_ ps = return $ TreeOutput { treePatches = ps , treeState = Nothing } -- | Modify the collection of errors associated with a wrapped token. mapTokenErrs :: ([ParseError] -> [ParseError]) -> TreeInput -> TreeInput mapTokenErrs f t' = t' { tokenErrs = f $ tokenErrs t' } -- | Modify a wrapped token without affecting the associated data. mapTokenOut :: (Token -> Token) -> TreeInput -> TreeInput mapTokenOut f t' = t' { tokenOut = f $ tokenOut t' } -- | Emit a collection of errors from a tree construction parser, if and only -- if that collection is non-empty. -- -- This produces a stateful output, and should only be used where the token -- would otherwise be discarded; use 'packTreeErrors_' elsewhere. packTreeErrors :: [ParseError] -> TreeInput -> TreeBuilder TreeOutput packTreeErrors errs t' = do ps <- packTreeErrors_ $ errs ++ tokenErrs t' return $ ps { treeState = tokenState t' } -- | Emit a collection of errors from a tree construction parser, if and only -- if that collection is non-empty. -- -- This produces a stateless output, and should only be used after the token -- has been 'Web.Willow.Common.Parser.push'ed for reconsumption; use -- 'packTreeErrors' elsewhere, or 'consTreeError' and 'consTreeError_' if a -- relevant set of patches already exists. packTreeErrors_ :: [ParseError] -> TreeBuilder TreeOutput packTreeErrors_ [] = packTree_ [] packTreeErrors_ errs = packTree_ [ErrorList errs] -- | Prepend an error to the first patch in the list, or add an 'ErrorList' -- entry if it doesn't support them. -- -- If this is the only patchset which may be generated, use 'packTreeErrors_' -- rather than passing a null @['Patch']@. consTreeError_ :: ParseError -> [Patch] -> [Patch] consTreeError_ err (ErrorList errs:ps) = ErrorList (err : errs) : ps consTreeError_ err (InsertCharacter errs c:ps) = InsertCharacter (err : errs) c : ps consTreeError_ err (InsertComment errs loc d:ps) = InsertComment (err : errs) loc d : ps consTreeError_ err (InsertElement errs d:ps) = InsertElement (err : errs) d : ps consTreeError_ err (InsertAndSetDocumentType errs d:ps) = InsertAndSetDocumentType (err : errs) d : ps consTreeError_ err ps = ErrorList [err] : ps -- | Prepend an error to the wrapped patchset, adding an 'ErrorList' entry if -- the first patch doesn't support them. -- -- If this is the only patchset which may be generated, use 'packTreeErrors' -- rather than passing a null @['Patch']@. consTreeError :: ParseError -> TreeOutput -> TreeOutput consTreeError err out = out { treePatches = consTreeError_ err $ treePatches out } -- | Prepend a plain patchset to the contents of a wrapped one. (++|) :: [Patch] -> TreeOutput -> TreeOutput ps ++| out = out { treePatches = ps ++ treePatches out } infixr 4 ++| -- | Append a plain patchset to the contents of a wrapped one. (|++) :: TreeOutput -> [Patch] -> TreeOutput out |++ ps = out { treePatches = treePatches out ++ ps } infixr 4 |++ -- | Concatenate the payloads of two wrapped patchsets, retaining the -- associated state of the right one. (|++|) :: TreeOutput -> TreeOutput -> TreeOutput ps |++| ps' = ps' { treePatches = treePatches ps ++ treePatches ps' } infixr 4 |++| -- | Set the degree of backwards compatibility the document seems to be written -- for. setDocumentQuirks :: QuirksMode -> TreeBuilder [Patch] setDocumentQuirks quirks = do N.S.modify $ \state -> state { quirksMode = quirks } return [SetDocumentQuirks quirks] -- | Pass the stack of open elements into the given function in order to -- determine how many elements should be closed, then close them. clear :: ([ElementParams] -> Word) -> TreeBuilder [Patch] clear f = do state <- N.S.get clearCount . f . map snd $ openElements state -- | Pop a known number of nodes from the stack of open elements. clearCount :: Word -> TreeBuilder [Patch] clearCount 0 = return [] clearCount l = closeAncestorNodes_ 0 l -- | Close all markup tags until the current node is one of the given elements -- in the HTML namespace, close it. For the inverse (closing all tags which -- /are/ listed), see 'generateEndTags'. -- -- 'tableContext', 'tableBodyContext', and 'tableRowContext' provide the -- typical inputs. clearToContext :: [ElementName] -> TreeBuilder [Patch] clearToContext ns = clear countToContext where countToContext [] = 0 countToContext (d:ds) = if elem (elementName d) ns && elementNamespace d == Just htmlNamespace then 0 else succ $ countToContext ds -- | __HTML:__ -- @[clear the stack back to a table context] -- (https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-context)@ -- -- The list of elements to pass 'clearToContext' when closing all tags until -- the start of the most recent table. tableContext :: [ElementName] tableContext = [ "table" , "template" , "html" ] -- | __HTML:__ -- @[clear the stack back to a table body context] -- (https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-body-context)@ -- -- The list of elements to pass 'clearToContext' when closing all tags until -- the start of the most recent section of the current table. tableBodyContext :: [ElementName] tableBodyContext = [ "tbody" , "tfoot" , "thead" , "template" , "html" ] -- | __HTML:__ -- @[clear the stack back to a table row context] -- (https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-row-context)@ -- -- The list of elements to pass 'clearToContext' when closing all tags until -- the start of the most recent row of the current table. tableRowContext :: [ElementName] tableRowContext = [ "tr" , "template" , "html" ] -- | "Pop elements from the stack of open elements until the given element in -- the HTML namespace has been popped from the stack." -- -- If multiple elements may indicate an endpoint, use 'closeElements' instead. closeElement :: ElementName -> TreeBuilder [Patch] closeElement = closeElements . (: []) -- | "Pop elements from the stack of open elements until one of the given -- elements in the HTML namespace has been popped from the stack." -- -- If only a single element may indicate an endpoint, 'closeElement' could -- provide a cleaner interface. closeElements :: [ElementName] -> TreeBuilder [Patch] closeElements names = clear countToElement where countToElement [] = 0 countToElement (d:ds) = if elem (elementName d) names && elementNamespace d == Just htmlNamespace then 1 else succ $ countToElement ds -- | "Pop the current element from the stack of open elements." -- -- This produces a stateful output, and should only be used where the token -- would otherwise be discarded; use 'closeCurrentNode_' elsewhere. closeCurrentNode :: TreeInput -> TreeBuilder TreeOutput closeCurrentNode t' = do close <- closeCurrentNode_ state <- N.S.get return $ TreeOutput { treePatches = foldr consTreeError_ close $ tokenErrs t' , treeState = tokenState . mapTokenState t' $ resetNamespace state } where resetNamespace state tokState = tokState { currentNodeNamespace = Y.listToMaybe (openElements state) >>= elementNamespace . snd } -- | "Pop the current element from the stack of open elements." -- -- This produces a stateless output, and should only be used where the token is -- handled in another manner; use 'closeCurrentNode' elsewhere. This has -- identical behaviour to @'closeAncestorNode_' 0@. closeCurrentNode_ :: TreeBuilder [Patch] closeCurrentNode_ = closeAncestorNode_ 0 -- | As 'closeCurrentNode_', but the closed node and its descendants are not -- retained in the document tree. dropCurrentNode :: TreeBuilder [Patch] dropCurrentNode = closeCurrentNode_ $> [DropCurrentNode] -- | Partially close the current node, such that 'insertElement' (and related -- element instructions) will still be inserted as children, but /every other/ -- type of node will instead be inserted as siblings. This doesn't actually -- change the internal state; a following call to 'closeCurrentNode' or similar -- is still required. softCloseCurrentNode_ :: [Patch] softCloseCurrentNode_ = [SoftCloseCurrentNode] -- | Remove the node the specified number of ancestors up the tree from the -- stack of open elements. See 'closeCurrentNode_' for the special case where -- the argument is @0@. closeAncestorNode_ :: TargetNode -> TreeBuilder [Patch] closeAncestorNode_ = flip closeAncestorNodes_ 1 -- | Remove several nodes the specified number of ancestors up the tree from -- the stack of open elements. See 'closeAncestorNode_' if only a single node -- needs to be closed. closeAncestorNodes_ :: TargetNode -> ReparentDepth -> TreeBuilder [Patch] closeAncestorNodes_ l d = do state <- N.S.get let (es1, es2) = splitAt (fromIntegral l) $ openElements state d' = fromIntegral d N.S.put $ state { openElements = es1 ++ drop d' es2 } return [CloseNodes . M.I.singleton (fromIntegral l) . fromIntegral . min d' $ length es2] -- | Close all markup tags up to and including the most recent @\@ element, -- throwing an 'UnexpectedElementWithImpliedEndTag' if one of them does not -- typically allow an implied end tag. If the error is not required, -- 'closeElement' provides similar behaviour. closePElement :: TreeBuilder [Patch] closePElement = do generate <- generateEndTags $ L.delete "p" impliedEndTags current <- currentNode let errF = if maybe True (nodeIsElement "p") current then id else consTreeError_ UnexpectedElementWithImpliedEndTag p <- closeElement "p" return $ generate ++ errF p -- | Close all markup tags in a given list, until reaching an element which is -- not in that list. For the inverse (closing all tags /except/ what's -- listed), see 'clearToContext'. -- -- 'impliedEndTags' and 'thoroughlyImpliedEndTags' provide the typical inputs. generateEndTags :: [ElementName] -> TreeBuilder [Patch] generateEndTags tags = clear countImpliable where countImpliable [] = 0 countImpliable (d:ds) = if elem (elementName d) tags then succ $ countImpliable ds else 0 -- | __HTML:__ -- @[generate implied end tags] -- (https://html.spec.whatwg.org/multipage/parsing.html#generate-implied-end-tags)@ -- -- The list of elements to pass 'generateEndTags' when closing all markup tags -- which typically allow their end tag to be omitted. impliedEndTags :: [ElementName] impliedEndTags = [ "dd" , "dt" , "li" , "optgroup" , "option" , "p" , "rb" , "rp" , "rt" , "rtc" ] -- | __HTML:__ -- @[generate all implied end tags thoroughly] -- (https://html.spec.whatwg.org/multipage/parsing.html#generate-all-implied-end-tags-thoroughly)@ -- -- The list of elements to pass 'generateEndTags' when closing all markup tags, -- including ones which may not usually allow their end tag to be omitted. thoroughlyImpliedEndTags :: [ElementName] thoroughlyImpliedEndTags = [ "caption" , "colgroup" , "tbody" , "td" , "tfoot" , "th" , "thead" , "tr" ] ++ impliedEndTags -- | __HTML:__ -- @[create an element for the token] -- (https://html.spec.whatwg.org/multipage/parsing.html#create-an-element-for-the-token)@ -- -- Reserve a place in the tree for a given node, but do not yet add it. Note -- that this does not otherwise affect the parser state, so any modification of -- open elements or updating of the 'currentNodeNamespace', for example, needs -- to be performed manually. For that reason, 'insertElement_' is almost -- always more desirable. createElement :: ElementParams -> TreeBuilder (NodeIndex, ElementParams) createElement d = do state <- N.S.get let index = elementIndex state N.S.put $ state { elementIndex = succ index } return (index, d) -- | __HTML:__ -- @[insert a character] -- (https://html.spec.whatwg.org/multipage/parsing.html#insert-a-character)@ -- steps 1-3 -- -- If the wrapped token is a 'Character', add it to the final tree; fails if -- it's not. Note that the concatenation of character into 'Text' nodes occurs -- in the patch folding logic instead. insertCharacter :: TreeInput -> TreeBuilder TreeOutput insertCharacter t' = case tokenOut t' of Character c -> return $ TreeOutput { treePatches = [InsertCharacter (tokenErrs t') c] , treeState = tokenState t' } _ -> A.empty -- | __HTML:__ -- @[insert a comment] -- (https://html.spec.whatwg.org/multipage/parsing.html#insert-a-comment)@ -- with no explicit insertion postion -- -- If the wrapped token is a 'Tokenize.Comment', add it to the final tree at the current -- position (@'RelativeLocation' 0@); fails if it's not. insertComment :: TreeInput -> TreeBuilder TreeOutput insertComment = insertComment' $ RelativeLocation 0 -- | __HTML:__ -- @[insert a comment] -- (https://html.spec.whatwg.org/multipage/parsing.html#insert-a-comment)@ -- -- If the wrapped token is a 'Tokenize.Comment', add it to the final tree at -- the position specified; fails if it's not. insertComment' :: InsertAt -> TreeInput -> TreeBuilder TreeOutput insertComment' at t' = case tokenOut t' of Tokenize.Comment c -> return $ TreeOutput { treePatches = [InsertComment (tokenErrs t') at c] , treeState = tokenState t' } _ -> A.empty -- | If the wrapped token is a 'Tokenize.Comment', add it to the final tree; -- fails if it's not. insertDoctype :: TreeInput -> TreeBuilder TreeOutput insertDoctype t' = case tokenOut t' of Doctype d -> let system = doctypeSystemId d legacy | doctypeName d /= Just "html" = True | Y.isJust $ doctypePublicId d = True | Y.isJust system && system /= Just "about:legacy-compat" = True | otherwise = False errs' | legacy = LegacyDoctype : tokenErrs t' | otherwise = tokenErrs t' in packTree t' [InsertAndSetDocumentType errs' $ tokenDocumentType t'] _ -> A.empty -- | __HTML:__ -- @[insert an HTML element] -- (https://html.spec.whatwg.org/multipage/parsing.html#insert-an-html-element)@ -- -- If the wrapped token is a 'StartTag', add it to the final tree as a markup -- element in the HTML namespace; fails if it's not. -- -- For tag data generated /a priori/, use 'insertElement_' instead. insertElement :: TreeInput -> TreeBuilder TreeOutput insertElement = insertForeignElement htmlNamespace -- | Add a markup element described by the input record to the tree in the HTML -- namespace. -- -- If the tag data was obtained from the tokenizer, use 'insertElement' instead. insertElement_ :: TagParams -> TreeBuilder [Patch] insertElement_ d = treePatches <$> insertElement (dummyToken [] $ StartTag d) -- | __HTML:__ -- @[insert a foreign element] -- (https://html.spec.whatwg.org/multipage/parsing.html#insert-a-foreign-element)@ -- -- If the wrapped token is a 'StartTag', add it to the final tree as a markup -- element in the specified namespace; fails if it's not. insertForeignElement :: Namespace -> TreeInput -> TreeBuilder TreeOutput insertForeignElement ns t' = case tokenOut t' of StartTag d -> do let d' = packNodeData (Just ns) d errs' | tagIsSelfClosing d = NonVoidHtmlElementStartTagWithTrailingSolidus : tokenErrs t' | otherwise = tokenErrs t' e <- createElement d' N.S.modify $ \state -> state { openElements = e : openElements state } return $ TreeOutput { treePatches = [InsertElement errs' $ adjustAttributes d'] , treeState = tokenState $ mapTokenState t' setNamespace } _ -> A.empty where adjustAttributes | ns == htmlNamespace = id | otherwise = adjustForeignAttributes setNamespace state = state { currentNodeNamespace = Just ns } -- | If the wrapped token is a 'StartTag', add it to the final tree as a markup -- element in the HTML namespace, and then immediately close it; fails if it's -- not. -- -- For tag data generated /a priori/, use 'insertNullElement_' instead. insertNullElement :: TreeInput -> TreeBuilder TreeOutput insertNullElement = insertForeignNullElement htmlNamespace -- | Add a markup element described by the input record to the tree in the HTML -- namespace, and then immediately close it. -- -- If the tag data was obtained from the tokenizer, use 'insertNullElement' -- instead. insertNullElement_ :: TagParams -> TreeBuilder [Patch] insertNullElement_ d = treePatches <$> insertNullElement (dummyToken [] $ StartTag d) -- | If the wrapped token is a 'StartTag', add it to the final tree as a markup -- element in the specified namespace, and then immediately close it; fails if -- it's not. insertForeignNullElement :: Namespace -> TreeInput -> TreeBuilder TreeOutput insertForeignNullElement ns t' = case tokenOut t' of StartTag d -> return $ TreeOutput { treePatches = [ InsertElement (tokenErrs t') . adjustAttributes $ packNodeData (Just ns) d , CloseNodes $ M.I.singleton 0 1 ] , treeState = tokenState t' } _ -> A.empty where adjustAttributes | ns == htmlNamespace = id | otherwise = adjustForeignAttributes -- | If the wrapped token is a 'StartTag', add it to the final tree as a markup -- element in the HTML namespace and set it as the target of the head element -- pointer; fails if it's not. -- -- For tag data generated /a priori/, use 'insertHeadElement_' instead. insertHeadElement :: TreeInput -> TreeBuilder TreeOutput insertHeadElement t' = do N.S.modify $ \state -> state { headElementPointer = Just $ elementIndex state } insertElement t' -- | Add a markup element described by the input record to the tree in the HTML -- namespace and set it as the target of the head element pointer. -- -- If the tag data was obtained from the tokenizer, use 'insertElement' instead. insertHeadElement_ :: TagParams -> TreeBuilder [Patch] insertHeadElement_ d = do N.S.modify $ \state -> state { headElementPointer = Just $ elementIndex state } insertElement_ d -- | Add an extra point of metadata to the indicated markup element, if that -- element doesn't already have an attribute with that name. addAttribute :: InsertAt -> NodeIndex -> BasicAttribute -> TreeBuilder [Patch] addAttribute at i (name, value) = do state <- N.S.get let (es1, es2) = L.break ((==) i . fst) $ openElements state case es2 of ((_, e):es') | not (elem name . map attrName . toAttrList $ elementAttributes e) -> do N.S.put $ state { openElements = es1 ++ (i, addAttributeData e) : es' } return [AddAttribute at attr] _ -> return [] where addAttributeData d = d { elementAttributes = insertAttribute attr $ elementAttributes d } attr = emptyAttributeParams { attrName = name , attrValue = value } -- | __HTML:__ -- @[push onto the list of active formatting elements] -- (https://html.spec.whatwg.org/multipage/parsing.html#push-onto-the-list-of-active-formatting-elements)@ -- -- Add a markup element described by the input record to the tree in the HTML -- namespace. The element is also added to the set of elements which are -- recreated on overlapping markup spans via 'reconstructFormattingElements'. insertFormattingElement :: TreeInput -> TreeBuilder TreeOutput insertFormattingElement t' = do insert <- insertElement t' state <- N.S.get case Y.listToMaybe $ openElements state of Just e -> case tokenOut t' of StartTag d -> N.S.put $ state { activeFormattingElements = pushFormattingElement (fst e, d) $ activeFormattingElements state } _ -> return () Nothing -> return () return insert where pushFormattingElement e [] = [[e]] pushFormattingElement e (fs:fss) = (e : fs') : fss where fs' = case L.findIndices (equalElement . snd) fs of (_:_:i:is) -> case splitAts (i : is) fs of [] -> [] (ds:dss) -> ds ++ concatMap (drop 1) dss _ -> fs splitAts [] dss = [dss] splitAts (i:is) dss = let (ds', dss') = splitAt i dss in ds' : splitAts (map (`subtract` i) is) dss' equalElement e' = tagName (snd e) == tagName e' -- This is only ever invoked on HTML elements, so the -- namespaces shouldn't need to be checked for equality. && tagAttributes (snd e) == tagAttributes e' -- | __HTML:__ -- @[reconstruct the active formatting elements] -- (https://html.spec.whatwg.org/multipage/parsing.html#reconstruct-the-active-formatting-elements)@ -- -- Create new tokens for each of the yet-unclosed elements created by -- 'insertFormattingElement', within the scope defined by -- 'insertFormattingMarker'. reconstructFormattingElements :: TreeBuilder [Patch] reconstructFormattingElements = do state <- N.S.get let open = map fst $ openElements state (es, ess) = case activeFormattingElements state of [] -> ([], []) (es':ess') -> (es', ess') (toRebuild, alreadyOpened) = span (\(i, _) -> notElem i open) es N.S.put $ state { activeFormattingElements = alreadyOpened : ess } N.foldM reconstruct [] $ reverse toRebuild where reconstruct pss (_, d) = do ps <- treePatches <$> insertFormattingElement (dummyToken [] $ StartTag d) return $ pss ++ ps -- | __HTML:__ -- @[change the encoding] -- (https://html.spec.whatwg.org/multipage/parsing.html#stop-parsing)@, -- step 6 -- -- Restore the tracked state to pristine condition, issue an instruction to -- throw out any 'Tree' generated thus far, and set the given binary stream to -- be used when the parser next resumes. restartParsing :: BS.L.ByteString -> TreeBuilder TreeOutput restartParsing initial = do N.S.put $ treeParserState defState return $ TreeOutput { treePatches = [RestartParsing] , treeState = Just (tokState, BS.L.toStrict initial) } where updateDecoder state = tokenizerState state & case decoderState_ $ tokenizerState state of Left initialize -> tokenizerEncoding $ fmap Just initialize Right decState -> tokenizerEncoding . Right $ fmap decoderEncoding decState defState = defaultTreeState tokState = updateDecoder defState -- | __HTML:__ -- @[stop parsing] -- (https://html.spec.whatwg.org/multipage/parsing.html#stop-parsing)@ -- -- Close all remaining open elements, and perform other cleanup functions to -- finalize the document tree. stopParsing :: TreeInput -> TreeBuilder TreeOutput stopParsing t' = clear (fromIntegral . length) >>= packTree t'