{-# HLINT ignore "Redundant flip" #-} {-| Description: Fold a linear, semantic stream into a tree structure. Copyright: (c) 2020-2021 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: provisional Portability: portable This module and the internal branch it heads implement the "Tree Construction" section of the __[HTML](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction)__ document parsing specification, operating over the output of the "Web.Mangrove.Parse.Tokenize" stage to produce a DOM tree representation of a web page. As this library is still in the early stages of development, the representation produced here is not actually a proper DOM implementation, but instead only stores basic parameters in an equivalent (but less-featured) structure. Nonetheless, it is still enough for basic evaluation and unstyled rendering. -} module Web.Mangrove.Parse.Tree ( -- * Types -- ** Final Tree ( .. ) , Node ( .. ) , QuirksMode ( .. ) -- ** Intermediate , Patch , TreeState , Encoding ( .. ) , NodeIndex , ElementParams ( .. ) , emptyElementParams -- * Initialization , defaultTreeState , treeEncoding , treeFragment , treeInIFrame -- * Transformations , tree , treeStep , finalizeTree ) where import qualified Control.Monad.Trans.State as N.S import qualified Data.ByteString as BS import qualified Data.ByteString.Short as BS.SH import qualified Data.List as L import qualified Data.Maybe as Y import qualified Data.Text as T import Web.Willow.DOM import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Tokenize import Web.Mangrove.Parse.Tokenize.Common import Web.Mangrove.Parse.Tree.Common import Web.Mangrove.Parse.Tree.Dispatcher import Web.Mangrove.Parse.Tree.Patch import Web.Mangrove.Parse.Tree.Patch.Fold import Web.Willow.Common.Encoding hiding ( setRemainder ) import Web.Willow.Common.Encoding.Sniffer import Web.Willow.Common.Parser -- | __HTML:__ -- @[tree construction] -- (https://html.spec.whatwg.org/multipage/parsing.html#tree-construction)@ -- -- Given a starting environment, transform a binary document stream into a -- hierarchical markup tree. If the parse fails, returns an empty tree (a -- 'Document' node with no children). tree :: TreeState -> BS.ByteString -> ([Patch], TreeState) tree state stream = L.foldl' treeFold ([], state) $ L.unfoldr treeUnfold (state, stream) where treeUnfold = unfoldLoop treeStep tokenizerState treeFold (ps, _) (ps', state', _) = (ps ++ ps', state') -- | Wrap a parser in a signature appropriate to pass to 'L.unfoldr'. unfoldLoop :: Eq state => (state -> BS.ByteString -> ([out], state, BS.ByteString)) -- ^ The single-step parser function. -> (state -> TokenizerState) -- ^ Repack the state used by the step parser into a standardized form. -> (state, BS.ByteString) -- ^ The initial inputs to the parser. -> Maybe (([out], state, BS.ByteString), (state, BS.ByteString)) unfoldLoop step toTokState (state, stream) | BS.null stream = Nothing | otherwise = case step state stream of out@(_, state', _) | state /= state' || hasRemainder state' -> continueUnfold out ([], _, stream') | BS.null stream' -> Nothing out -> continueUnfold out where continueUnfold (ps, state', stream') = Just ((ps, state', stream'), (state', stream')) hasRemainder = maybe False (not . BS.SH.null . decoderRemainder) . decoderState . toTokState -- | Parse a minimal number of tokens from a binary document stream, into a -- state-independent sequence of folding instructions. Returns all data -- required to seamlessly resume parsing. treeStep :: TreeState -> BS.ByteString -> ([Patch], TreeState, BS.ByteString) treeStep state stream = treeStep' stream' stateRemainder state where stateRemainder state' = state' { tokenizerState = setRemainder (BS.SH.toShort stream) $ tokenizerState state' } stream' = L.unfoldr tokenUnfold (tokenizerState state, stream) tokenUnfold = unfoldLoop tokenizeStep id -- | Parse a minimal number of tokens from a binary document stream, performing -- any backend processing required to correctly generate the document tree. treeStep' :: [([([ParseError], Token)], TokenizerState, BS.ByteString)] -> (TreeState -> TreeState) -> TreeState -> ([Patch], TreeState, BS.ByteString) treeStep' input fallback state = case runParserT (N.S.runStateT recurse $ treeParserState state) stream of Just (((ps, tokState, stream'), parserState), _) -> (ps', state', stream') where state' = TreeState { tokenizerState = tokState , treeParserState = parserState } ps' = map redirectPatches ps Nothing -> ([], fallback state, BS.empty) where stream = L.foldr repackStream [] input redirectPatches p = case fragmentContext $ treeParserState state of -- A fragment with an @\@ context won't create that node in -- the folded tree, and so any patches sent there won't find an -- anchor. Just ctx | nodeIsElement (T.pack "html") $ fst ctx -> case p of InsertComment errs InDocument _ -> ErrorList errs InsertComment errs InHtmlElement txt -> InsertComment errs InDocument txt AddAttribute InDocument _ -> ErrorList [] AddAttribute InHtmlElement attr -> AddAttribute InDocument attr _ -> p -- The fragment parsing algorithm returns the children of the -- context node, and so any patches destined for locations above -- that shouldn't show up in the resulting tree. Just _ -> case p of InsertComment errs InDocument _ -> ErrorList errs InsertComment errs InHtmlElement _ -> ErrorList errs AddAttribute InDocument _ -> ErrorList [] AddAttribute InHtmlElement _ -> ErrorList [] _ -> p _ -> p -- | Explicitly indicate that the input stream will not contain any further -- bytes, and perform any finalization processing based on that. finalizeTree :: [Patch] -> TreeState -> Tree finalizeTree ps state = buildTree $ ps ++ ps' where (ps', _, _) = treeStep' [(ts, setRemainder BS.SH.empty tokState, BS.empty)] id state tokState = tokenizerState state ts = finalizeTokenizer tokState ++ [([], EndOfStream)] -- | Specify the encoding scheme a given parse environment should use to read -- from the binary document stream. Note that this will always use the initial -- state for the respective decoder; intermediate states as returned by -- 'decodeStep' are not supported. treeEncoding :: Either SnifferEnvironment (Maybe Encoding) -> TreeState -> TreeState treeEncoding enc state = state { tokenizerState = tokenizerEncoding enc $ tokenizerState state } -- | __HTML:__ -- @[fragment parsing algorithm] -- (https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm)@ -- -- Transform a given parse environment by adding context for an embedded but -- separate document fragment. Calling this with an intermediate state -- returned by 'treeStep' (as opposed to an initial state from -- 'defaultTreeState') may result in an unexpected tree structure. treeFragment :: ElementParams -- ^ __HTML:__ -- @[context element] -- (https://html.spec.whatwg.org/multipage/parsing.html#concept-frag-parse-context)@ -- -- The node wrapping -- in one way or another -- the embedded document fragment. -> [(NodeIndex, ElementParams)] -- ^ The ancestors of the context element, most immediate first. -> Maybe QuirksMode -- ^ The degree of backwards compatibility used in the node document of -- the context element, if it can be determined. -> Maybe Bool -- ^ Whether the node document of the context element has been parsed -- in a way which would require scripting to be enabled (@'Just' -- 'True'@) or disabled (@'Just' 'False'@). -> TreeState -> TreeState treeFragment ctxNode ctxTree ctxQuirks ctxScript state = state { tokenizerState = let mode = if elementNamespace ctxNode == Just htmlNamespace then case T.unpack $ elementName ctxNode of "title" -> RCDataState "textarea" -> RCDataState "style" -> RawTextState "xmp" -> RawTextState "iframe" -> RawTextState "noembed" -> RawTextState "noframes" -> RawTextState "script" -> ScriptDataState "noscript" -> if ctxScript == Just True then RawTextState else DataState "plaintext" -> PlainTextState _ -> DataState else DataState in tokenizerStartTag (Just htmlNamespace) (T.pack "html") . tokenizerMode mode $ tokenizerState state , treeParserState = resetInsertionMode' $ (treeParserState state) { openElements = [(0, htmlElement)] , elementIndex = ctxIndex + 1 , quirksMode = Y.fromMaybe NoQuirks ctxQuirks , templateInsertionModes = if nodeIsElement (T.pack "template") ctxNode then [InTemplate] else [] , fragmentContext = Just (ctxNode, ctxTree) , scriptingEnabled = Y.fromMaybe False ctxScript , formElementPointer = fmap fst . L.find (nodeIsElement (T.pack "form") . snd) $ (ctxIndex, ctxNode) : ctxTree } } where ctxIndex = foldr (max . fst) 0 ctxTree + 1 htmlElement = emptyElementParams { elementName = T.pack "html" , elementNamespace = Just htmlNamespace } -- | Specify whether the given parse environment should be treated as if the -- document were contained within the @srcdoc@ attribute of an @\@ -- element ('False' by default). treeInIFrame :: Bool -> TreeState -> TreeState treeInIFrame b state = state { treeParserState = (treeParserState state) { isInIFrameSrcDoc = b } } -- | Given the output of 'tokenizeStep', rewrap the token list and single state -- into a single uniform stream. The final returned tuple will have a 'Just' -- value containing the single state, while every other element has 'Nothing' -- (indicating a point where reentry is impossible). repackStream :: ([([ParseError], Token)], TokenizerState, BS.ByteString) -> [TreeInput] -> [TreeInput] repackStream ([], _, _) is = is repackStream (ts, state, stream) is = case reverse ts of -- Be sure any 'EndOfStream' tokens encountered mid-stream don't break parsing. ((errs, EndOfStream):ts') -> case is of [] -> repackStream' ts (i:is') -> repackStream' (reverse ts') ++ consErrors errs i : is' -- Any other token ending the segment, however, can simply be repacked. _ -> repackStream' ts ++ is where repackStream' = flip foldr [] $ \(errs, t) ts' -> TreeInput { tokenErrs = errs , tokenOut = t , tokenState = if null ts' then Just (state, stream) else Nothing } : ts' consErrors errs i = i { tokenErrs = errs ++ tokenErrs i } -- | Loop the tree dispatcher until it returns a patchset which happens to -- coincide with a tokenizer breakpoint. Relies on lazy evaluation in the -- stream generation to avoid forcing the entire thing at once, while still -- retaining the capability to consume as much input as necessary to get the -- parsers to line up. recurse :: TreeBuilder ([Patch], TokenizerState, BS.ByteString) recurse = do out <- dispatcher case treeState out of -- The tokenizer can't provide a breakpoint at the current token. Nothing -> do (out', tokState', stream') <- recurse return (treePatches out ++ out', tokState', stream') -- We have a re-entrant state to seamlessly resume the tokenizer. Just (tokState, stream) -> return (treePatches out, tokState, stream)