{-# HLINT ignore "Redundant flip" #-}

{-|
Description:    Fold a linear, semantic stream into a tree structure.

Copyright:      (c) 2020-2021 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      provisional
Portability:    portable

This module and the internal branch it heads implement the "Tree Construction"
section of the
__[HTML](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction)__
document parsing specification, operating over the output of the
"Web.Mangrove.Parse.Tokenize" stage to produce a DOM tree representation of a
web page.  As this library is still in the early stages of development, the
representation produced here is not actually a proper DOM implementation, but
instead only stores basic parameters in an equivalent (but less-featured)
structure.  Nonetheless, it is still enough for basic evaluation and unstyled
rendering.
-}
module Web.Mangrove.Parse.Tree
    ( -- * Types
      -- ** Final
      Tree ( .. )
    , Node ( .. )
    , QuirksMode ( .. )
      -- ** Intermediate
    , Patch
    , TreeState
    , Encoding ( .. )
    , NodeIndex
    , ElementParams ( .. )
    , emptyElementParams
      -- * Initialization
    , defaultTreeState
    , treeEncoding
    , treeFragment
    , treeInIFrame
      -- * Transformations
    , tree
    , treeStep
    , finalizeTree
    ) where


import qualified Control.Monad.Trans.State as N.S

import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.List as L
import qualified Data.Maybe as Y
import qualified Data.Text as T

import Web.Willow.DOM

import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Tokenize
import Web.Mangrove.Parse.Tokenize.Common
import Web.Mangrove.Parse.Tree.Common
import Web.Mangrove.Parse.Tree.Dispatcher
import Web.Mangrove.Parse.Tree.Patch
import Web.Mangrove.Parse.Tree.Patch.Fold
import Web.Willow.Common.Encoding hiding ( setRemainder )
import Web.Willow.Common.Encoding.Sniffer
import Web.Willow.Common.Parser


-- | __HTML:__
--      @[tree construction]
--      (https://html.spec.whatwg.org/multipage/parsing.html#tree-construction)@
-- 
-- Given a starting environment, transform a binary document stream into a
-- hierarchical markup tree.  If the parse fails, returns an empty tree (a
-- 'Document' node with no children).
tree :: TreeState -> BS.ByteString -> ([Patch], TreeState)
tree state stream = L.foldl' treeFold ([], state) $ L.unfoldr treeUnfold (state, stream)
  where treeUnfold = unfoldLoop treeStep tokenizerState
        treeFold (ps, _) (ps', state', _) = (ps ++ ps', state')

-- | Wrap a parser in a signature appropriate to pass to 'L.unfoldr'.
unfoldLoop
    :: Eq state
    => (state -> BS.ByteString -> ([out], state, BS.ByteString))
        -- ^ The single-step parser function.
    -> (state -> TokenizerState)
        -- ^ Repack the state used by the step parser into a standardized form.
    -> (state, BS.ByteString)
        -- ^ The initial inputs to the parser.
    -> Maybe (([out], state, BS.ByteString), (state, BS.ByteString))
unfoldLoop step toTokState (state, stream)
    | BS.null stream = Nothing
    | otherwise = case step state stream of
        out@(_, state', _) | state /= state' || hasRemainder state' -> continueUnfold out
        ([], _, stream') | BS.null stream' -> Nothing
        out -> continueUnfold out
  where continueUnfold (ps, state', stream') = Just ((ps, state', stream'), (state', stream'))
        hasRemainder = maybe False (not . BS.SH.null . decoderRemainder) . decoderState . toTokState

-- | Parse a minimal number of tokens from a binary document stream, into a
-- state-independent sequence of folding instructions.  Returns all data
-- required to seamlessly resume parsing.
treeStep :: TreeState -> BS.ByteString -> ([Patch], TreeState, BS.ByteString)
treeStep state stream = treeStep' stream' stateRemainder state
  where stateRemainder state' = state'
            { tokenizerState = setRemainder (BS.SH.toShort stream) $ tokenizerState state'
            }
        stream' = L.unfoldr tokenUnfold (tokenizerState state, stream)
        tokenUnfold = unfoldLoop tokenizeStep id

-- | Parse a minimal number of tokens from a binary document stream, performing
-- any backend processing required to correctly generate the document tree.
treeStep'
    :: [([([ParseError], Token)], TokenizerState, BS.ByteString)]
    -> (TreeState -> TreeState)
    -> TreeState
    -> ([Patch], TreeState, BS.ByteString)
treeStep' input fallback state =
    case runParserT (N.S.runStateT recurse $ treeParserState state) stream of
        Just (((ps, tokState, stream'), parserState), _) -> (ps', state', stream')
          where state' = TreeState
                    { tokenizerState = tokState
                    , treeParserState = parserState
                    }
                ps' = map redirectPatches ps
        Nothing -> ([], fallback state, BS.empty)
  where stream = L.foldr repackStream [] input
        redirectPatches p = case fragmentContext $ treeParserState state of
            -- A fragment with an @\<html\>@ context won't create that node in
            -- the folded tree, and so any patches sent there won't find an
            -- anchor.
            Just ctx | nodeIsElement (T.pack "html") $ fst ctx -> case p of
                InsertComment errs InDocument _ -> ErrorList errs
                InsertComment errs InHtmlElement txt -> InsertComment errs InDocument txt
                AddAttribute InDocument _ -> ErrorList []
                AddAttribute InHtmlElement attr -> AddAttribute InDocument attr
                _ -> p
            -- The fragment parsing algorithm returns the children of the
            -- context node, and so any patches destined for locations above
            -- that shouldn't show up in the resulting tree.
            Just _ -> case p of
                InsertComment errs InDocument _ -> ErrorList errs
                InsertComment errs InHtmlElement _ -> ErrorList errs
                AddAttribute InDocument _ -> ErrorList []
                AddAttribute InHtmlElement _ -> ErrorList []
                _ -> p
            _ -> p

-- | Explicitly indicate that the input stream will not contain any further
-- bytes, and perform any finalization processing based on that.
finalizeTree :: [Patch] -> TreeState -> Tree
finalizeTree ps state = buildTree $ ps ++ ps'
  where (ps', _, _) = treeStep' [(ts, setRemainder BS.SH.empty tokState, BS.empty)] id state
        tokState = tokenizerState state
        ts = finalizeTokenizer tokState ++ [([], EndOfStream)]


-- | Specify the encoding scheme a given parse environment should use to read
-- from the binary document stream.  Note that this will always use the initial
-- state for the respective decoder; intermediate states as returned by
-- 'decodeStep' are not supported.
treeEncoding :: Either SnifferEnvironment (Maybe Encoding) -> TreeState -> TreeState
treeEncoding enc state = state
    { tokenizerState = tokenizerEncoding enc $ tokenizerState state
    }

-- | __HTML:__
--      @[fragment parsing algorithm]
--      (https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm)@
-- 
-- Transform a given parse environment by adding context for an embedded but
-- separate document fragment.  Calling this with an intermediate state
-- returned by 'treeStep' (as opposed to an initial state from
-- 'defaultTreeState') may result in an unexpected tree structure.
treeFragment
    :: ElementParams
        -- ^ __HTML:__
        --      @[context element]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#concept-frag-parse-context)@
        -- 
        -- The node wrapping -- in one way or another -- the embedded document fragment.
    -> [(NodeIndex, ElementParams)]
        -- ^ The ancestors of the context element, most immediate first.
    -> Maybe QuirksMode
        -- ^ The degree of backwards compatibility used in the node document of
        -- the context element, if it can be determined.
    -> Maybe Bool
        -- ^ Whether the node document of the context element has been parsed
        -- in a way which would require scripting to be enabled (@'Just'
        -- 'True'@) or disabled (@'Just' 'False'@).
    -> TreeState
    -> TreeState
treeFragment ctxNode ctxTree ctxQuirks ctxScript state = state
    { tokenizerState =
        let mode = if elementNamespace ctxNode == Just htmlNamespace
                then case T.unpack $ elementName ctxNode of
                    "title" -> RCDataState
                    "textarea" -> RCDataState
                    "style" -> RawTextState
                    "xmp" -> RawTextState
                    "iframe" -> RawTextState
                    "noembed" -> RawTextState
                    "noframes" -> RawTextState
                    "script" -> ScriptDataState
                    "noscript" -> if ctxScript == Just True
                        then RawTextState
                        else DataState
                    "plaintext" -> PlainTextState
                    _ -> DataState
                else DataState
        in  tokenizerStartTag (Just htmlNamespace) (T.pack "html") .
            tokenizerMode mode $ tokenizerState state
    , treeParserState = resetInsertionMode' $ (treeParserState state)
        { openElements = [(0, htmlElement)]
        , elementIndex = ctxIndex + 1
        , quirksMode = Y.fromMaybe NoQuirks ctxQuirks
        , templateInsertionModes = if nodeIsElement (T.pack "template") ctxNode
            then [InTemplate]
            else []
        , fragmentContext = Just (ctxNode, ctxTree)
        , scriptingEnabled = Y.fromMaybe False ctxScript
        , formElementPointer = fmap fst . L.find (nodeIsElement (T.pack "form") . snd) $
            (ctxIndex, ctxNode) : ctxTree
        }
    }
  where ctxIndex = foldr (max . fst) 0 ctxTree + 1
        htmlElement = emptyElementParams
            { elementName = T.pack "html"
            , elementNamespace = Just htmlNamespace
            }

-- | Specify whether the given parse environment should be treated as if the
-- document were contained within the @srcdoc@ attribute of an @\<iframe\>@
-- element ('False' by default).
treeInIFrame :: Bool -> TreeState -> TreeState
treeInIFrame b state = state
    { treeParserState = (treeParserState state)
        { isInIFrameSrcDoc = b
        }
    }


-- | Given the output of 'tokenizeStep', rewrap the token list and single state
-- into a single uniform stream.  The final returned tuple will have a 'Just'
-- value containing the single state, while every other element has 'Nothing'
-- (indicating a point where reentry is impossible).
repackStream
    :: ([([ParseError], Token)], TokenizerState, BS.ByteString)
    -> [TreeInput]
    -> [TreeInput]
repackStream ([], _, _) is = is
repackStream (ts, state, stream) is = case reverse ts of
    -- Be sure any 'EndOfStream' tokens encountered mid-stream don't break parsing.
    ((errs, EndOfStream):ts') -> case is of
        [] -> repackStream' ts
        (i:is') -> repackStream' (reverse ts') ++ consErrors errs i : is'
    -- Any other token ending the segment, however, can simply be repacked.
    _ -> repackStream' ts ++ is
  where repackStream' = flip foldr [] $ \(errs, t) ts' -> TreeInput
            { tokenErrs = errs
            , tokenOut = t
            , tokenState = if null ts'
                then Just (state, stream)
                else Nothing
            } : ts'
        consErrors errs i = i
            { tokenErrs = errs ++ tokenErrs i
            }


-- | Loop the tree dispatcher until it returns a patchset which happens to
-- coincide with a tokenizer breakpoint.  Relies on lazy evaluation in the
-- stream generation to avoid forcing the entire thing at once, while still
-- retaining the capability to consume as much input as necessary to get the
-- parsers to line up.
recurse :: TreeBuilder ([Patch], TokenizerState, BS.ByteString)
recurse = do
    out <- dispatcher
    case treeState out of
        -- The tokenizer can't provide a breakpoint at the current token.
        Nothing -> do
            (out', tokState', stream') <- recurse
            return (treePatches out ++ out', tokState', stream')
        -- We have a re-entrant state to seamlessly resume the tokenizer.
        Just (tokState, stream) ->
            return (treePatches out, tokState, stream)