{-|
Description:    Tokenization rules for characters comprising a markup element.

Copyright:      (c) 2020-2021 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      stable
Portability:    portable
-}
module Web.Mangrove.Parse.Tokenize.Tag
    ( tokenTagOpen
    , tokenBeforeAttributeName
    , tokenSelfClosingStartTag
    , tokenAppropriateEndTagLessThanSign
    , tokenAppropriateEndTagOpen
      -- * Types
    , TagParams' ( .. )
    , emptyTagData
    , packTagToken
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad.Trans.State as N.S

import qualified Data.Bifunctor as F.B
import qualified Data.List as L
import qualified Data.HashMap.Strict as M
import qualified Data.Maybe as Y
import qualified Data.Text as T

import Data.Functor ( ($>) )

import Web.Willow.DOM

import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Tokenize.Common
import Web.Mangrove.Parse.Tokenize.Character
import Web.Mangrove.Parse.Tokenize.Comment
import Web.Mangrove.Parse.Tokenize.Doctype
import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Parser.Util

import {-# SOURCE #-} Web.Mangrove.Parse.Tokenize.Dispatcher


-- | An intermediate form of 'TagParams' with unpacked parameter types to
-- facilitate construction.  Specifically, 'Char'-by-'Char' construction of a
-- 'String' followed by a single 'T.pack' is much faster than repeated @O(n)@
-- calls to 'T.cons'.  Values may be easily instantiated as updates to
-- @emptyTagData@.
data TagParams' = TagParams'
    { tagName' :: String
        -- ^ See 'tagName'.
    , tagIsSelfClosing' :: Bool
        -- ^ See 'tagIsSelfClosing'.
    , tagAttributes' :: [(String, String)]
        -- ^ See 'tagAttributes'.
    }
  deriving ( Eq, Show, Read )

-- | A sane default collection for easy record initialization; namely,
-- 'null's and 'False'.
emptyTagData :: TagParams'
emptyTagData = TagParams'
    { tagName' = ""
    , tagIsSelfClosing' = False
    , tagAttributes' = []
    }


-- | Wrap a given collection of data into the payload of a 'StartTag' or
-- 'EndTag', according to the given constructor.
packTagToken
    :: (TagParams -> Token)
    -> TokenizerOutput (Maybe TagParams')
    -> Tokenizer [TokenizerOutput Token]
packTagToken toToken d' = case tokenizedOut d' of
    Just tagData -> do
        let tok = toToken $ emptyTagParams
                { tagName = T.pack $ tagName' tagData
                , tagIsSelfClosing = tagIsSelfClosing' tagData
                , tagAttributes = attrs
                }
            (attrErrs, attrs) = packAttributes $ tagAttributes' tagData
        tok' <- emit (tokenizedErrs d' ++ attrErrs, tok)
        return [tok' { tokenizedState = tokenizedState d' }]
    Nothing -> case tokenizedErrs d' of
        [] -> return []
        errs -> do
            atEOS <- atEndOfStream <$> N.S.get
            consTokenErrorsList errs <$> if atEOS
                then Y.fromMaybe [] <$> A.optional dispatcher
                else dispatcher

-- | Wrap a loose collection of unpacked 'String's into a form suitable for
-- representing the 'tagAttributes' of a markup element, removing all but the
-- first occurrence of any single name.
-- 
-- Note that any nameless attributes at the head of the list are dropped, to
-- work around the edge case where @newAttr@ is called immediately following
-- the tag name; this shouldn't cause any issue as the specification doesn't
-- seem to output any such attributes in the first place.
packAttributes
    :: [(String, String)]
    -> ([ParseError], M.HashMap T.Text T.Text)
packAttributes = foldr filterAttr ([], M.empty) . L.sortOn fst . dropWhile (null . fst)

-- | Add a single attribute to the processed collection, potentially throwing a
-- 'DuplicateAttribute' warning (with the old value) if one with the same name
-- already exists.
filterAttr
    :: (String, String)
    -> ([ParseError], M.HashMap T.Text T.Text)
    -> ([ParseError], M.HashMap T.Text T.Text)
filterAttr (n, v) (errs, as)
    | M.null as = (errs, M.singleton name value)
    | otherwise = case M.lookup name as of
        Nothing -> (errs, M.insert name value as)
        Just value' -> (DuplicateAttribute (name, value') : errs, M.insert name value as)
  where name = T.pack n
        value = T.pack v


-- | Finalize the currently "active" attribute, so that any further calls to,
-- e.g., @consAttrValue@ will be applied to a different (initially empty) pair
-- instead.
newAttr :: TokenizerOutput (Maybe TagParams') -> TokenizerOutput (Maybe TagParams')
newAttr = fmap . fmap $ \d -> d
    { tagAttributes' = case tagAttributes' d of
        [] -> []
        as@(("", ""):_) -> as
        as -> ("", "") : as
    }

-- | Prepend a character to the unpacked key of the currently "active" metadata
-- pair in a markup element.
consAttrName :: Char -> TokenizerOutput (Maybe TagParams') -> TokenizerOutput (Maybe TagParams')
consAttrName c = fmap . fmap $ \d -> d
    { tagAttributes' = consAttrName' $ tagAttributes' d
    }
  where consAttrName' [] = [([c], "")]
        consAttrName' (a:as) = F.B.first (c :) a : as

-- | Prepend a character to the unpacked value of the currently "active"
-- metadata pair in a markup element.
consAttrValue :: Char -> TokenizerOutput (Maybe TagParams') -> TokenizerOutput (Maybe TagParams')
consAttrValue c = consAttrValueString [c]

-- | Prepend a character sequence to the unpacked value of the currently
-- "active" metadata pair in a markup element.
consAttrValueString :: String -> TokenizerOutput (Maybe TagParams') -> TokenizerOutput (Maybe TagParams')
consAttrValueString str = fmap . fmap $ \d -> d
    { tagAttributes' = consAttrValueString' $ tagAttributes' d
    }
  where consAttrValueString' [] = [("", str)]
        consAttrValueString' (a:as) = fmap (str ++) a : as


-- | __HTML:__
--      @[tag open state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state)@
-- 
-- The parsing instructions for after reading @"\<"@ in a section of the state
-- machine which allows markup declarations.
tokenTagOpen :: Tokenizer [TokenizerOutput Token]
tokenTagOpen = tokenizers (Just [([EOFBeforeTagName], Character '<')])
    [ ifs_ (== '!') tokenMarkupDeclarationOpen
    , ifs_ (== '/') tokenEndTagOpen
    , ifPush_ isAsciiAlpha $ tokenTagName >>= packTagToken StartTag
    , ifPush_ (== '?') $ (: []) . packCommentToken .
        consTokenError UnexpectedQuestionMarkInsteadOfTagName <$>
        tokenBogusComment
    , elsePush_ $ changeState DataState *>
        emit' ([InvalidFirstCharacterOfTagName], Character '<')
    ]

-- | __HTML:__
--      @[end tag open state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state)@
-- 
-- The parsing instructions for after reading @"\</"@ in a section of the state
-- machine which allows markup declarations.
tokenEndTagOpen :: Tokenizer [TokenizerOutput Token]
tokenEndTagOpen = tokenizers (Just recovery)
    [ ifPush_ isAsciiAlpha $ tokenTagName >>= packTagToken EndTag
    , ifs_ (== '>') $ do
        changeState DataState
        consTokenErrorsList [MissingEndTagName] <$> dispatcher
    , elsePush_ $ (: []) . packCommentToken . consTokenError InvalidFirstCharacterOfTagName <$>
        tokenBogusComment
    ]
  where recovery =
            [ ([], Character '<')
            , ([], Character '/')
            , ([EOFBeforeTagName], EndOfStream)
            ]

-- | __HTML:__
--      @[tag name state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state)@
-- 
-- The parsing instructions for after reading @"\<"@ followed by a letter in a
-- section of the state machine which allows markup declarations.
tokenTagName :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenTagName = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ isAsciiWhitespace tokenBeforeAttributeName
    , if_ (== '/') tokenSelfClosingStartTag
    , if_ (== '>') $ packToken ([], Just emptyTagData)
    , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consTagName replacementChar <$>
        tokenTagName
    , elseChar $ \c -> consTagName (toAsciiLower c) <$> tokenTagName
    ]
  where consTagName char = fmap . fmap $ \d -> d
            { tagName' = char : tagName' d
            }

-- | __HTML:__
--      @[before attribute name state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state)@
-- 
-- The parsing instructions for reading whitespace interspersed with key-value
-- metadata pairs in the markup tag section of the state machine.
tokenBeforeAttributeName :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenBeforeAttributeName = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ isAsciiWhitespace tokenBeforeAttributeName
    , ifPush_ (`elem` "/>") tokenAfterAttributeName
    , ifChar (== '=') $ \c -> consTokenError UnexpectedEqualsSignBeforeAttributeName . consAttrName c <$>
        tokenAttributeName
    , elsePush_ tokenAttributeName
    ]

-- | __HTML:__
--      @[attribute name state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state)@
-- 
-- The parsing instructions for reading the initial (key) part of a key-value
-- metadata pair in the markup tag section of the state machine.
tokenAttributeName :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenAttributeName = tokenizer (Just ([EOFInTag], Nothing))
    [ ifPush_ (`elem` "\t\n\f />") tokenAfterAttributeName
    , if_ (== '=') tokenBeforeAttributeValue
    , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consAttrName replacementChar <$>
        tokenAttributeName
    , ifChar (`elem` "\"'<") $ \c -> consTokenError UnexpectedCharacterInAttributeName . consAttrName c <$>
        tokenAttributeName
    , elseChar $ \c -> consAttrName (toAsciiLower c) <$> tokenAttributeName
    ]

-- | __HTML:__
--      @[after attribute name state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state)@
-- 
-- The parsing instructions for reading whitespace after the initial (key) part
-- of a key-value metadata pair in the markup tag section of the state machine.
tokenAfterAttributeName :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenAfterAttributeName = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ isAsciiWhitespace $ fmap newAttr tokenAfterAttributeName
    , if_ (== '/') $ fmap newAttr tokenSelfClosingStartTag
    , if_ (== '=') tokenBeforeAttributeValue
    , if_ (== '>') $ changeState DataState *> packToken ([], Just emptyTagData)
    , elsePush_ $ fmap newAttr tokenAttributeName
    ]

-- | __HTML:__
--      @[before attribute value state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state)@
-- 
-- The parsing instructions for reading whitespace after @"="@ following the
-- initial (key) part of a key-value metadata pair in the markup tag section of
-- the state machine.
tokenBeforeAttributeValue :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenBeforeAttributeValue = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ isAsciiWhitespace tokenBeforeAttributeValue
    , if_ (== '"') tokenAttributeValueDoubleQuoted
    , if_ (== '\'') tokenAttributeValueSingleQuoted
    , if_ (== '>') $ changeState DataState *>
        packToken ([MissingAttributeValue], Just emptyTagData)
    , elsePush_ tokenAttributeValueUnquoted
    ]

-- | __HTML:__
--      @[attribute value (double-quoted) state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted\)-state)@
-- 
-- The parsing instructions for reading the second (value) part of a key-value
-- metadata pair following a @'"'@ in the markup tag section of the state
-- machine.
tokenAttributeValueDoubleQuoted :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenAttributeValueDoubleQuoted = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ (== '"') tokenAfterAttributeValueQuoted
    , if_ (== '&') $ tokenAttributeCharacterReference tokenAttributeValueDoubleQuoted
    , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consAttrValue replacementChar <$>
        tokenAttributeValueDoubleQuoted
    , elseChar $ \c -> consAttrValue c <$> tokenAttributeValueDoubleQuoted
    ]

-- | __HTML:__
--      @[attribute value (single-quoted) state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted\)-state)@
-- 
-- The parsing instructions for reading the second (value) part of a key-value
-- metadata pair following a @'\''@ in the markup tag section of the state
-- machine.
tokenAttributeValueSingleQuoted :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenAttributeValueSingleQuoted = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ (== '\'') tokenAfterAttributeValueQuoted
    , if_ (== '&') $ tokenAttributeCharacterReference tokenAttributeValueSingleQuoted
    , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consAttrValue replacementChar <$>
        tokenAttributeValueSingleQuoted
    , elseChar $ \c -> consAttrValue c <$> tokenAttributeValueSingleQuoted
    ]

-- | __HTML:__
--      @[attribute value (unquoted) state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted\)-state)@
-- 
-- The parsing instructions for reading the second (value) part of a key-value
-- metadata pair directly following a @'='@ in the markup tag section of the
-- state machine.
tokenAttributeValueUnquoted :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenAttributeValueUnquoted = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ isAsciiWhitespace $ fmap newAttr tokenBeforeAttributeName
    , if_ (== '&') $ tokenAttributeCharacterReference tokenAttributeValueUnquoted
    , if_ (== '>') $ changeState DataState *> packToken ([], Just emptyTagData)
    , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consAttrValue replacementChar <$>
        tokenAttributeValueUnquoted
    , ifChar (`elem` "\"'<=`") $ \c ->
        consTokenError UnexpectedCharacterInUnquotedAttributeValue . consAttrValue c <$>
        tokenAttributeValueUnquoted
    , elseChar $ \c -> consAttrValue c <$> tokenAttributeValueUnquoted
    ]

-- | Resolve a character reference occurring within the second (value) part of
-- a key-value metadata pair, and continue parsing using the given tokenizer.
tokenAttributeCharacterReference
    :: Tokenizer (TokenizerOutput (Maybe TagParams'))
    -> Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenAttributeCharacterReference resume = do
    ref <- tokenCharacterReference True
    tag <- resume
    return . consTokenErrors (tokenizedErrs ref) $ consAttrValueString (tokenizedOut ref) tag

-- | __HTML:__
--      @[after attribute value (quoted) state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted\)-state)@
-- 
-- The parsing instructions for after reading a @'"'@ or @'\''@ matching the
-- opening mark of the second (value) part of a key-value metadata pair in the
-- markup tag section of the state machine.
tokenAfterAttributeValueQuoted :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenAfterAttributeValueQuoted = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ isAsciiWhitespace $ fmap newAttr tokenBeforeAttributeName
    , if_ (== '/') tokenSelfClosingStartTag
    , if_ (== '>') $ changeState DataState *> packToken ([], Just emptyTagData)
    , elsePush_ $ consTokenError MissingWhitespaceBetweenAttributes . newAttr <$>
        tokenBeforeAttributeName
    ]

-- | __HTML:__
--      @[self closing start tag state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state)@
-- 
-- The parsing instructions for after reading @"/"@ in the markup tag section
-- of the state machine.
tokenSelfClosingStartTag :: Tokenizer (TokenizerOutput (Maybe TagParams'))
tokenSelfClosingStartTag = tokenizer (Just ([EOFInTag], Nothing))
    [ if_ (== '>') $ changeState DataState *> packToken ([], Just $ emptyTagData
        { tagIsSelfClosing' = True
        })
    , elsePush_ $ consTokenError UnexpectedSolidusInTag <$> tokenBeforeAttributeName
    ]

-- | __HTML:__
--      @[bogus comment state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state)@
-- 
-- The parsing instructions for after reading @"\<?"@ in a section of the state
-- machine which allows markup declarations.
tokenBogusComment :: Tokenizer (TokenizerOutput String)
tokenBogusComment = tokenizer (Just ([], ""))
    [ if_ (== '>') $ changeState DataState *> packToken ([], "")
    , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consOut replacementChar <$>
        tokenBogusComment
    , elseChar $ \c -> consOut c <$> tokenBogusComment
    ]

-- | __HTML:__
--      @[markup declaration open state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state)@
-- 
-- The parsing instructions for after reading @"\<!"@ in a section of the state
-- machine which allows markup declarations.
tokenMarkupDeclarationOpen :: Tokenizer [TokenizerOutput Token]
tokenMarkupDeclarationOpen = do
    parserChoice <- choice
        [ chunk' id "--" *> return commentParser
        , chunk' toAsciiUpper "DOCTYPE" *> return doctypeParser
        , chunk' id "[CDATA[" *> return cdataParser
        , return failureParser
        ]
    parserChoice
  where commentParser = fmap (pure . packCommentToken) tokenCommentStart
        doctypeParser = fmap pure tokenDoctype
        cdataParser = do
            currentNamespace <- currentNodeNamespace <$> N.S.get
            case currentNamespace of
                Just ns | ns /= htmlNamespace ->
                    changeState CDataState $> []
                _ -> pure . packCommentToken .
                    consTokenError CDataInHtmlContent . consOuts "[CDATA[" <$>
                    tokenBogusComment
        failureParser = pure . packCommentToken .
            consTokenError IncorrectlyOpenedComment <$> tokenBogusComment


-- | __HTML:__
-- 
--    * [@RAWTEXT less-than sign state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state)
--    * [@RCDATA less-than sign state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state)
-- 
-- The parsing instructions for after reading @"\<"@ in sections of the state
-- machine which may be closed by tag, and which emit only 'Character' tokens.
tokenAppropriateEndTagLessThanSign :: Tokenizer [TokenizerOutput Token]
tokenAppropriateEndTagLessThanSign = tokenizers (Just [([], Character '<')])
    [ ifs_ (== '/') tokenAppropriateEndTagOpen
    , elsePush_ $ emit' ([], Character '<')
    ]

-- | __HTML:__
-- 
--    * [@RAWTEXT end tag open state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state)
--    * [@RCDATA end tag open state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state)
--    * [@script data end tag open state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state)
--    * [@script data escaped end tag open state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state)
-- 
-- The parsing instructions for after reading @"\</"@ in sections of the state
-- machine which may be closed by tag, and which emit only 'Character' tokens.
tokenAppropriateEndTagOpen :: Tokenizer [TokenizerOutput Token]
tokenAppropriateEndTagOpen = tokenizers (Just [([], Character '<'), ([], Character '/')])
    [ ifPush_ isAsciiAlpha tokenAppropriateEndTag
    , elsePush_ $ mapM emit
        [ ([], Character '<')
        , ([], Character '/')
        ]
    ]

-- | __HTML:__
-- 
--    * [@RAWTEXT end tag name state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state)
--    * [@RCDATA end tag name state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state)
--    * [@script data end tag name state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state)
--    * [@script data escaped end tag name state@]
--      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state)
-- 
-- The parsing instructions for after reading @"\</"@ in sections of the state
-- machine which may be closed by tag, and which emit only 'Character' tokens.
tokenAppropriateEndTag :: Tokenizer [TokenizerOutput Token]
tokenAppropriateEndTag = tokenAppropriateEndTagName >>= \t' -> do
    close <- appropriateEndTag (map toAsciiLower $ tokenizedOut t')
    if close
        then tokenizers (Just $ fallback t')
            [ ifs_ isAsciiWhitespace $
                tokenBeforeAttributeName >>= packName (tokenizedOut t')
            , ifs_ (== '/') $
                tokenSelfClosingStartTag >>= packName (tokenizedOut t')
            , ifs_ (== '>') $ do
                changeState DataState
                out <- packName (tokenizedOut t') $ t'
                    { tokenizedOut = Just emptyTagData
                    }
                -- 'packName' keeps the state of the input (i.e. /before/ the
                -- closing @>@), and so that needs to be cleared so 'ifs_' will
                -- assign the state of the input token (/after/ the closing
                -- @>@).
                return $ map continueState out
            , elsePush_ $ emits (tokenizedState t') (anythingElse (tokenizedErrs t') (tokenizedOut t'))
            ]
        else emits (tokenizedState t') (fallback t')
  where fallback t' = anythingElse (tokenizedErrs t') (tokenizedOut t')
        packName buffer = packTagToken $ EndTag . packName' (T.pack buffer)
        packName' buffer tagData = tagData
            { tagName = T.map toAsciiLower buffer
            }
        anythingElse errs buffer =
            [ ([], Character '<')
            , ([], Character '/')
            ] ++ case buffer of
                [] -> []
                (c:cs) -> (errs, Character c) : [([], Character c') | c' <- cs]
        emits state ts = finalStateList state <$> mapM emit ts
        tokenAppropriateEndTagName = tokenizer (Just ([], ""))
            [ ifChar isAsciiAlpha $ \c -> consOut c <$> tokenAppropriateEndTagName
            , elsePush_ $ packToken ([], "")
            ]