{-|
Description:    Extract basic semantic categories from a simple textual stream.
Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com
Stability:      provisional
Portability:    portable

This module and the internal branch it heads implement the "Tokenization"
section of the
__[HTML](https://html.spec.whatwg.org/multipage/parsing.html#tokenization)__
document parsing specification, processing a stream of text to add information
on, or group it by, semantic category.  This then allows the following stage
to base its logic on such higher-level concepts as "markup tag" or "comment"
without worrying about the (sometimes complex) escaping behaviour required to
parse them.
-}
module Web.Mangrove.Parse.Tokenize
    ( -- * Types
      -- ** Final
      Token ( .. )
    , BasicAttribute
    , TagParams ( .. )
    , emptyTagParams
    , DoctypeParams ( .. )
    , emptyDoctypeParams
      -- ** Intermediate
    , TokenizerState
    , CurrentTokenizerState ( .. )
    , Encoding ( .. )
      -- * Initialization
    , defaultTokenizerState
    , tokenizerMode
    , tokenizerStartTag
    , tokenizerEncoding
      -- * Transformations
    , tokenize
    , tokenizeStep
    , finalizeTokenizer
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad.Trans.State as N.S

import qualified Data.Bifunctor as F.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Maybe as Y

import Web.Willow.DOM

import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Encoding.Preprocess
import Web.Mangrove.Parse.Tokenize.Common hiding ( setRemainder )
import Web.Mangrove.Parse.Tokenize.Dispatcher

import Web.Willow.Common.Encoding
import Web.Willow.Common.Encoding.Sniffer
import Web.Willow.Common.Parser


-- | __HTML:__
--      @[tokenization]
--      (https://html.spec.whatwg.org/multipage/parsing.html#tokenization)@
--
-- Given a starting environment, transform a binary document stream into a
-- stream of semantic atoms.  If the parse fails, returns all tokens before
-- the one which caused the error, but any trailing bytes are silently
-- dropped.
tokenize :: TokenizerState -> BS.ByteString -> ([([ParseError], Token)], TokenizerState)
tokenize state stream = loop $ tokenizeStep state stream
  where
    loop (ts, state', output)
        | BS.null output = (ts, state')
        | otherwise = F.B.first (ts ++) . loop $ tokenizeStep state' output

-- | Parse a minimal number of bytes from an input stream, into a sequence of
-- semantic tokens.  Returns all data required to seamlessly resume parsing.
tokenizeStep
    :: TokenizerState
    -> BS.ByteString
    -> ([([ParseError], Token)], TokenizerState, BS.ByteString)
tokenizeStep state stream = case runParserT (N.S.runStateT dispatcher' state) stream of
    Just ((out, state'), stream') -> (out, state', stream')
    Nothing -> ([], stateEof, BS.empty)
  where
    stateEof = state
        { decoderState_ = Right $
            setRemainder (BS.SH.toShort stream) <$> decoderDefaultState state stream
        }

-- | Explicitly indicate that the input stream will not contain any further
-- bytes, and perform any finalization processing based on that.
finalizeTokenizer :: TokenizerState -> [([ParseError], Token)]
finalizeTokenizer state = fst $ tokenize state' BS.empty
  where
    state' = state
        { tokenParserState = (tokenParserState state)
            { atEndOfStream = True
            }
        }
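-- | A usage sketch, not part of the original module: one plausible way to
-- drive the transformations above over a document which arrives in pieces
-- (e.g. from a network socket).  The helper name @tokenizeChunks@ is
-- hypothetical, and the sketch assumes, per 'tokenizeStep', that the state
-- returned by 'tokenize' carries everything required to resume parsing
-- across chunk boundaries.
tokenizeChunks :: TokenizerState -> [BS.ByteString] -> [([ParseError], Token)]
-- Once the input is exhausted, signal end-of-stream to flush any tokens the
-- parser is still buffering.
tokenizeChunks state [] = finalizeTokenizer state
-- Otherwise, emit the tokens this chunk completes and thread the returned
-- state into the next chunk.
tokenizeChunks state (chunk : chunks) =
    let (ts, state') = tokenize state chunk
    in ts ++ tokenizeChunks state' chunks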
-- | Given a string as emitted by the decoder and the final state of that
-- parser, repack it into a single list with that final state encapsulated in
-- a 'Just'; the 'init' of the string is given 'Nothing's.
repackStream :: ([([ParseError], Char)], DecoderState, BS.ByteString) -> [TokenizerInput]
repackStream ([], _, _) = []
repackStream ([(errs, c)], dState, bs) =
    [TokenizerInput errs c $ Just (Just dState, bs)]
repackStream ((errs, c):cs, dState, bs) =
    TokenizerInput errs c Nothing : repackStream (cs, dState, bs)

-- | Wrap the standard dispatcher to operate over a raw 'BS.ByteString' rather
-- than the "Web.Mangrove.Parse.Encoding" output.
dispatcher' :: StateParser TokenizerState BS.ByteString [([ParseError], Token)]
dispatcher' = do
    state <- N.S.get
    stream <- abridge
    let dState = Y.fromMaybe (initialDecoderState Utf8) $ decoderDefaultState state stream
    recurse state $ preprocessStep' dState stream
  where
    preprocessStep' dState input = case preprocessStep dState input of
        ([], _, _) -> []
        cs'@(_, dState', input') -> repackStream cs' ++ preprocessStep' dState' input'

-- | Loop the tokenization dispatcher until it returns a set of tokens which
-- happens to coincide with a decoder breakpoint.  Relies on lazy evaluation
-- in the stream generation to avoid forcing the entire thing at once, while
-- still retaining the capability to consume as much input as necessary to
-- get the parsers to line up.
recurse
    :: TokenizerState
    -> [TokenizerInput]
    -> StateParser TokenizerState BS.ByteString [([ParseError], Token)]
recurse state stream = case runParserT (N.S.runStateT dispatcher $ tokenParserState state) stream of
    Nothing -> A.empty
    Just ((out, tokState'), stream') -> case Y.listToMaybe (reverse out) >>= tokenizedState of
        Nothing -> do
            let state' = state
                    { tokenParserState = tokState'
                    }
            N.S.put state'
            out' <- recurse state' stream'
            return $ map repackOut out ++ out'
        Just (dState, dStream) -> do
            pushChunk dStream
            N.S.put $ state
                { decoderState_ = Right dState
                , tokenParserState = tokState'
                }
            return $ map repackOut out
  where
    repackOut t' = (tokenizedErrs t', tokenizedOut t')

-- | Specify which section of the finite state machine describing the
-- tokenization algorithm should be active.
tokenizerMode :: CurrentTokenizerState -> TokenizerState -> TokenizerState
tokenizerMode mode state = state
    { tokenParserState = (tokenParserState state)
        { currentState = mode
        }
    }

-- | Specify the data to use as the previous tag which had been emitted by
-- the tokenizer.  This only has to be called when required for external
-- algorithms or constructions; the parser automatically updates as required
-- for generated 'StartTag' tokens.
tokenizerStartTag :: Maybe Namespace -> ElementName -> TokenizerState -> TokenizerState
tokenizerStartTag ns name state = state
    { tokenParserState = (tokenParserState state)
        { prevStartTag = Just name
        , currentNodeNamespace = ns
        }
    }

-- | Specify the encoding scheme used by a given parse environment to read
-- from the binary input stream.  Note that this will always use the initial
-- state for the respective decoder; intermediate states as returned by
-- 'decodeStep' are not supported.
tokenizerEncoding :: Either SnifferEnvironment (Maybe Encoding) -> TokenizerState -> TokenizerState
tokenizerEncoding enc' state = state
    { decoderState_ = case enc' of
        Right Nothing -> Right Nothing
        Right (Just enc) -> Left $ Right enc
        Left env -> Left $ Left env
    }
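-- | A configuration sketch, not part of the original module: build a
-- starting state which skips encoding detection and reads the binary stream
-- as UTF-8.  The helper name @utf8TokenizerState@ is hypothetical, and
-- 'Utf8' is only an assumption for the example; any other 'Encoding', or a
-- 'SnifferEnvironment' on the 'Left', could be supplied instead.
utf8TokenizerState :: TokenizerState
utf8TokenizerState =
    -- 'Right . Just' pins a known encoding; 'Right Nothing' would instead
    -- clear any explicit choice, letting @dispatcher'@ fall back to its
    -- UTF-8 default.
    tokenizerEncoding (Right $ Just Utf8) defaultTokenizerState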