{-|
Description:    Functions and objects used to build the tokenizer.

Copyright:      (c) 2020-2021 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      provisional
Portability:    portable

The tokenizer stage is in a uniquely challenging position where the standard's
instructions are given as 'switch' statements (unlike
'Web.Mangrove.Parse.Encoding.Preprocess'), it operates over the output
---including errors and re-entrant state--- of a previous stage (unlike
'Web.Mangrove.Parse.Encoding'), and the conceptual input is of a type, 'Char',
with many constructors and a lot of standard library support (unlike
'Web.Mangrove.Parse.Tree').  That intersection means that it has proven best to
abstract the parser constructors themselves, rather than to abstract the
predicate tests and/or the combinator functions as in the other stages.
-}
module Web.Mangrove.Parse.Tokenize.Common
    ( -- * Types
      -- ** Parser
      Tokenizer
    , TokenizerState ( .. )
    , TokenParserState ( .. )
    , decoderState
    , decoderDefaultState
    , CurrentTokenizerState ( .. )
    , defaultTokenizerState
      -- *** Output
      -- **** Tokenizer
    , TokenizerOutput ( .. )
    , mapErrs
    , continueState
    , endState
    , finalStateList
    , Wrapped
    , WrappedOutput
    , WrappedOutputs
      -- **** Decoder
    , TokenizerInput ( .. )
    , DecoderOutputState
    , decodedRemainder
    , setRemainder
      -- ** Data
    , Token ( .. )
    , DoctypeParams ( .. )
    , emptyDoctypeParams
    , TagParams ( .. )
    , emptyTagParams
    , BasicAttribute
      -- * Parser building
      -- ** Input
      -- *** Single token
    , tokenizer
    , if_
    , ifChar
    , else_
    , elseChar
      -- *** Token list
    , tokenizers
    , ifs_
    , ifsChar
    , elses_
    , elsesChar
      -- *** Reconsume input
    , ifPush_
    , ifPushChar
    , elsePush_
    , elsePushChar
      -- ** Output
    , packToken
    , packState
    , emit
    , emit'
    , consEmit
    , consTokenError
    , consTokenErrors
    , consTokenErrorsList
    , consOut
    , consOuts
      -- ** Combinators
    , appropriateEndTag
    , changeState
    , chunk'
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad.Trans.State as N.S

import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Either as E
import qualified Data.HashMap.Strict as M
import qualified Data.Maybe as Y
import qualified Data.Text as T

import Web.Willow.DOM

import Web.Mangrove.Parse.Common.Error
import Web.Willow.Common.Encoding hiding ( setRemainder )
import Web.Willow.Common.Encoding.Sniffer
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Switch

import qualified Web.Willow.Common.Encoding as Willow

import Control.Applicative ( (<|>) )


-- | The smallest segment of data which carries semantic meaning.  Each
-- constructor corresponds to one of the token types emitted by the HTML
-- tokenization algorithm.
data Token
    = Doctype DoctypeParams
        -- ^ __HTML:__
        --      @DOCTYPE token@
        --
        -- 'Web.Mangrove.DOM.DocumentType', describing the language used in the
        -- document.
    | StartTag TagParams
        -- ^ __HTML:__
        --      @start tag token@
        --
        -- 'Web.Mangrove.DOM.Element', marking the start of a markup section,
        -- or a point of markup which (per the specification) doesn't contain
        -- any content.
    | EndTag TagParams
        -- ^ __HTML:__
        --      @end tag token@
        --
        -- 'Web.Mangrove.DOM.Element' with a @'/'@ character before the name,
        -- marking the end of a section opened by 'StartTag'.
    | Comment T.Text
        -- ^ __HTML:__
        --      @comment token@
        --
        -- 'Web.Mangrove.DOM.Comment', marking author's notes or other text
        -- about the document itself, rather than being part of the content.
    | Character Char
        -- ^ __HTML:__
        --      @character token@
        --
        -- 'Web.Mangrove.DOM.Character', usually containing (a small portion
        -- of) text which should be rendered for the user or included in the
        -- header metadata, but occasionally subject to further processing
        -- (e.g. the content of @\<script\>@ or @\<style\>@ sections).
    | EndOfStream
        -- ^ __HTML:__
        --      @end-of-file token@
        --
        -- Serves two roles: it is an explicit mark of the end of the stream
        -- for when a simple @[]@ doesn't suffice, and it provides a seat to
        -- carry 'ParseError's if no other token is emitted at the same time.
        --
        -- Note: the former role doesn't have any guarantees; a stream can end
        -- without an 'EndOfStream' termination, and 'EndOfStream' tokens can
        -- occur in places other than the end of the file.
  deriving ( Eq, Show, Read )


-- | __HTML:__
--      the data associated with a @doctype token@
--
-- All data comprising a document type declaration which may be obtained
-- directly from the raw document stream.  Values may be easily instantiated as
-- updates to 'emptyDoctypeParams'.
data DoctypeParams = DoctypeParams
    { doctypeName :: Maybe T.Text
        -- ^ The root element of the document, which may also identify the
        -- primary language used.
    , doctypePublicId :: Maybe T.Text
        -- ^ A globally-unique reference to the definition of the language.
    , doctypeSystemId :: Maybe T.Text
        -- ^ A system-dependent (but perhaps easier to access) reference to the
        -- definition of the language.
    , doctypeQuirks :: Bool
        -- ^ Whether the document should be read and rendered in a
        -- backwards-compatible manner, even if the other data in the token
        -- would match that expected by the specification.  Note that a
        -- 'False' value is still subject to those expectations; this just
        -- provides an override in the case of, for example, a malformed
        -- declaration.
    }
  deriving ( Eq, Show, Read )

-- | A blank 'DoctypeParams' intended as the base of a record update: no name,
-- no public or system identifier, and quirks mode not forced.
emptyDoctypeParams :: DoctypeParams
emptyDoctypeParams = DoctypeParams
    Nothing     -- doctypeName
    Nothing     -- doctypePublicId
    Nothing     -- doctypeSystemId
    False       -- doctypeQuirks


-- | __HTML:__
--      the data associated with a @start tag@ or an @end tag token@
--
-- All data comprising a markup tag which may be obtained directly from the raw
-- document stream.  Values may be easily instantiated as updates to
-- 'emptyTagParams'.
data TagParams = TagParams
    { tagName :: ElementName
        -- ^ The primary identifier of the markup tag, defining its behaviour
        -- during rendering, and providing a means of matching opening tags
        -- with closing ones.
    , tagIsSelfClosing :: Bool
        -- ^ Whether the tag was closed at the same point it was opened,
        -- according to the XML-style "@/>@" syntax.  HTML null elements are
        -- handled in the tree construction stage instead.
    , tagAttributes :: M.HashMap T.Text T.Text
        -- ^ Finer-grained metadata attached to the markup tag, stored as a
        -- map from one 'T.Text' to another (presumably attribute name to
        -- attribute value).
    }
  deriving ( Eq, Show, Read )

-- | A blank 'TagParams' intended as the base of a record update: an empty
-- name, not self-closing, and without any attributes.
emptyTagParams :: TagParams
emptyTagParams = TagParams
    T.empty     -- tagName
    False       -- tagIsSelfClosing
    M.empty     -- tagAttributes


-- | Parser combinators written over the output of the
-- 'Web.Mangrove.Parse.Encoding.decoder' stage, segmenting the raw 'Char'
-- strings into semantic atoms.  The parser consumes a list of
-- 'TokenizerInput' values while threading a 'TokenParserState'.
type Tokenizer = StateParser TokenParserState [TokenizerInput]


-- | The collection of data required to extract a list of semantic atoms from a
-- binary document stream.  Values may be easily instantiated as updates to
-- 'defaultTokenizerState'.
data TokenizerState = TokenizerState
    { tokenParserState :: TokenParserState
        -- ^ The state of the current 'Web.Mangrove.Parse.Tokenize.tokenize'
        -- stage.
    , decoderState_ :: Either (Either SnifferEnvironment Encoding) (Maybe DecoderState)
        -- ^ The state of the previous 'Web.Mangrove.Parse.Encoding.decoder'
        -- stage ('Right'), or the data used to initialize it ('Left'): either
        -- an environment for sniffing the encoding or an explicit 'Encoding'.
        -- For easy access to the 'DecoderState' itself, see 'decoderState'
        -- and 'decoderDefaultState'.
    }
  deriving ( Eq, Show, Read )

-- | All the data which needs to be tracked for correct behaviour in the
-- tokenization stage.
data TokenParserState = TokenParserState
    { prevStartTag :: Maybe ElementName
        -- ^ __HTML:__
        --      @[appropriate end tag token]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token)@
        --
        -- Certain states in the parser, which only emit 'Character' tokens,
        -- are able to collapse multiple tags which result in that behaviour by
        -- comparing a potential closing markup tag to the name on the
        -- 'StartTag' token which triggered the state.
    , currentState :: CurrentTokenizerState
        -- ^ The set of rules currently active in the state machine.
    , currentNodeNamespace :: Maybe Namespace
        -- ^ Certain states in the parser change behaviour if the @adjusted
        -- current node@ is not an HTML element.  Given the direction of
        -- visibility in the parser stack, this stage can't directly access
        -- that (tree construction level) datum, and so that needs to be
        -- tracked redundantly.
    , atEndOfStream :: Bool
        -- ^ Whether the current input stream is known to be the final part of
        -- the document stream ('True') or whether additional input may still
        -- follow ('False') and thus any finalization should not be performed.
    }
  deriving ( Eq, Show, Read )

-- | The various fixed points in the tokenization algorithm, where the parser
-- may break and re-enter seamlessly.
data CurrentTokenizerState
    = DataState
        -- ^ __HTML:__
        --      @[data state]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#data-state)@
        --
        -- The core rules, providing the most common tokenization behaviour.
    | RCDataState
        -- ^ __HTML:__
        --      @[RCDATA state]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state)@
        --
        -- 'Character'-focused production while, unlike 'RawTextState',
        -- resolving character reference values.
    | RawTextState
        -- ^ __HTML:__
        --      @[RAWTEXT state]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state)@
        --
        -- 'Character'-focused production which, unlike 'RCDataState', passes
        -- character reference sequences unchanged.
    | PlainTextState
        -- ^ __HTML:__
        --      @[PLAINTEXT state]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state)@
        --
        -- Blind conversion of the entire document stream into 'Character'
        -- tokens.
    | ScriptDataState
        -- ^ __HTML:__
        --      @[script data state]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-state)@
        --
        -- 'Character'-focused production according to the (occasionally
        -- complex) rules governing the handling of @\<script\>@ contents.
    | ScriptDataEscapedState
        -- ^ __HTML:__
        --      @[script data escaped state]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state)@
        --
        -- 'Character'-focused production for data within a @\<!--@ / @--\>@
        -- section within 'ScriptDataState'.
    | ScriptDataDoubleEscapedState
        -- ^ __HTML:__
        --      @[script data double escaped state]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state)@
        --
        -- 'Character'-focused production for data within a @\<script\>@
        -- section within 'ScriptDataEscapedState'.
    | CDataState
        -- ^ __HTML:__
        --      @[CDATA section state]
        --      (https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state)@
        --
        -- 'Character'-focused production for data within a foreign
        -- @\<![CDATA[@ / @]]\>@ escape section.
  deriving ( Eq, Ord, Bounded, Enum, Show, Read )

-- | A baseline 'TokenizerState' intended as the base of a record update: the
-- decoder is yet to be initialized and will read the binary stream as 'Utf8',
-- while the tokenizer starts in 'DataState' with no previous start tag, no
-- tracked namespace, and without assuming the stream is complete.
defaultTokenizerState :: TokenizerState
defaultTokenizerState = TokenizerState initialParserState initialDecoder
  where -- 'Left' marks the decoder as not yet started; the inner 'Right'
        -- selects an explicit encoding rather than a sniffing environment.
        initialDecoder = Left (Right Utf8)
        initialParserState = TokenParserState
            { atEndOfStream = False
            , currentNodeNamespace = Nothing
            , currentState = DataState
            , prevStartTag = Nothing
            }


-- | Retrieve the state of the previous 'Web.Mangrove.Parse.Encoding.decoder'
-- stage, if it has already been initialized; a 'TokenizerState' still holding
-- only the initialization data results in 'Nothing'.
--
-- Note that the high-level conceptual view of the parser stack is of each
-- stage moving along the 'BS.ByteString' as a (more or less) unified front,
-- rather than each stage independently running over the output of the
-- previous.
decoderState :: TokenizerState -> Maybe DecoderState
decoderState state = case decoderState_ state of
    Right dState -> dState
    Left _ -> Nothing

-- | As 'decoderState', but if the decoder has not yet been initialized, build
-- a fresh state from the stored initialization data: either by sniffing the
-- given binary stream, or directly from an explicit 'Encoding'.
decoderDefaultState :: TokenizerState -> BS.ByteString -> Maybe DecoderState
decoderDefaultState state stream =
    either (Just . initialize) id $ decoderState_ state
  where initialize = either (`sniffDecoderState` stream) initialDecoderState


-- | __HTML:__
--      @[appropriate end tag token]
--      (https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token)@
--
-- Test whether the name recorded in 'prevStartTag' (set by 'emit' whenever a
-- 'StartTag' token passes through it) equals the given 'String'.  The result
-- is 'False' if no start tag has been recorded.
appropriateEndTag :: String -> Tokenizer Bool
appropriateEndTag testName = matches <$> N.S.get
  where matches state = prevStartTag state == Just (T.pack testName)


-- | Switch the state machine over to the given set of instructions, so that
-- the /next/ input 'Char' is processed according to them.  All other parts of
-- the parser state are left untouched.
changeState :: CurrentTokenizerState -> Tokenizer ()
changeState newState = N.S.modify $ \state -> state
    { currentState = newState
    }


-- | All data with which to initialize the decoder, to resume as if the state
-- machine transition hadn't been interrupted: the decoder state (if any)
-- paired with the still-unparsed binary stream.  If 'Nothing', the associated
-- 'Char' was emitted in the 'init' of several at once; in this case, the
-- decoder can't be re-entered in exactly the same place with the data wrapped
-- in this type, and so the stream must continue to be processed until the
-- first 'Just' value.
type DecoderOutputState = Maybe (Maybe DecoderState, BS.ByteString)


-- | Consume one input and dispatch it through the given 'switch' cases,
-- optionally falling back to a fixed value when that fails and the parser
-- state is flagged as 'atEndOfStream'.  See 'tokenizers' if multiple objects
-- may be returned at once.
tokenizer
    :: Maybe ([ParseError], out)
        -- ^ The errors and value to return when the stream is empty; the
        -- result is then tagged with 'endState'.  With 'Nothing', no
        -- recovery is attempted.
    -> [SwitchCase TokenizerInput Tokenizer (WrappedOutput out)]
        -- ^ If a case's return value is paired with 'True', the state carried
        -- by the consumed input is allowed as a fallback; if 'False' (i.e.,
        -- something has been 'push'ed back for reconsumption) the returned
        -- state is used as-is, even if that doesn't result in a valid fixed
        -- breakpoint.
    -> Tokenizer (TokenizerOutput out)
tokenizer recovery cases = case recovery of
    Nothing -> consume
    Just (errs, out) -> consume <|> do
        state <- N.S.get
        -- Only recover once no further input can arrive; otherwise the
        -- failure has to propagate so the caller may wait for more data.
        if atEndOfStream state
            then end *> return (TokenizerOutput errs out endState)
            else A.empty
  where consume = next >>= switch' cases

-- | As 'switch', but unwrapping the internal marker which indicates whether
-- the state carried by the consumed input may be used to fill in a missing
-- 'tokenizedState' ('True'), or whether the output must be passed through
-- unchanged ('False').  See @switches'@ if multiple objects may be returned at
-- once.
switch'
    :: [SwitchCase TokenizerInput Tokenizer (WrappedOutput out)]
    -> TokenizerInput
    -> Tokenizer (TokenizerOutput out)
switch' cases t' = fmap resolve $ switch cases t'
  where resolve (useInputState, out)
            -- An explicit state on the output takes precedence; the input's
            -- state is only the fallback.
            | useInputState = mapState (<|> decodedState t') out
            | otherwise = out


-- | Consume one input and dispatch it through the given 'switch' cases,
-- optionally falling back to a fixed list of values when that fails and the
-- parser state is flagged as 'atEndOfStream'.  See 'tokenizer' if only a
-- single object may be returned.
tokenizers
    :: Maybe [([ParseError], out)]
        -- ^ The errors and values to return when the stream is empty; only
        -- the last is tagged with 'endState', via 'finalStateList'.  With
        -- 'Nothing', no recovery is attempted.
    -> [SwitchCase TokenizerInput Tokenizer (WrappedOutputs out)]
        -- ^ If a case's return value is paired with 'True', the state carried
        -- by the consumed input is allowed as a fallback; if 'False' (i.e.,
        -- something has been 'push'ed back for reconsumption) the returned
        -- state is used as-is, even if that doesn't result in a valid fixed
        -- breakpoint.
    -> Tokenizer [TokenizerOutput out]
tokenizers recovery cases = case recovery of
    Nothing -> consume
    Just fallbacks -> consume <|> do
        state <- N.S.get
        -- Only recover once no further input can arrive; otherwise the
        -- failure has to propagate so the caller may wait for more data.
        if atEndOfStream state
            then end *> return (pack fallbacks)
            else A.empty
  where consume = next >>= switches' cases
        pack = finalStateList endState
            . map (\(errs, out) -> TokenizerOutput errs out Nothing)

-- | As 'switch', but unwrapping the internal marker which indicates whether
-- the state carried by the consumed input may be used to fill in a missing
-- 'tokenizedState' on the /last/ output ('True'), or whether the outputs must
-- be passed through unchanged ('False').  See @switch'@ if only a single
-- object may be returned.
switches'
    :: [SwitchCase TokenizerInput Tokenizer (WrappedOutputs out)]
    -> TokenizerInput
    -> Tokenizer [TokenizerOutput out]
switches' cases t' = fmap repack $ switch cases t'
  where repack (useInputState, outs) = finish outs
          where adjust
                    | useInputState = mapState (<|> decodedState t')
                    | otherwise = id
                -- Walk to the end of the list; only the final element is
                -- eligible to inherit the input's state.
                finish [] = []
                finish [final] = [adjust final]
                finish (out:rest) = out : finish rest


-- | Transform the list of errors attached to a wrapped output value, leaving
-- the value itself and its stream state untouched.
mapErrs :: ([ParseError] -> [ParseError]) -> TokenizerOutput out -> TokenizerOutput out
mapErrs f (TokenizerOutput errs out state) = TokenizerOutput (f errs) out state

-- | Transform the stream state attached to a wrapped output value, leaving
-- the value itself and its errors untouched.
mapState
    :: (DecoderOutputState -> DecoderOutputState)
    -> TokenizerOutput out
    -> TokenizerOutput out
mapState f (TokenizerOutput errs out state) = TokenizerOutput errs out (f state)

-- | Clear the stream state of the given wrapped value, so that it is never
-- used as a breakpoint for future stages of the parser.
continueState :: TokenizerOutput out -> TokenizerOutput out
continueState out = out { tokenizedState = Nothing }

-- | Rewrite the stream states across a list of wrapped values: every element
-- of the 'init' is stripped of its state (so it may not be used as a
-- breakpoint for future stages of the parser), and the 'last' is assigned the
-- given 'tokenizedState'.  An empty list is returned unchanged.
finalStateList :: DecoderOutputState -> [TokenizerOutput out] -> [TokenizerOutput out]
finalStateList state = go
  where go [] = []
        go [final] = [final { tokenizedState = state }]
        go (out:rest) = continueState out : go rest


-- | Wrap an output value together with a 'Nothing' state, marking it as
-- unusable for tokenizer re-entry.  If the value is a 'Token', use 'emit'
-- instead.
packToken :: ([ParseError], out) -> Tokenizer (TokenizerOutput out)
packToken result = packState result Nothing

-- | Wrap an output value together with an explicit stream state: with 'Just',
-- parsing will continue from the given binary document remainder; with
-- 'Nothing', the value can not serve as a breakpoint.
packState :: ([ParseError], out) -> DecoderOutputState -> Tokenizer (TokenizerOutput out)
packState (errs, out) dState = pure (TokenizerOutput errs out dState)


-- | Wrap a semantic atom and associated errors into the form the tokenizing
-- parsers expect to output, with some token-specific bookkeeping:
--
-- * a 'StartTag' has its name recorded as 'prevStartTag' in the parser state;
-- * an 'EndTag' gains an 'EndTagWithAttributes' error if it carries any
--   attributes and an 'EndTagWithTrailingSolidus' error if it is marked
--   self-closing, both placed before the errors already given.
--
-- See 'packToken' if the 'snd' item in the tuple is not a 'Token', or 'emit''
-- to easily wrap the output as a singleton list.
emit :: ([ParseError], Token) -> Tokenizer (TokenizerOutput Token)
emit t' = case snd t' of
    StartTag d -> do
        N.S.modify $ \state -> state { prevStartTag = Just $ tagName d }
        packToken t'
    EndTag d -> consTokenErrors (endTagErrors d) <$> packToken t'
    _ -> packToken t'
  where endTagErrors d =
            [EndTagWithAttributes | not . null $ tagAttributes d]
            ++ [EndTagWithTrailingSolidus | tagIsSelfClosing d]

-- | As 'emit', but returning the wrapped token as a singleton list.  See
-- 'packToken' if the 'snd' item in the tuple is not a 'Token', or 'emit' if
-- the outer list is unnecessary.
emit' :: ([ParseError], Token) -> Tokenizer [TokenizerOutput Token]
emit' tok = (\t -> [t]) <$> emit tok

-- | 'emit' the given token first, then run the given parser and place the
-- emitted token at the head of the list it generates.
consEmit
    :: ([ParseError], Token)
    -> Tokenizer [TokenizerOutput Token]
    -> Tokenizer [TokenizerOutput Token]
consEmit tok p = emit tok >>= \t -> fmap (t :) p


-- | The collection of data returned by the
-- 'Web.Mangrove.Parse.Encoding.decoder' stage for a single decoded character,
-- and so comprising one element of the input to the tokenizer.
data TokenizerInput = TokenizerInput
    { decodedErrs :: [ParseError]
        -- ^ Any authoring errors detected during decoding.
    , decodedOut :: Char
        -- ^ The decoded character itself.
    , decodedState :: DecoderOutputState
        -- ^ The data required to resume decoding immediately following the
        -- value, if possible.  See also 'decodedRemainder'.
    }
  deriving ( Eq, Show, Read )

-- | The portion of the binary stream still unparsed /after/ decoding the
-- associated 'Char', or 'Nothing' if the input carries no resumable state.
-- See also 'decodedState'.
decodedRemainder :: TokenizerInput -> Maybe BS.ByteString
decodedRemainder input = snd <$> decodedState input


-- | Store the given binary sequence as unparsable without further input, to be
-- prepended to the beginning of stream on the next call to
-- 'Web.Mangrove.Parse.Tokenize.tokenize'.  If the decoder has not yet been
-- initialized, it is first initialized against an empty stream (see
-- 'decoderDefaultState').
setRemainder :: BS.SH.ShortByteString -> TokenizerState -> TokenizerState
setRemainder bs state = state { decoderState_ = Right updated }
  where updated = fmap (Willow.setRemainder bs) (decoderDefaultState state BS.empty)


-- | The standard output of parsers used in the tokenization stage.
-- Specifically, it contains the final state of the decoder stage in addition
-- to the generated value, to enable the recursion loop to detect the end of
-- multi-'Char' outputs and properly update the resume state.  May be easily
-- instantiated through calls to 'packToken' and 'packState'.
data TokenizerOutput out = TokenizerOutput
    { tokenizedErrs :: [ParseError]
        -- ^ Any authoring errors detected during decoding or tokenization.
    , tokenizedOut :: out
        -- ^ The point of data specifically generated by the parser.
    , tokenizedState :: DecoderOutputState
        -- ^ The data required to resume tokenization immediately following the
        -- value, if possible; 'Nothing' marks a value which may not be used
        -- as a breakpoint.
    }
  deriving ( Eq, Show, Read )

-- | Map over the wrapped output value, preserving the attached errors and
-- stream state.
instance Functor TokenizerOutput where
    fmap f (TokenizerOutput errs out state) = TokenizerOutput errs (f out) state


-- | Type-level abstraction for values output by any tokenize-stage parser,
-- along with the flag required to determine the proper remainder of the input
-- stream to return: 'True' if the state of the consumed input may be used as
-- a fallback, 'False' if the input was pushed back for reconsumption and the
-- output's own state must be used as-is.
type Wrapped out = (Bool, out)

-- | Type-level abstraction for single values output by a tokenizer, along with
-- the flag required to determine the proper remainder of the input stream to
-- return.  This is the result type of the cases passed to 'tokenizer'.
type WrappedOutput out = Wrapped (TokenizerOutput out)

-- | Type-level abstraction for multiple values output by a tokenizer, along
-- with the flag required to determine the proper remainder of the input stream
-- to return.  This is the result type of the cases passed to 'tokenizers'.
type WrappedOutputs out = Wrapped [TokenizerOutput out]


-- | Build a conditional 'SwitchCase' returning a single value, testing only
-- the decoded 'Char' of the input and running a parser which ignores it.  Any
-- decoder errors attached to the input are prepended to those of the result,
-- and the result is flagged as allowing the input's state as a fallback.
if_
    :: (Char -> Bool)
    -> Tokenizer (TokenizerOutput out)
    -> SwitchCase TokenizerInput Tokenizer (WrappedOutput out)
if_ f t = If (f . decodedOut) $ \c' ->
    fmap (\out -> (True, consTokenErrors (decodedErrs c') out)) t

-- | Build a conditional 'SwitchCase' returning a list of semantic atoms,
-- testing only the decoded 'Char' of the input and running a parser which
-- ignores it.  Any decoder errors attached to the input are added to the
-- first token of the result (see 'consTokenErrorsList'), and the result is
-- flagged as allowing the input's state as a fallback.
ifs_
    :: (Char -> Bool)
    -> Tokenizer [TokenizerOutput Token]
    -> SwitchCase TokenizerInput Tokenizer (WrappedOutputs Token)
ifs_ f t = If (f . decodedOut) $ \c' ->
    fmap (\outs -> (True, consTokenErrorsList (decodedErrs c') outs)) t


-- | Build a conditional 'SwitchCase' returning a single value, testing only
-- the decoded 'Char' of the input and passing that same 'Char' to the parser
-- generator.  Any decoder errors attached to the input are prepended to those
-- of the result, and the result is flagged as allowing the input's state as a
-- fallback.
ifChar
    :: (Char -> Bool)
    -> (Char -> Tokenizer (TokenizerOutput out))
    -> SwitchCase TokenizerInput Tokenizer (WrappedOutput out)
ifChar f t = If (f . decodedOut) $ \c' ->
    fmap (\out -> (True, consTokenErrors (decodedErrs c') out)) (t $ decodedOut c')

-- | Build a conditional 'SwitchCase' returning a list of semantic atoms,
-- testing only the decoded 'Char' of the input and passing that same 'Char'
-- to the parser generator.  Any decoder errors attached to the input are added
-- to the first token of the result (see 'consTokenErrorsList'), and the result
-- is flagged as allowing the input's state as a fallback.
ifsChar
    :: (Char -> Bool)
    -> (Char -> Tokenizer [TokenizerOutput Token])
    -> SwitchCase TokenizerInput Tokenizer (WrappedOutputs Token)
ifsChar f t = If (f . decodedOut) $ \c' ->
    fmap (\outs -> (True, consTokenErrorsList (decodedErrs c') outs)) (t $ decodedOut c')


-- | Build a conditional 'SwitchCase' testing only the decoded 'Char' of the
-- input, which 'push'es the complete input (errors and state included) back
-- onto the stream for reconsumption before running a parser which ignores it.
-- The result is flagged such that the input's state is /not/ used as a
-- fallback.
ifPush_
    :: (Char -> Bool)
    -> Tokenizer out
    -> SwitchCase TokenizerInput Tokenizer (Wrapped out)
ifPush_ f t = If (f . decodedOut) $ \c' -> do
    push c'
    (,) False <$> t

-- | Build a conditional 'SwitchCase' testing only the decoded 'Char' of the
-- input, which 'push'es the complete input (errors and state included) back
-- onto the stream for reconsumption before running the parser generated from
-- that same 'Char'.  The result is flagged such that the input's state is
-- /not/ used as a fallback.
ifPushChar
    :: (Char -> Bool)
    -> (Char -> Tokenizer out)
    -> SwitchCase TokenizerInput Tokenizer (Wrapped out)
ifPushChar f t = If (f . decodedOut) $ \c' -> do
    push c'
    (,) False <$> t (decodedOut c')


-- | Build a fallback 'SwitchCase' returning a single value from a parser which
-- ignores the input.  Any decoder errors attached to the input are prepended
-- to those of the result, and the result is flagged as allowing the input's
-- state as a fallback.
else_
    :: Tokenizer (TokenizerOutput out)
    -> SwitchCase TokenizerInput Tokenizer (WrappedOutput out)
else_ t = Else $ \c' ->
    fmap (\out -> (True, consTokenErrors (decodedErrs c') out)) t

-- | Build a fallback 'SwitchCase' returning a list of semantic atoms from a
-- parser which ignores the input.  Any decoder errors attached to the input
-- are added to the first token of the result (see 'consTokenErrorsList'), and
-- the result is flagged as allowing the input's state as a fallback.
elses_
    :: Tokenizer [TokenizerOutput Token]
    -> SwitchCase TokenizerInput Tokenizer (WrappedOutputs Token)
elses_ t = Else $ \c' ->
    fmap (\outs -> (True, consTokenErrorsList (decodedErrs c') outs)) t


-- | Build a fallback 'SwitchCase' returning a single value, passing the
-- decoded 'Char' of the input to the parser generator.  Any decoder errors
-- attached to the input are prepended to those of the result, and the result
-- is flagged as allowing the input's state as a fallback.
elseChar
    :: (Char -> Tokenizer (TokenizerOutput out))
    -> SwitchCase TokenizerInput Tokenizer (WrappedOutput out)
elseChar t = Else $ \c' ->
    fmap (\out -> (True, consTokenErrors (decodedErrs c') out)) (t $ decodedOut c')

-- | Build a fallback 'SwitchCase' returning a list of semantic atoms, passing
-- the decoded 'Char' of the input to the parser generator.  Any decoder errors
-- attached to the input are added to the first token of the result (see
-- 'consTokenErrorsList'), and the result is flagged as allowing the input's
-- state as a fallback.
elsesChar
    :: (Char -> Tokenizer [TokenizerOutput Token])
    -> SwitchCase TokenizerInput Tokenizer (WrappedOutputs Token)
elsesChar t = Else $ \c' ->
    fmap (\outs -> (True, consTokenErrorsList (decodedErrs c') outs)) (t $ decodedOut c')


-- | Build a fallback 'SwitchCase' which 'push'es the complete input (errors
-- and state included) back onto the stream for reconsumption before running a
-- parser which ignores it.  The result is flagged such that the input's state
-- is /not/ used as a fallback.
elsePush_
    :: Tokenizer out
    -> SwitchCase TokenizerInput Tokenizer (Wrapped out)
elsePush_ t = Else $ \c' -> do
    push c'
    (,) False <$> t

-- | Build a fallback 'SwitchCase' which 'push'es the complete input (errors
-- and state included) back onto the stream for reconsumption before running
-- the parser generated from its decoded 'Char'.  The result is flagged such
-- that the input's state is /not/ used as a fallback.
elsePushChar
    :: (Char -> Tokenizer out)
    -> SwitchCase TokenizerInput Tokenizer (Wrapped out)
elsePushChar t = Else $ \c' -> do
    push c'
    (,) False <$> t (decodedOut c')


-- | Prepend a single 'ParseError' to the warnings already associated with the
-- wrapped value.
consTokenError :: ParseError -> TokenizerOutput out -> TokenizerOutput out
consTokenError err out = out { tokenizedErrs = err : tokenizedErrs out }

-- | Prepend several 'ParseError's, in order, to the warnings already
-- associated with the wrapped value.
consTokenErrors :: [ParseError] -> TokenizerOutput out -> TokenizerOutput out
consTokenErrors errs out = out { tokenizedErrs = errs ++ tokenizedErrs out }

-- | Prepend the given 'ParseError's to the warnings associated with the first
-- wrapped value in the list.  If the list is empty, a placeholder
-- 'EndOfStream' token (without a resumable state) is created to carry them,
-- unless there are no errors to carry either, in which case the list stays
-- empty.
consTokenErrorsList :: [ParseError] -> [TokenizerOutput Token] -> [TokenizerOutput Token]
consTokenErrorsList errs outs
    | null errs, null outs = []
    | otherwise = case outs of
        [] -> [placeholder]
        t:ts -> consTokenErrors errs t : ts
  where placeholder = TokenizerOutput errs EndOfStream Nothing


-- | Place the given value at the head of the wrapped collection, leaving the
-- associated errors and stream state untouched.
--
-- @
-- consOut x == fmap (x :)
-- @
consOut :: out -> TokenizerOutput [out] -> TokenizerOutput [out]
consOut x out = out { tokenizedOut = x : tokenizedOut out }

-- | Place the given values, in order, at the head of the wrapped collection,
-- leaving the associated errors and stream state untouched.
--
-- @
-- consOuts xs == fmap (xs ++)
-- @
consOuts :: [out] -> TokenizerOutput [out] -> TokenizerOutput [out]
consOuts xs out = out { tokenizedOut = xs ++ tokenizedOut out }


-- | Lift a call to 'chunk' over the wrapped 'Char's which make up the input to
-- the tokenization stage: read as many inputs as the test 'String' is long,
-- and succeed only if their decoded 'Char's equal that 'String'.  As an
-- additional quality-of-life, the given function is applied to every input
-- 'Char' before the comparison (the test 'String' itself is not transformed);
-- for behaviour according to the base 'chunk' function, use @'chunk'' 'id'@.
chunk' :: (Char -> Char) -> String -> Tokenizer [TokenizerInput]
chunk' f test = do
    candidate <- nextChunk . fromIntegral $ length test
    satisfying matches candidate
  where matches str = map (f . decodedOut) str == test


-- | A placeholder stream state for 'TokenizerOutput' values located at the end
-- of the byte stream: no decoder state, and nothing left to parse.
endState :: DecoderOutputState
endState = Just (noDecoderState, noRemainder)
  where noDecoderState = Nothing
        noRemainder = BS.empty