{-| Description: Functions and objects used to build the tokenizer. Copyright: (c) 2020-2021 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: provisional Portability: portable The tokenizer stage is in a uniquely challenging position where the standard's instructions are given as 'switch' statements (unlike 'Web.Mangrove.Parse.Encoding.Preprocess'), it operates over the output ---including errors and re-entrant state--- of a previous stage (unlike 'Web.Mangrove.Parse.Encoding'), and the conceptual input is of a type, 'Char', with many constructors and a lot of standard library support (unlike 'Web.Mangrove.Parse.Tree'). That intersection means that it has proven best to abstract the parser constructors themselves, rather than to abstract the predicate tests and/or the combinator functions as in the other stages. -} module Web.Mangrove.Parse.Tokenize.Common ( -- * Types -- ** Parser Tokenizer , TokenizerState ( .. ) , TokenParserState ( .. ) , decoderState , decoderDefaultState , CurrentTokenizerState ( .. ) , defaultTokenizerState -- *** Output -- **** Tokenizer , TokenizerOutput ( .. ) , mapErrs , continueState , endState , finalStateList , Wrapped , WrappedOutput , WrappedOutputs -- **** Decoder , TokenizerInput ( .. ) , DecoderOutputState , decodedRemainder , setRemainder -- ** Data , Token ( .. ) , DoctypeParams ( .. ) , emptyDoctypeParams , TagParams ( .. ) , emptyTagParams , BasicAttribute -- * Parser building -- ** Input -- *** Single token , tokenizer , if_ , ifChar , else_ , elseChar -- *** Token list , tokenizers , ifs_ , ifsChar , elses_ , elsesChar -- *** Reconsume input , ifPush_ , ifPushChar , elsePush_ , elsePushChar -- ** Output , packToken , packState , emit , emit' , consEmit , consTokenError , consTokenErrors , consTokenErrorsList , consOut , consOuts -- ** Combinators , appropriateEndTag , changeState , chunk' ) where import qualified Control.Applicative as A import qualified Control.Monad.Trans.State as N.S import qualified Data.ByteString as BS import qualified Data.ByteString.Short as BS.SH import qualified Data.Either as E import qualified Data.HashMap.Strict as M import qualified Data.Maybe as Y import qualified Data.Text as T import Web.Willow.DOM import Web.Mangrove.Parse.Common.Error import Web.Willow.Common.Encoding hiding ( setRemainder ) import Web.Willow.Common.Encoding.Sniffer import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Switch import qualified Web.Willow.Common.Encoding as Willow import Control.Applicative ( (<|>) ) -- | The smallest segment of data which carries semantic meaning. data Token = Doctype DoctypeParams -- ^ __HTML:__ -- @DOCTYPE token@ -- -- 'Web.Mangrove.DOM.DocumentType', describing the language used in the -- document. | StartTag TagParams -- ^ __HTML:__ -- @start tag token@ -- -- 'Web.Mangrove.DOM.Element', marking the start of a markup section, -- or a point of markup which (per the specification) doesn't contain -- any content. | EndTag TagParams -- ^ __HTML:__ -- @end tag token@ -- -- 'Web.Mangrove.DOM.Element' with a @'/'@ character before the name, -- marking the end of a section opened by 'StartTag'. | Comment T.Text -- ^ __HTML:__ -- @comment token@ -- -- 'Web.Mangrove.DOM.Comment', marking author's notes or other text -- about the document itself, rather than being part of the content. | Character Char -- ^ __HTML:__ -- @character token@ -- -- 'Web.Mangrove.DOM.Character', usually containing (a small portion -- of) text which should rendered for the user or included in the -- header metadata, but occasionally subject to further processing -- (i.e. the content of @\@ or @\@ sections). | EndOfStream -- ^ __HTML:__ -- @end-of-file token@ -- -- Represents both an explicit mark of the end of the stream, when a -- simple @[]@ doesn't suffice, and provides a seat to carry -- 'ParseError's if no other token is emitted at the same time. -- -- Note: the former role doesn't have any guarantees; a stream can end -- without an 'EndOfStream' termination, and 'EndOfStream' tokens can occur in -- places other than the end of the file. deriving ( Eq, Show, Read ) -- | __HTML:__ -- the data associated with a @doctype token@ -- -- All data comprising a document type declaration which may be obtained -- directly from the raw document stream. Values may be easily instantiated as -- updates to 'emptyDoctypeParams'. data DoctypeParams = DoctypeParams { doctypeName :: Maybe T.Text -- ^ The root element of the document, which may also identify the -- primary language used. , doctypePublicId :: Maybe T.Text -- ^ A globally-unique reference to the definition of the language. , doctypeSystemId :: Maybe T.Text -- ^ A system-dependant (but perhaps easier to access) reference to the -- definition of the language. , doctypeQuirks :: Bool -- ^ Whether the document should be read and rendered in a -- backwards-compatible manner, even if the other data in the token -- would match that expected by the specification. Note that 'False' -- value is still subject to those expectations; this just provides an -- override in the case of, for example, a malformed declaration. } deriving ( Eq, Show, Read ) -- | A sane default collection for easy record initialization; namely, -- 'Nothing's and 'False'. emptyDoctypeParams :: DoctypeParams emptyDoctypeParams = DoctypeParams { doctypeName = Nothing , doctypePublicId = Nothing , doctypeSystemId = Nothing , doctypeQuirks = False } -- | __HTML:__ -- the data associated with a @start tag@ or an @end tag token@ -- -- All data comprising a markup tag which may be obtained directly from the raw -- document stream. Values may be easily instantiated as updates to -- 'emptyTagParams'. data TagParams = TagParams { tagName :: ElementName -- ^ The primary identifier of the markup tag, defining its behaviour -- during rendering, and providing a means of matching opening tags -- with closing ones. , tagIsSelfClosing :: Bool -- ^ Whether the tag was closed at the same point it was opened, -- according to the XML-style "@/>@" syntax. HTML null elements are -- handled in the tree construction stage instead. , tagAttributes :: M.HashMap T.Text T.Text -- ^ Finer-grained metadata attached to the markup tag. } deriving ( Eq, Show, Read ) -- | A sane default collection for easy record initialization. emptyTagParams :: TagParams emptyTagParams = TagParams { tagName = T.empty , tagIsSelfClosing = False , tagAttributes = M.empty } -- | Parser combinators written over the output of the -- 'Web.Mangrove.Parse.Encoding.decoder' stage, segmenting the raw 'Char' -- strings into semantic atoms. type Tokenizer = StateParser TokenParserState [TokenizerInput] -- | The collection of data required to extract a list of semantic atoms from a -- binary document stream. Values may be easily instantiated as updates to -- 'defaultTokenizerState'. data TokenizerState = TokenizerState { tokenParserState :: TokenParserState -- ^ The state of the current 'Web.Mangrove.Parse.Tokenize.tokenize' -- stage. , decoderState_ :: Either (Either SnifferEnvironment Encoding) (Maybe DecoderState) -- ^ The state of the previous 'Web.Mangrove.Parse.Encoding.decoder' -- stage, or the data used to initialize it. For easy access to the -- 'DecoderState' itself, see 'decoderState'. } deriving ( Eq, Show, Read ) -- | All the data which needs to be tracked for correct behaviour in the -- tokenization stage. data TokenParserState = TokenParserState { prevStartTag :: Maybe ElementName -- ^ __HTML:__ -- @[appropriate end tag token] -- (https://html.spec.whatwg.org/multipage/parsing.html#appropriate-end-tag-token)@ -- -- Certain states in the parser, which only emit 'Character' tokens, -- are able to collapse multiple tags which result in that behaviour by -- comparing a potential closing markup tag to the name on the -- 'StartTag' token which triggered the state. , currentState :: CurrentTokenizerState -- ^ The set of rules currently active in the state machine. , currentNodeNamespace :: Maybe Namespace -- ^ Certain states in the parser change behaviour if the @adjusted -- current node@ is not an HTML element. Given the direction of -- visibility in the parser stack, this stage can't directly access -- that (tree construction level) datum, and so that needs to be -- tracked redundently. , atEndOfStream :: Bool -- ^ Whether the current input stream is known to be the final part of -- the document stream ('True') or whether additional input may still -- follow ('False') and thus any finalization should not be performed. } deriving ( Eq, Show, Read ) -- | The various fixed points in the tokenization algorithm, where the parser -- may break and re-enter seamlessly. data CurrentTokenizerState = DataState -- ^ __HTML:__ -- @[data state] -- (https://html.spec.whatwg.org/multipage/parsing.html#data-state)@ -- -- The core rules, providing the most common tokenization behaviour. | RCDataState -- ^ __HTML:__ -- @[RCDATA state] -- (https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state)@ -- -- 'Character'-focused production while, unlike 'RawTextState', -- resolving character reference values. | RawTextState -- ^ __HTML:__ -- @[RAWTEXT state] -- (https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state)@ -- -- 'Character'-focused production which, unlike 'RCDataState', passes -- character reference sequences unchanged. | PlainTextState -- ^ __HTML:__ -- @[PLAINTEXT state] -- (https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state)@ -- -- Blind conversion of the entire document stream into 'Character' -- tokens. | ScriptDataState -- ^ __HTML:__ -- @[script data state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-state)@ -- -- 'Character'-focused production according to the (occasionally -- complex) rules governing the handling of @\@ contents. | ScriptDataEscapedState -- ^ __HTML:__ -- @[script data escaped state] -- (https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state)@ -- -- 'Character'-focused production for data within a @\