{-|
Description:    Extract basic semantic categories from a simple textual stream.
Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com
Stability:      provisional
Portability:    portable

This module and the internal branch it heads implement the "Tokenization"
section of the
__[HTML](https://html.spec.whatwg.org/multipage/parsing.html#tokenization)__
document parsing specification, processing a stream of text to add information
on, or group it by, semantic category.  This then allows the following stage
to base its logic on such higher-level concepts as "markup tag" or "comment"
without worrying about the (sometimes complex) escaping behaviour required to
parse them.
-}
module Web.Mangrove.Parse.Tokenize
    ( -- * Types
      -- ** Final
      Token ( .. )
    , BasicAttribute
    , TagParams ( .. )
    , emptyTagParams
    , DoctypeParams ( .. )
    , emptyDoctypeParams
      -- ** Intermediate
    , TokenizerState
    , CurrentTokenizerState ( .. )
    , Encoding ( .. )
      -- * Initialization
    , defaultTokenizerState
    , tokenizerMode
    , tokenizerStartTag
    , tokenizerEncoding
      -- * Transformations
    , tokenize
    , tokenizeStep
    , finalizeTokenizer
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad.Trans.State as N.S

import qualified Data.Bifunctor as F.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Maybe as Y

import Web.Willow.DOM

import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Encoding.Preprocess
import Web.Mangrove.Parse.Tokenize.Common hiding ( setRemainder )
import Web.Mangrove.Parse.Tokenize.Dispatcher

import Web.Willow.Common.Encoding
import Web.Willow.Common.Encoding.Sniffer
import Web.Willow.Common.Parser


-- | __HTML:__
--      @[tokenization]
--      (https://html.spec.whatwg.org/multipage/parsing.html#tokenization)@
--
-- Given a starting environment, transform a binary document stream into a
-- stream of semantic atoms.  If the parse fails, returns all tokens before
-- the one which caused the error, but any trailing bytes are silently
-- dropped.
tokenize :: TokenizerState -> BS.ByteString -> ([([ParseError], Token)], TokenizerState)
tokenize state stream = loop $ tokenizeStep state stream
  where
    loop (ts, state', output)
        | BS.null output = (ts, state')
        | otherwise = F.B.first (ts ++) . loop $ tokenizeStep state' output

-- | Parse a minimal number of bytes from an input stream, into a sequence of
-- semantic tokens.  Returns all data required to seamlessly resume parsing.
tokenizeStep
    :: TokenizerState
    -> BS.ByteString
    -> ([([ParseError], Token)], TokenizerState, BS.ByteString)
tokenizeStep state stream = case runParserT (N.S.runStateT dispatcher' state) stream of
    Just ((out, state'), stream') -> (out, state', stream')
    Nothing -> ([], stateEof, BS.empty)
  where
    stateEof = state
        { decoderState_ = Right $
            setRemainder (BS.SH.toShort stream) <$> decoderDefaultState state stream
        }

-- | Explicitly indicate that the input stream will not contain any further
-- bytes, and perform any finalization processing based on that.
finalizeTokenizer :: TokenizerState -> [([ParseError], Token)]
finalizeTokenizer state = fst $ tokenize state' BS.empty
  where
    state' = state
        { tokenParserState = (tokenParserState state)
            { atEndOfStream = True
            }
        }
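-- | A usage sketch, not part of the original module: one plausible way to
-- drive the transformations above over a document which arrives in pieces
-- (e.g. from a network socket).  The helper name @tokenizeChunks@ is
-- hypothetical, and the sketch assumes, per 'tokenizeStep', that the state
-- returned by 'tokenize' carries everything required to resume parsing
-- across chunk boundaries.
tokenizeChunks :: TokenizerState -> [BS.ByteString] -> [([ParseError], Token)]
-- Once the input is exhausted, signal end-of-stream to flush any tokens the
-- parser is still buffering.
tokenizeChunks state [] = finalizeTokenizer state
-- Otherwise, emit the tokens this chunk completes and thread the returned
-- state into the next chunk.
tokenizeChunks state (chunk : chunks) =
    let (ts, state') = tokenize state chunk
    in ts ++ tokenizeChunks state' chunks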
-- | Given a string as emitted by the decoder and the final state of that
-- parser, repack it into a single list with that final state encapsulated in
-- a 'Just'; the 'init' of the string is given 'Nothing's.
repackStream :: ([([ParseError], Char)], DecoderState, BS.ByteString) -> [TokenizerInput]
repackStream ([], _, _) = []
repackStream ([(errs, c)], dState, bs) =
    [TokenizerInput errs c $ Just (Just dState, bs)]
repackStream ((errs, c):cs, dState, bs) =
    TokenizerInput errs c Nothing : repackStream (cs, dState, bs)

-- | Wrap the standard dispatcher to operate over a raw 'BS.ByteString' rather
-- than the "Web.Mangrove.Parse.Encoding" output.
dispatcher' :: StateParser TokenizerState BS.ByteString [([ParseError], Token)]
dispatcher' = do
    state <- N.S.get
    stream <- abridge
    let dState = Y.fromMaybe (initialDecoderState Utf8) $ decoderDefaultState state stream
    recurse state $ preprocessStep' dState stream
  where
    preprocessStep' dState input = case preprocessStep dState input of
        ([], _, _) -> []
        cs'@(_, dState', input') -> repackStream cs' ++ preprocessStep' dState' input'

-- | Loop the tokenization dispatcher until it returns a set of tokens which
-- happens to coincide with a decoder breakpoint.  Relies on lazy evaluation
-- in the stream generation to avoid forcing the entire thing at once, while
-- still retaining the capability to consume as much input as necessary to
-- get the parsers to line up.
recurse
    :: TokenizerState
    -> [TokenizerInput]
    -> StateParser TokenizerState BS.ByteString [([ParseError], Token)]
recurse state stream = case runParserT (N.S.runStateT dispatcher $ tokenParserState state) stream of
    Nothing -> A.empty
    Just ((out, tokState'), stream') -> case Y.listToMaybe (reverse out) >>= tokenizedState of
        Nothing -> do
            let state' = state
                    { tokenParserState = tokState'
                    }
            N.S.put state'
            out' <- recurse state' stream'
            return $ map repackOut out ++ out'
        Just (dState, dStream) -> do
            pushChunk dStream
            N.S.put $ state
                { decoderState_ = Right dState
                , tokenParserState = tokState'
                }
            return $ map repackOut out
  where
    repackOut t' = (tokenizedErrs t', tokenizedOut t')

-- | Specify which section of the finite state machine describing the
-- tokenization algorithm should be active.
tokenizerMode :: CurrentTokenizerState -> TokenizerState -> TokenizerState
tokenizerMode mode state = state
    { tokenParserState = (tokenParserState state)
        { currentState = mode
        }
    }

-- | Specify the data to use as the previous tag which had been emitted by
-- the tokenizer.  This only has to be called when required for external
-- algorithms or constructions; the parser automatically updates as required
-- for generated 'StartTag' tokens.
tokenizerStartTag :: Maybe Namespace -> ElementName -> TokenizerState -> TokenizerState
tokenizerStartTag ns name state = state
    { tokenParserState = (tokenParserState state)
        { prevStartTag = Just name
        , currentNodeNamespace = ns
        }
    }

-- | Specify the encoding scheme used by a given parse environment to read
-- from the binary input stream.  Note that this will always use the initial
-- state for the respective decoder; intermediate states as returned by
-- 'decodeStep' are not supported.
tokenizerEncoding :: Either SnifferEnvironment (Maybe Encoding) -> TokenizerState -> TokenizerState
tokenizerEncoding enc' state = state
    { decoderState_ = case enc' of
        Right Nothing -> Right Nothing
        Right (Just enc) -> Left $ Right enc
        Left env -> Left $ Left env
    }
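-- | A configuration sketch, not part of the original module: build a
-- starting state which skips encoding detection and reads the binary stream
-- as UTF-8.  The helper name @utf8TokenizerState@ is hypothetical, and
-- 'Utf8' is only an assumption for the example; any other 'Encoding', or a
-- 'SnifferEnvironment' on the 'Left', could be supplied instead.
utf8TokenizerState :: TokenizerState
utf8TokenizerState =
    -- 'Right . Just' pins a known encoding; 'Right Nothing' would instead
    -- clear any explicit choice, letting @dispatcher'@ fall back to its
    -- UTF-8 default.
    tokenizerEncoding (Right $ Just Utf8) defaultTokenizerState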