{-|
Description:    Basic tokenization rules for the character stream.

Copyright:      (c) 2020-2021 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      stable
Portability:    portable
-}
module Web.Mangrove.Parse.Tokenize.Data
    ( tokenData
    ) where


import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Tokenize.Common
import Web.Mangrove.Parse.Tokenize.Character
import Web.Mangrove.Parse.Tokenize.Tag


-- | __HTML:__
--      @[data state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#data-state)@
--
-- Entry point for the 'DataState' portion of the tokenizer state machine.
-- The next character of the stream selects one of four branches, listed in
-- the same order as they are tried below:
--
-- * @&@ runs 'tokenCharacterReference' (with the argument 'False') and feeds
--   its result to 'flushCharRef'.
-- * @<@ delegates to 'tokenTagOpen'.
-- * @NUL@ emits the character unchanged, tagged with an
--   'UnexpectedNullCharacter' error.
-- * Any other character is emitted as a 'Character' token with no errors.
--
-- The first argument to 'tokenizers' supplies a lone, error-free
-- 'EndOfStream' token; presumably this is the output used once the input is
-- exhausted (confirm against the definition of 'tokenizers').
tokenData :: Tokenizer [TokenizerOutput Token]
tokenData = tokenizers endOfInput
    [ ifs_ ('&' ==) (flushCharRef =<< tokenCharacterReference False)
    , ifs_ ('<' ==) tokenTagOpen
    , ifs_ ('\NUL' ==) nullCharacter
    , elsesChar plainCharacter
    ]
  where
    -- A single 'EndOfStream' token carrying an empty error list.
    endOfInput = Just [([], EndOfStream)]
    -- The spec keeps the NUL in the output here rather than replacing it,
    -- but still records the parse error.
    nullCharacter = emit' ([UnexpectedNullCharacter], Character '\NUL')
    -- Pass every remaining character through untouched and error-free.
    plainCharacter ch = emit' ([], Character ch)