{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- | The HTML Lexer.
module Zenacy.HTML.Internal.Lexer
  ( Lexer(..)
  , LexerOptions(..)
  , LexerSkip(..)
  , lexerNew
  , lexerSetRCDATA
  , lexerSetRAWTEXT
  , lexerSetPLAINTEXT
  , lexerSetScriptData
  , lexerSkipNextLF
  , lexerNext
  ) where

import Zenacy.HTML.Internal.BS
import Zenacy.HTML.Internal.Buffer
import Zenacy.HTML.Internal.Char
import Zenacy.HTML.Internal.Core
import Zenacy.HTML.Internal.Entity
import Zenacy.HTML.Internal.Token
import Control.Monad
  ( forM
  , forM_
  , mapM
  , when
  )
import Control.Monad.Extra
  ( whenM
  , whenJust
  )
import Control.Monad.ST
  ( ST
  )
import Data.STRef
  ( STRef
  , newSTRef
  )
import Data.DList
  ( DList
  )
import qualified Data.DList as D
  ( empty
  , snoc
  , toList
  )
import Data.Map
  ( Map
  )
import qualified Data.Map as Map
  ( fromList
  , lookup
  )
import Data.Maybe
  ( isJust
  )
import Data.Word
  ( Word8
  )
import Data.Default
  ( Default(..)
  )
import Data.Vector.Storable.Mutable
  ( MVector(..)
  )
import qualified Data.Vector.Storable.Mutable as U
  ( new
  , length
  , read
  , write
  , grow
  )

import Debug.Trace (trace)


-- | Lexer options type.
data LexerOptions = LexerOptions
  { lexerOptionInput          :: BS
  -- ^ The input to the lexer.
  , lexerOptionLogErrors      :: Bool
  -- ^ Indicates whether errors are logged.
  , lexerOptionIgnoreEntities :: Bool
  -- ^ Indicates that entities should not be tokenized.
  } deriving (Show)

-- | Defines the skip mode.
data LexerSkip
  = LexerSkipNone
  | LexerSkipLF
    deriving (Eq, Ord, Show)

-- | Defines the lexer state.
data Lexer s = Lexer
  { lexerData     :: BS
  -- ^ The lexer data.
  , lexerIgnore   :: Bool
  -- ^ Flag to ignore entity processing.
  , lexerLog      :: Bool
  -- ^ Flag to log errors.
  , lexerOffset   :: STRef s Int
  -- ^ The offset in the data for the next word to read.
  , lexerToken    :: STRef s (TokenBuffer s)
  -- ^ The token buffer.
  , lexerBuffer   :: STRef s (Buffer s)
  -- ^ The buffer is used to accumulate characters during
  --   some types of character processing.
  , lexerLast     :: STRef s [Word8]
  -- ^ The last start tag name to have been emitted.
  , lexerState    :: STRef s LexerState
  -- ^ The current lexer state in some cases.
  , lexerReturn   :: STRef s LexerState
  -- ^ The state to return to after character reference processing.
  , lexerSkip     :: STRef s LexerSkip
  -- ^ The skip next linefeed flag.
  , lexerErrors   :: STRef s (DList BS)
  -- ^ The lexer errors.
  , lexerCode     :: STRef s Int
  -- ^ The character reference code.
  }

-- | Default instance for lexer options.
instance Default LexerOptions where
  def = LexerOptions
    { lexerOptionInput          = bsEmpty
    , lexerOptionLogErrors      = False
    , lexerOptionIgnoreEntities = False
    }

-- | Defines the lexer state.
data LexerState
  = StateData
  | StateRCDATA
  | StateRAWTEXT
  | StateScriptData
  | StatePLAINTEXT
  | StateTagOpen
  | StateEndTagOpen
  | StateTagName
  | StateRCDATALessThan
  | StateRCDATAEndTagOpen
  | StateRCDATAEndTagName
  | StateRAWTEXTLessThan
  | StateRAWTEXTEndTagOpen
  | StateRAWTEXTEndTagName
  | StateScriptDataLessThan
  | StateScriptDataEndTagOpen
  | StateScriptDataEndTagName
  | StateScriptDataEscapeStart
  | StateScriptDataEscapeStartDash
  | StateScriptDataEscaped
  | StateScriptDataEscapedDash
  | StateScriptDataEscapedDashDash
  | StateScriptDataEscapedLessThan
  | StateScriptDataEscapedEndTagOpen
  | StateScriptDataEscapedEndTagName
  | StateScriptDataDoubleEscapedStart
  | StateScriptDataDoubleEscaped
  | StateScriptDataDoubleEscapedDash
  | StateScriptDataDoubleEscapedDashDash
  | StateScriptDataDoubleEscapedLessThan
  | StateScriptDataDoubleEscapeEnd
  | StateBeforeAttrName
  | StateAttrName
  | StateAfterAttrName
  | StateBeforeAttrValue
  | StateAttrValueDoubleQuoted
  | StateAttrValueSingleQuoted
  | StateAttrValueUnquoted
  | StateAfterAttrValue
  | StateSelfClosingStartTag
  | StateBogusComment
  | StateMarkupDeclarationOpen
  | StateCommentStart
  | StateCommentStartDash
  | StateComment
  | StateCommentLessThan
  | StateCommentLessThanBang
  | StateCommentLessThanBangDash
  | StateCommentLessThanBangDashDash
  | StateCommentEndDash
  | StateCommentEnd
  | StateCommentEndBang
  | StateDoctype
  | StateBeforeDoctypeName
  | StateDoctypeName
  | StateAfterDoctypeName
  | StateAfterDoctypePublicKeyword
  | StateBeforeDoctypePublicId
  | StateDoctypePublicIdDoubleQuoted
  | StateDoctypePublicIdSingleQuoted
  | StateAfterDoctypePublicId
  | StateBetweenDoctypePublicAndSystem
  | StateAfterDoctypeSystemKeyword
  | StateBeforeDoctypeSystemId
  | StateDoctypeSystemIdDoubleQuoted
  | StateDoctypeSystemIdSingleQuoted
  | StateAfterDoctypeSystemId
  | StateBogusDoctype
  | StateCDATASection
  | StateCDATASectionBracket
  | StateCDATASectionEnd
  | StateCharacterReference
  | StateNamedCharacterReference
  | StateAmbiguousAmpersand
  | StateNumericCharacterReference
  | StateHexCharacterReferenceStart
  | StateDecimalCharacterReferenceStart
  | StateHexCharacterReference
  | StateDecimalCharacterReference
  | StateNumericCharacterReferenceEnd
    deriving (Show, Eq, Ord)

lexerDispatch :: LexerState -> Lexer s -> ST s ()
lexerDispatch = \case
  StateData ->
    doData
  StateRCDATA ->
    doRCDATA
  StateRAWTEXT ->
    doRAWTEXT
  StateScriptData ->
    doScriptData
  StatePLAINTEXT ->
    doPLAINTEXT
  StateTagOpen ->
    doTagOpen
  StateEndTagOpen ->
    doEndTagOpen
  StateTagName ->
    doTagName
  StateRCDATALessThan ->
    doRCDATALessThan
  StateRCDATAEndTagOpen ->
    doRCDATAEndTagOpen
  StateRCDATAEndTagName ->
    doRCDATAEndTagName
  StateRAWTEXTLessThan ->
    doRAWTEXTLessThan
  StateRAWTEXTEndTagOpen ->
    doRAWTEXTEndTagOpen
  StateRAWTEXTEndTagName ->
    doRAWTEXTEndTagName
  StateScriptDataLessThan ->
    doScriptDataLessThan
  StateScriptDataEndTagOpen ->
    doScriptDataEndTagOpen
  StateScriptDataEndTagName ->
    doScriptDataEndTagName
  StateScriptDataEscapeStart ->
    doScriptDataEscapeStart
  StateScriptDataEscapeStartDash ->
    doScriptDataEscapeStartDash
  StateScriptDataEscaped ->
    doScriptDataEscaped
  StateScriptDataEscapedDash ->
    doScriptDataEscapedDash
  StateScriptDataEscapedDashDash ->
    doScriptDataEscapedDashDash
  StateScriptDataEscapedLessThan ->
    doScriptDataEscapedLessThan
  StateScriptDataEscapedEndTagOpen ->
    doScriptDataEscapedEndTagOpen
  StateScriptDataEscapedEndTagName ->
    doScriptDataEscapedEndTagName
  StateScriptDataDoubleEscapedStart ->
    doScriptDataDoubleEscapedStart
  StateScriptDataDoubleEscaped ->
    doScriptDataDoubleEscaped
  StateScriptDataDoubleEscapedDash ->
    doScriptDataDoubleEscapedDash
  StateScriptDataDoubleEscapedDashDash ->
    doScriptDataDoubleEscapedDashDash
  StateScriptDataDoubleEscapedLessThan ->
    doScriptDataDoubleEscapedLessThan
  StateScriptDataDoubleEscapeEnd ->
    doScriptDataDoubleEscapeEnd
  StateBeforeAttrName ->
    doBeforeAttrName
  StateAttrName ->
    doAttrName
  StateAfterAttrName ->
    doAfterAttrName
  StateBeforeAttrValue ->
    doBeforeAttrValue
  StateAttrValueDoubleQuoted ->
    doAttrValueDoubleQuoted
  StateAttrValueSingleQuoted ->
    doAttrValueSingleQuoted
  StateAttrValueUnquoted ->
    doAttrValueUnquoted
  StateAfterAttrValue ->
    doAfterAttrValue
  StateSelfClosingStartTag ->
    doSelfClosingStartTag
  StateBogusComment ->
    doBogusComment
  StateMarkupDeclarationOpen ->
    doMarkupDeclarationOpen
  StateCommentStart ->
    doCommentStart
  StateCommentStartDash ->
    doCommentStartDash
  StateComment ->
    doComment
  StateCommentLessThan ->
    doCommentLessThan
  StateCommentLessThanBang ->
    doCommentLessThanBang
  StateCommentLessThanBangDash ->
    doCommentLessThanBangDash
  StateCommentLessThanBangDashDash ->
    doCommentLessThanBangDashDash
  StateCommentEndDash ->
    doCommentEndDash
  StateCommentEnd ->
    doCommentEnd
  StateCommentEndBang ->
    doCommentEndBang
  StateDoctype ->
    doDoctype
  StateBeforeDoctypeName ->
    doBeforeDoctypeName
  StateDoctypeName ->
    doDoctypeName
  StateAfterDoctypeName ->
    doAfterDoctypeName
  StateAfterDoctypePublicKeyword ->
    doAfterDoctypePublicKeyword
  StateBeforeDoctypePublicId ->
    doBeforeDoctypePublicId
  StateDoctypePublicIdDoubleQuoted ->
    doDoctypePublicIdDoubleQuoted
  StateDoctypePublicIdSingleQuoted ->
    doDoctypePublicIdSingleQuoted
  StateAfterDoctypePublicId ->
    doAfterDoctypePublicId
  StateBetweenDoctypePublicAndSystem ->
    doBetweenDoctypePublicAndSystem
  StateAfterDoctypeSystemKeyword ->
    doAfterDoctypeSystemKeyword
  StateBeforeDoctypeSystemId ->
    doBeforeDoctypeSystemId
  StateDoctypeSystemIdDoubleQuoted ->
    doDoctypeSystemIdDoubleQuoted
  StateDoctypeSystemIdSingleQuoted ->
    doDoctypeSystemIdSingleQuoted
  StateAfterDoctypeSystemId ->
    doAfterDoctypeSystemId
  StateBogusDoctype ->
    doBogusDoctype
  StateCDATASection ->
    doCDATASection
  StateCDATASectionBracket ->
    doCDATASectionBracket
  StateCDATASectionEnd ->
    doCDATASectionEnd
  StateCharacterReference ->
    doCharacterReference
  StateNamedCharacterReference ->
    doNamedCharacterReference
  StateAmbiguousAmpersand ->
    doAmbiguousAmpersand
  StateNumericCharacterReference ->
    doNumericCharacterReference
  StateHexCharacterReferenceStart ->
    doHexCharacterReferenceStart
  StateDecimalCharacterReferenceStart ->
    doDecimalCharacterReferenceStart
  StateHexCharacterReference ->
    doHexCharacterReference
  StateDecimalCharacterReference ->
    doDecimalCharacterReference
  StateNumericCharacterReferenceEnd ->
    doNumericCharacterReferenceEnd

-- | Makes a new lexer.
lexerNew :: LexerOptions -> ST s (Either BS (Lexer s))
lexerNew o@LexerOptions{..}
  | Just i <- bsElemIndex 0 lexerOptionInput =
      pure $ Left $ bsConcat [ "input contains null at ", bcPack $ show i ]
  | otherwise =
      Right <$> lexerMake o

-- | Makes a new lexer.
lexerMake :: LexerOptions -> ST s (Lexer s)
lexerMake LexerOptions{..} = do
  offset <- newSTRef 0
  state  <- newSTRef StateData
  ret    <- newSTRef StateData
  token  <- tokenBuffer
  buffer <- bufferNew
  last   <- newSTRef []
  skip   <- newSTRef LexerSkipNone
  errors <- newSTRef D.empty
  code   <- newSTRef 0
  pure $ Lexer
    { lexerData   = lexerOptionInput
    , lexerIgnore = lexerOptionIgnoreEntities
    , lexerLog    = lexerOptionLogErrors
    , lexerOffset = offset
    , lexerToken  = token
    , lexerBuffer = buffer
    , lexerLast   = last
    , lexerState  = state
    , lexerReturn = ret
    , lexerSkip   = skip
    , lexerErrors = errors
    , lexerCode   = code
    }

-- | Sets the RCDATA mode.
lexerSetRCDATA :: Lexer s -> ST s ()
lexerSetRCDATA Lexer{..} = wref lexerState StateRCDATA

-- | Sets the raw text mode.
lexerSetRAWTEXT :: Lexer s -> ST s ()
lexerSetRAWTEXT Lexer{..} = wref lexerState StateRAWTEXT

-- | Sets the plain text mode.
lexerSetPLAINTEXT :: Lexer s -> ST s ()
lexerSetPLAINTEXT Lexer{..} = wref lexerState StatePLAINTEXT

-- | Sets the script data mode.
lexerSetScriptData :: Lexer s -> ST s ()
lexerSetScriptData Lexer{..} = wref lexerState StateScriptData

-- | Sets the skip next linefeed flag.
lexerSkipNextLF :: Lexer s -> ST s ()
lexerSkipNextLF Lexer{..} = wref lexerSkip LexerSkipLF

-- | Gets the next token from a lexer.
lexerNext :: Lexer s -> ST s Token
lexerNext x@Lexer {..} =
  skip
  where
    skip = do
      t <- next
      case t of
        TChar 0x10 ->
          rref lexerSkip >>= \case
            LexerSkipLF ->
              wref lexerSkip LexerSkipNone >> skip
            LexerSkipNone ->
              pure t
        _otherwise ->
          pure t

    next = do
      i <- tokenNext lexerToken
      if i == 0
         then do
           tokenReset lexerToken
           s <- rref lexerState
           lexerDispatch s x
           i <- tokenFirst lexerToken
           if i == 0
              then pure TEOF
              else tokenPack i lexerToken
         else do
           tokenPack i lexerToken

-- | Gets the next word from a lexer.
nextWord :: Lexer s -> ST s Word8
nextWord Lexer {..} = do
  offset <- rref lexerOffset
  if | offset < bsLen lexerData -> do
         wref lexerOffset (offset + 1)
         pure $ bsIndex lexerData offset
     | otherwise ->
         pure chrEOF

-- | Gets the next word from a lexer without advancing.
peekWord :: Lexer s -> ST s Word8
peekWord Lexer {..} = do
  offset <- rref lexerOffset
  if | offset < bsLen lexerData ->
         pure $ bsIndex lexerData offset
     | otherwise ->
         pure chrEOF

-- | Moves the lexer back one word.
backWord :: Lexer s -> ST s ()
backWord Lexer {..} = uref lexerOffset (subtract 1)

-- | Skips the specified number of words.
skipWord :: Lexer s -> Int -> ST s ()
skipWord Lexer {..} n = uref lexerOffset (+n)

-- | Gets an indexing function from the current lexer offset.
dataIndexer :: Lexer s -> ST s (Int -> Word8)
dataIndexer Lexer {..} = do
  offset <- rref lexerOffset
  pure $ \i -> bsIndex lexerData (offset + i)

-- | Gets the remaining number of words.
dataRemain :: Lexer s -> ST s Int
dataRemain Lexer {..} = do
  offset <- rref lexerOffset
  pure $ bsLen lexerData - offset

-- | Emits the last token in the token buffer.
emit :: Lexer s -> ST s ()
emit Lexer {..} = do
  tokenTail lexerToken >>= flip tokenTagStartName lexerToken >>= \case
    Just a  -> wref lexerLast a
    Nothing -> pure ()

-- | Emits a character token.
emitChar :: Lexer s -> Word8 -> ST s ()
emitChar x@Lexer {..} w = do
  tokenCharInit w lexerToken
  emit x

-- | Emits the characters in the buffer.
emitBuffer :: Lexer s -> ST s ()
emitBuffer x@Lexer {..} = do
  bufferApply (emitChar x) lexerBuffer
  bufferReset lexerBuffer

-- | Sets the lexer state.
state :: Lexer s -> LexerState -> ST s ()
state Lexer {..} = wref lexerState

-- | Sets the lexer return state.
returnSet :: Lexer s -> LexerState -> ST s ()
returnSet x@Lexer {..} = wref lexerReturn

-- | Gets the lexer return state.
returnGet :: Lexer s -> ST s LexerState
returnGet x@Lexer {..} = rref lexerReturn

-- | Switches to the return state.
returnState :: Lexer s -> ST s ()
returnState x = returnGet x >>= flip lexerDispatch x

-- | Handles parse errors.
parseError :: Lexer s -> BS -> ST s ()
parseError x@Lexer {..} =
  when lexerLog . (uref lexerErrors . flip D.snoc)

-- | Determines if the current token is an appropriate end tag.
appropriateEndTag :: Lexer s -> ST s Bool
appropriateEndTag x@Lexer {..} = do
  tokenTail lexerToken >>= flip tokenTagEndName lexerToken >>= \case
    Just a  -> (==a) <$> rref lexerLast
    Nothing -> pure False

-- | Determines if an attribute value is currently being consumed.
consumingAttibute :: Lexer s -> ST s Bool
consumingAttibute x@Lexer {..} = do
  a <- returnGet x
  pure $ any (==a)
    [ StateAttrValueDoubleQuoted
    , StateAttrValueSingleQuoted
    , StateAttrValueUnquoted
    ]

-- | Flushes the code points stored in the buffer.
flushCodePoints :: Lexer s -> ST s ()
flushCodePoints x@Lexer {..} = do
  a <- consumingAttibute x
  if | a -> do
         bufferApply (flip tokenAttrValAppend lexerToken) lexerBuffer
         bufferReset lexerBuffer
     | otherwise -> do
         emitBuffer x

-- 12.2.5.1 Data state
doData :: Lexer s -> ST s ()
doData x@Lexer {..} = do
  c <- nextWord x
  if | c == chrAmpersand -> do
         returnSet x StateData
         doCharacterReference x
     | c == chrLess -> do
         doTagOpen x
     | c == chrEOF -> do
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doData x

-- 12.2.5.2 RCDATA state
doRCDATA :: Lexer s -> ST s ()
doRCDATA x@Lexer {..} = do
  c <- nextWord x
  if | c == chrAmpersand -> do
         returnSet x StateRCDATA
         doCharacterReference x
     | c == chrLess -> do
         doRCDATALessThan x
     | c == chrEOF -> do
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doRCDATA x

-- 12.2.5.3 RAWTEXT state
doRAWTEXT :: Lexer s -> ST s ()
doRAWTEXT x@Lexer {..} = do
  c <- nextWord x
  if | c == chrLess -> do
         doRAWTEXTLessThan x
     | c == chrEOF -> do
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doRAWTEXT x

-- 12.2.5.4 Script data state
doScriptData :: Lexer s -> ST s ()
doScriptData x@Lexer {..} = do
  c <- nextWord x
  if | c == chrLess -> do
         doScriptDataLessThan x
     | c == chrEOF -> do
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doScriptData x

-- 12.2.5.5 PLAINTEXT state
doPLAINTEXT :: Lexer s -> ST s ()
doPLAINTEXT x@Lexer {..} = do
  c <- nextWord x
  if | c == chrEOF -> do
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doPLAINTEXT x

-- 12.2.5.6 Tag open state
doTagOpen :: Lexer s -> ST s ()
doTagOpen x@Lexer {..} = do
  c <- nextWord x
  if | c == chrExclamation -> do
         doMarkupDeclarationOpen x
     | c == chrSolidus -> do
         doEndTagOpen x
     | chrASCIIAlpha c -> do
         tokenTagStartInit lexerToken
         backWord x
         doTagName x
     | c == chrQuestion -> do
         parseError x "unexpected-question-mark-instead-of-tag-name"
         tokenCommentInit lexerToken
         backWord x
         doBogusComment x
     | c == chrEOF -> do
         parseError x "eof-before-tag-name"
         tokenCharInit chrLess lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "invalid-first-character-of-tag-name"
         backWord x
         doData x

-- 12.2.5.7 End tag open state
doEndTagOpen :: Lexer s -> ST s ()
doEndTagOpen x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIAlpha c -> do
         tokenTagEndInit lexerToken
         backWord x
         doTagName x
     | c == chrGreater -> do
         parseError x "missing-end-tag-name"
         doData x
     | c == chrEOF -> do
         parseError x "eof-before-tag-name"
         tokenCharInit chrLess lexerToken
         tokenCharInit chrSolidus lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "invalid-first-character-of-tag-name"
         tokenCommentInit lexerToken
         backWord x
         doBogusComment x

-- 12.2.5.8 Tag name state
doTagName :: Lexer s -> ST s ()
doTagName x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeAttrName x
     | c == chrSolidus -> do
         doSelfClosingStartTag x
     | c == chrGreater -> do
         state x StateData
         emit x
     | chrASCIIUpperAlpha c -> do
         tokenTagNameAppend (chrToLower c) lexerToken
         doTagName x
     | c == chrEOF -> do
         parseError x "eof-in-tag"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenTagNameAppend c lexerToken
         doTagName x

-- 12.2.5.9 RCDATA less-than sign state
doRCDATALessThan :: Lexer s -> ST s ()
doRCDATALessThan x@Lexer {..} = do
  c <- nextWord x
  if | c == chrSolidus -> do
         bufferReset lexerBuffer
         doRCDATAEndTagOpen x
     | otherwise -> do
         tokenCharInit chrLess lexerToken
         backWord x
         doRCDATA x

-- 12.2.5.10 RCDATA end tag open state
doRCDATAEndTagOpen :: Lexer s -> ST s ()
doRCDATAEndTagOpen x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIAlpha c -> do
         tokenTagEndInit lexerToken
         backWord x
         doRCDATAEndTagName x
     | otherwise -> do
         tokenCharInit chrLess lexerToken
         tokenCharInit chrSolidus lexerToken
         backWord x
         doRCDATA x

-- 12.2.5.11 RCDATA end tag name state
doRCDATAEndTagName :: Lexer s -> ST s ()
doRCDATAEndTagName x@Lexer {..} = do
  c <- nextWord x
  a <- appropriateEndTag x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         if a
         then do
           doBeforeAttrName x
         else do
           anythingElse
     | c == chrSolidus -> do
         if a
         then do
           doSelfClosingStartTag x
         else do
           anythingElse
     | c == chrGreater -> do
         if a
         then do
           state x StateData
           emit x
         else do
           anythingElse
     | chrASCIIUpperAlpha c -> do
         tokenTagNameAppend (chrToLower c) lexerToken
         bufferAppend c lexerBuffer
         doRCDATAEndTagName x
     | chrASCIILowerAlpha c -> do
         tokenTagNameAppend c lexerToken
         bufferAppend c lexerBuffer
         doRCDATAEndTagName x
     | otherwise -> do
         anythingElse
  where
    anythingElse = do
      tokenDrop lexerToken
      emitChar x chrLess
      emitChar x chrSolidus
      emitBuffer x
      backWord x
      doRCDATA x

-- 12.2.5.12 RAWTEXT less-than sign state
doRAWTEXTLessThan :: Lexer s -> ST s ()
doRAWTEXTLessThan x@Lexer {..} = do
  c <- nextWord x
  if | c == chrSolidus -> do
         bufferReset lexerBuffer
         doRAWTEXTEndTagOpen x
     | otherwise -> do
         tokenCharInit chrLess lexerToken
         backWord x
         doRAWTEXT x

-- 12.2.5.13 RAWTEXT end tag open state
doRAWTEXTEndTagOpen :: Lexer s -> ST s ()
doRAWTEXTEndTagOpen x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIAlpha c -> do
         tokenTagEndInit lexerToken
         backWord x
         doRAWTEXTEndTagName x
     | otherwise -> do
         emitChar x chrLess
         emitChar x chrSolidus
         backWord x
         doRAWTEXT x

-- 12.2.5.14 RAWTEXT end tag name state
doRAWTEXTEndTagName :: Lexer s -> ST s ()
doRAWTEXTEndTagName x@Lexer {..} = do
  c <- nextWord x
  a <- appropriateEndTag x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         if a
         then do
           doBeforeAttrName x
         else do
           anythingElse
     | c == chrSolidus -> do
         if a
         then do
           doSelfClosingStartTag x
         else do
           anythingElse
     | c == chrGreater -> do
         if a
         then do
           state x StateData
           emit x
         else do
           anythingElse
     | chrASCIIUpperAlpha c -> do
         tokenTagNameAppend (chrToLower c) lexerToken
         bufferAppend c lexerBuffer
         doRAWTEXTEndTagName x
     | chrASCIILowerAlpha c -> do
         tokenTagNameAppend c lexerToken
         bufferAppend c lexerBuffer
         doRAWTEXTEndTagName x
     | otherwise -> do
         anythingElse
  where
    anythingElse = do
      tokenDrop lexerToken
      emitChar x chrLess
      emitChar x chrSolidus
      emitBuffer x
      backWord x
      doRAWTEXT x

-- 12.2.5.15 Script data less-than sign state
doScriptDataLessThan :: Lexer s -> ST s ()
doScriptDataLessThan x@Lexer {..} = do
  c <- nextWord x
  if | c == chrSolidus -> do
         bufferReset lexerBuffer
         doScriptDataEndTagOpen x
     | c == chrExclamation -> do
         tokenCharInit chrLess lexerToken
         tokenCharInit chrExclamation lexerToken
         doScriptDataEscapeStart x
     | otherwise -> do
         tokenCharInit chrLess lexerToken
         backWord x
         doScriptData x

-- 12.2.5.16 Script data end tag open state
doScriptDataEndTagOpen :: Lexer s -> ST s ()
doScriptDataEndTagOpen x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIAlpha c -> do
         tokenTagEndInit lexerToken
         backWord x
         doScriptDataEndTagName x
     | otherwise -> do
         emitChar x chrLess
         emitChar x chrSolidus
         backWord x
         doScriptData x

-- 12.2.5.17 Script data end tag name state
doScriptDataEndTagName :: Lexer s -> ST s ()
doScriptDataEndTagName x@Lexer {..} = do
  c <- nextWord x
  a <- appropriateEndTag x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         if a
         then do
           doBeforeAttrName x
         else do
           anythingElse
     | c == chrSolidus -> do
         if a
         then do
           doSelfClosingStartTag x
         else do
           anythingElse
     | c == chrGreater -> do
         if a
         then do
           state x StateData
           emit x
         else do
           anythingElse
     | chrASCIIUpperAlpha c -> do
         tokenTagNameAppend (chrToLower c) lexerToken
         bufferAppend c lexerBuffer
         doScriptDataEndTagName x
     | chrASCIILowerAlpha c -> do
         tokenTagNameAppend c lexerToken
         bufferAppend c lexerBuffer
         doScriptDataEndTagName x
     | otherwise -> do
         anythingElse
  where
    anythingElse = do
      tokenDrop lexerToken
      emitChar x chrLess
      emitChar x chrSolidus
      emitBuffer x
      backWord x
      doScriptData x

-- 12.2.5.18 Script data escape start state
doScriptDataEscapeStart :: Lexer s -> ST s ()
doScriptDataEscapeStart x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         emitChar x chrHyphen
         doScriptDataEscapeStartDash x
     | otherwise -> do
         backWord x
         doScriptData x

-- 12.2.5.19 Script data escape start dash state
doScriptDataEscapeStartDash :: Lexer s -> ST s ()
doScriptDataEscapeStartDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         emitChar x chrHyphen
         doScriptDataEscapedDashDash x
     | otherwise -> do
         backWord x
         doScriptData x

-- 12.2.5.20 Script data escaped state
doScriptDataEscaped :: Lexer s -> ST s ()
doScriptDataEscaped x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         emitChar x chrHyphen
         doScriptDataEscapedDash x
     | c == chrLess -> do
         doScriptDataEscapedLessThan x
     | c == chrEOF -> do
         parseError x "eof-in-script-html-comment-like-text"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doScriptDataEscaped x

-- 12.2.5.21 Script data escaped dash state
doScriptDataEscapedDash :: Lexer s -> ST s ()
doScriptDataEscapedDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         emitChar x chrHyphen
         doScriptDataEscapedDashDash x
     | c == chrLess -> do
         doScriptDataEscapedLessThan x
     | c == chrEOF -> do
         parseError x "eof-in-script-html-comment-like-text"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doScriptDataEscaped x

-- 12.2.5.22 Script data escaped dash dash state
doScriptDataEscapedDashDash :: Lexer s -> ST s ()
doScriptDataEscapedDashDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         emitChar x c
         doScriptDataEscapedDashDash x
     | c == chrLess -> do
         doScriptDataEscapedLessThan x
     | c == chrGreater -> do
         emitChar x c
         doScriptData x
     | c == chrEOF -> do
         parseError x "eof-in-script-html-comment-like-text"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doScriptDataEscaped x

-- 12.2.5.23 Script data escaped less-than sign state
doScriptDataEscapedLessThan :: Lexer s -> ST s ()
doScriptDataEscapedLessThan x@Lexer {..} = do
  c <- nextWord x
  if | c == chrSolidus -> do
         bufferReset lexerBuffer
         doScriptDataEscapedEndTagOpen x
     | chrASCIIAlpha c -> do
         bufferReset lexerBuffer
         emitChar x chrLess
         backWord x
         doScriptDataDoubleEscapedStart x
     | otherwise -> do
         emitChar x chrLess
         backWord x
         doScriptDataEscaped x

-- 12.2.5.24 Script data escaped end tag open state
doScriptDataEscapedEndTagOpen :: Lexer s -> ST s ()
doScriptDataEscapedEndTagOpen x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIAlpha c -> do
         tokenTagEndInit lexerToken
         backWord x
         doScriptDataEscapedEndTagName x
     | otherwise -> do
         emitChar x chrLess
         emitChar x chrSolidus
         backWord x
         doScriptDataEscaped x

-- 12.2.5.25 Script data escaped end tag name state
doScriptDataEscapedEndTagName :: Lexer s -> ST s ()
doScriptDataEscapedEndTagName x@Lexer {..} = do
  c <- nextWord x
  a <- appropriateEndTag x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         if a
         then do
           doBeforeAttrName x
         else do
           anythingElse
     | c == chrSolidus -> do
         if a
         then do
           doSelfClosingStartTag x
         else do
           anythingElse
     | c == chrGreater -> do
         if a
         then do
           state x StateData
           emit x
         else do
           anythingElse
     | chrASCIIUpperAlpha c -> do
         tokenTagNameAppend (chrToLower c) lexerToken
         bufferAppend c lexerBuffer
         doScriptDataEscapedEndTagName x
     | chrASCIILowerAlpha c -> do
         tokenTagNameAppend c lexerToken
         bufferAppend c lexerBuffer
         doScriptDataEscapedEndTagName x
     | otherwise -> do
         anythingElse
  where
    anythingElse = do
      tokenDrop lexerToken
      emitChar x chrLess
      emitChar x chrSolidus
      emitBuffer x
      backWord x
      state x StateScriptDataEscaped
      emit x

-- 12.2.5.26 Script data double escape start state
doScriptDataDoubleEscapedStart :: Lexer s -> ST s ()
doScriptDataDoubleEscapedStart x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace ||
       c == chrSolidus ||
       c == chrGreater -> do
         bufferContains (bsUnpack "script") lexerBuffer >>= \case
           True -> do
             doScriptDataDoubleEscaped x
           False -> do
             tokenCharInit c lexerToken
             state x StateScriptDataEscaped
             emit x
     | chrASCIIUpperAlpha c -> do
         bufferAppend (chrToLower c) lexerBuffer
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscapedStart
         emit x
     | chrASCIILowerAlpha c -> do
         bufferAppend c lexerBuffer
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscapedStart
         emit x
     | otherwise -> do
         backWord x
         doScriptDataEscaped x

-- 12.2.5.27 Script data double escaped state
doScriptDataDoubleEscaped :: Lexer s -> ST s ()
doScriptDataDoubleEscaped x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscapedDash
         emit x
     | c == chrLess -> do
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscapedLessThan
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-script-html-comment-like-text"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCharInit c lexerToken
         emit x

-- 12.2.5.28 Script data double escaped dash state
doScriptDataDoubleEscapedDash :: Lexer s -> ST s ()
doScriptDataDoubleEscapedDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscapedDashDash
         emit x
     | c == chrLess -> do
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscapedLessThan
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-script-html-comment-like-text"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscaped
         emit x

-- 12.2.5.29 Script data double escaped dash dash state
doScriptDataDoubleEscapedDashDash :: Lexer s -> ST s ()
doScriptDataDoubleEscapedDashDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         emitChar x c
         doScriptDataDoubleEscapedDashDash x
     | c == chrLess -> do
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscapedLessThan
         emit x
     | c == chrGreater -> do
         tokenCharInit c lexerToken
         state x StateScriptData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-script-html-comment-like-text"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscaped
         emit x

-- 12.2.5.30 Script data double escaped less-than sign state
doScriptDataDoubleEscapedLessThan :: Lexer s -> ST s ()
doScriptDataDoubleEscapedLessThan x@Lexer {..} = do
  c <- nextWord x
  if | c == chrSolidus -> do
         bufferReset lexerBuffer
         tokenCharInit c lexerToken
         state x StateScriptDataDoubleEscapeEnd
         emit x
     | otherwise -> do
         backWord x
         doScriptDataDoubleEscaped x

-- 12.2.5.31 Script data double escape end state
doScriptDataDoubleEscapeEnd :: Lexer s -> ST s ()
doScriptDataDoubleEscapeEnd x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace ||
       c == chrSolidus ||
       c == chrGreater -> do
         bufferContains (bsUnpack "script") lexerBuffer >>= \case
           True -> do
             doScriptDataEscaped x
           False -> do
             tokenCharInit c lexerToken
             state x StateScriptDataDoubleEscaped
             emit x
     | chrASCIIUpperAlpha c -> do
         bufferAppend (chrToLower c) lexerBuffer
         emitChar x c
         doScriptDataDoubleEscapeEnd x
     | chrASCIILowerAlpha c -> do
         bufferAppend c lexerBuffer
         emitChar x c
         doScriptDataDoubleEscapeEnd x
     | otherwise -> do
         backWord x
         doScriptDataDoubleEscaped x

-- 12.2.5.32 Before attribute name state
doBeforeAttrName :: Lexer s -> ST s ()
doBeforeAttrName x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeAttrName x
     | c == chrSolidus ||
       c == chrGreater ||
       c == chrEOF -> do
         backWord x
         doAfterAttrName x
     | c == chrEqual -> do
         parseError x "unexpected-equals-sign-before-attribute-name"
         tokenAttrInit lexerToken
         tokenAttrNameAppend c lexerToken
         doAttrName x
     | otherwise -> do
         tokenAttrInit lexerToken
         backWord x
         doAttrName x

-- 12.2.5.33 Attribute name state
doAttrName :: Lexer s -> ST s ()
doAttrName x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace ||
       c == chrSolidus ||
       c == chrGreater ||
       c == chrEOF -> do
         checkAttr
         backWord x
         doAfterAttrName x
     | c == chrEqual -> do
         checkAttr
         doBeforeAttrValue x
     | chrASCIIUpperAlpha c -> do
         tokenAttrNameAppend (chrToLower c) lexerToken
         doAttrName x
     | c == chrQuote ||
       c == chrApostrophe ||
       c == chrLess -> do
         parseError x "unexpected-character-in-attribute-name"
         tokenAttrNameAppend c lexerToken
         doAttrName x
     | otherwise -> do
         tokenAttrNameAppend c lexerToken
         doAttrName x
  where
    checkAttr = do
      i <- tokenTail lexerToken
      whenM (tokenAttrNamePrune i lexerToken) $ do
        parseError x "duplicate-attribute"

-- 12.2.5.34 After attribute name state
doAfterAttrName :: Lexer s -> ST s ()
doAfterAttrName x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doAfterAttrName x
     | c == chrSolidus -> do
         doSelfClosingStartTag x
     | c == chrEqual -> do
         doBeforeAttrValue x
     | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-tag"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenAttrInit lexerToken
         backWord x
         doAttrName x

-- 12.2.5.35 Before attribute value state
doBeforeAttrValue :: Lexer s -> ST s ()
doBeforeAttrValue x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeAttrValue x
     | c == chrQuote -> do
         doAttrValueDoubleQuoted x
     | c == chrApostrophe -> do
         doAttrValueSingleQuoted x
     | c == chrGreater -> do
         parseError x "missing-attribute-value"
         state x StateData
         emit x
     | otherwise -> do
         backWord x
         doAttrValueUnquoted x

-- 12.2.5.36 Attribute value (double-quoted) state
doAttrValueDoubleQuoted :: Lexer s -> ST s ()
doAttrValueDoubleQuoted x@Lexer {..} = do
  c <- nextWord x
  if | c == chrQuote -> do
         doAfterAttrValue x
     | c == chrAmpersand -> do
         returnSet x StateAttrValueDoubleQuoted
         doCharacterReference x
     | c == chrEOF -> do
         parseError x "eof-in-tag"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenAttrValAppend c lexerToken
         doAttrValueDoubleQuoted x

-- 12.2.5.37 Attribute value (single-quoted) state
doAttrValueSingleQuoted :: Lexer s -> ST s ()
doAttrValueSingleQuoted x@Lexer {..} = do
  c <- nextWord x
  if | c == chrApostrophe -> do
         doAfterAttrValue x
     | c == chrAmpersand -> do
         returnSet x StateAttrValueSingleQuoted
         doCharacterReference x
     | c == chrEOF -> do
         parseError x "eof-in-tag"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenAttrValAppend c lexerToken
         doAttrValueSingleQuoted x

-- 12.2.5.38 Attribute value (unquoted) state
doAttrValueUnquoted :: Lexer s -> ST s ()
doAttrValueUnquoted x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeAttrName x
     | c == chrAmpersand -> do
         returnSet x StateAttrValueUnquoted
         doCharacterReference x
     | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrQuote ||
       c == chrApostrophe ||
       c == chrLess ||
       c == chrEqual ||
       c == chrGrave -> do
         parseError x "unexpected-character-in-unquoted-attribute-value"
         tokenAttrValAppend c lexerToken
         doAttrValueUnquoted x
     | c == chrEOF -> do
         parseError x "eof-in-tag"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenAttrValAppend c lexerToken
         doAttrValueUnquoted x

-- 12.2.5.39 After attribute value (quoted) state
doAfterAttrValue :: Lexer s -> ST s ()
doAfterAttrValue x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeAttrName x
     | c == chrSolidus -> do
         doSelfClosingStartTag x
     | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-tag"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "missing-whitespace-between-attributes"
         backWord x
         doBeforeAttrName x

-- 12.2.5.40 Self-closing start tag state
doSelfClosingStartTag :: Lexer s -> ST s ()
doSelfClosingStartTag x@Lexer {..} = do
  c <- nextWord x
  if | c == chrGreater -> do
         tokenTagStartSetSelfClosing lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-tag"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "unexpected-solidus-in-tag"
         backWord x
         doBeforeAttrName x

-- 12.2.5.41 Bogus comment state
doBogusComment :: Lexer s -> ST s ()
doBogusComment x@Lexer {..} = do
  c <- nextWord x
  if | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrEOF -> do
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCommentAppend c lexerToken
         doBogusComment x

-- 12.2.5.42 Markup declaration open state
doMarkupDeclarationOpen :: Lexer s -> ST s ()
doMarkupDeclarationOpen x@Lexer {..} = do
  f <- dataIndexer x
  n <- dataRemain x
  if | n > 1 &&
       f 0 == chrHyphen &&
       f 1 == chrHyphen -> do
         skipWord x 2
         tokenCommentInit lexerToken
         doCommentStart x
     | n > 6 &&
       (f 0 == 0x44 || f 0 == 0x64) &&    -- D or d
       (f 1 == 0x4F || f 1 == 0x6F) &&    -- O or o
       (f 2 == 0x43 || f 2 == 0x63) &&    -- C or c
       (f 3 == 0x54 || f 3 == 0x74) &&    -- T or t
       (f 4 == 0x59 || f 4 == 0x79) &&    -- Y or y
       (f 5 == 0x50 || f 5 == 0x70) &&    -- P or p
       (f 6 == 0x45 || f 6 == 0x65) -> do -- E or e
         skipWord x 7
         doDoctype x
     | n > 6 &&
       f 0 == 0x5B &&    -- [
       f 1 == 0x43 &&    -- C
       f 2 == 0x44 &&    -- D
       f 3 == 0x41 &&    -- A
       f 4 == 0x54 &&    -- T
       f 5 == 0x41 &&    -- A
       f 6 == 0x5B -> do -- [
         skipWord x 7
  -- The standard says to check if the parser has an adjusted current node
  -- that is not in the HTML namespace, and if so then CDATA sections are
  -- allowed.  For now we assume that condition is true and we do not
  -- check with the parser to verify.  CDATA sections in HTML are probably
  -- not common and in most cases they are going to be in the correct
  -- locations anyway.
         if True -- TODO
            then do
              state x StateCDATASection
              doCDATASection x
            else do
              parseError x "cdata-in-html-content"
              tokenCommentInit lexerToken
              mapM_ (flip tokenCommentAppend lexerToken)
                [ 0x5B, 0x43, 0x44, 0x41, 0x54, 0x41, 0x5B ]
              doBogusComment x
     | otherwise -> do
         parseError x "incorrectly-opened-comment"
         tokenCommentInit lexerToken
         doBogusComment x

-- 12.2.5.43 Comment start state
doCommentStart :: Lexer s -> ST s ()
doCommentStart x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         doCommentStartDash x
     | c == chrGreater -> do
         parseError x "abrupt-closing-of-empty-comment"
         state x StateData
         emit x
     | otherwise -> do
         backWord x
         doComment x

-- 12.2.5.44 Comment start dash state
doCommentStartDash :: Lexer s -> ST s ()
doCommentStartDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         doCommentEnd x
     | c == chrGreater -> do
         parseError x "abrupt-closing-of-empty-comment"
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-comment"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCommentAppend chrHyphen lexerToken
         backWord x
         doComment x

-- 12.2.5.45 Comment state
doComment :: Lexer s -> ST s ()
doComment x@Lexer {..} = do
  c <- nextWord x
  if | c == chrLess -> do
         tokenCommentAppend c lexerToken
         doCommentLessThan x
     | c == chrHyphen -> do
         doCommentEndDash x
     | c == chrEOF -> do
         parseError x "eof-in-comment"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCommentAppend c lexerToken
         doComment x

-- 12.2.5.46 Comment less-than sign state
doCommentLessThan :: Lexer s -> ST s ()
doCommentLessThan x@Lexer {..} = do
  c <- nextWord x
  if | c == chrExclamation -> do
         tokenCommentAppend c lexerToken
         doCommentLessThanBang x
     | c == chrLess -> do
         tokenCommentAppend c lexerToken
         doCommentLessThan x
     | otherwise -> do
         backWord x
         doComment x

-- 12.2.5.47 Comment less-than sign bang state
doCommentLessThanBang :: Lexer s -> ST s ()
doCommentLessThanBang x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         doCommentLessThanBangDash x
     | otherwise -> do
         backWord x
         doComment x

-- 12.2.5.48 Comment less-than sign bang dash state
doCommentLessThanBangDash :: Lexer s -> ST s ()
doCommentLessThanBangDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         doCommentLessThanBangDashDash x
     | otherwise -> do
         backWord x
         doCommentEndDash x

-- 12.2.5.49 Comment less-than sign bang dash dash state
doCommentLessThanBangDashDash :: Lexer s -> ST s ()
doCommentLessThanBangDashDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen ||
       c == chrEOF -> do
         backWord x
         doCommentEnd x
     | otherwise -> do
         parseError x "nested-comment"
         backWord x
         doCommentEnd x

-- 12.2.5.50 Comment end dash state
doCommentEndDash :: Lexer s -> ST s ()
doCommentEndDash x@Lexer {..} = do
  c <- nextWord x
  if | c == chrHyphen -> do
         doCommentEnd x
     | c == chrEOF -> do
         parseError x "eof-in-comment"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCommentAppend chrHyphen lexerToken
         backWord x
         doComment x

-- 12.2.5.51 Comment end state
doCommentEnd :: Lexer s -> ST s ()
doCommentEnd x@Lexer {..} = do
  c <- nextWord x
  if | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrExclamation -> do
         doCommentEndBang x
     | c == chrHyphen -> do
         tokenCommentAppend chrHyphen lexerToken
         doCommentEnd x
     | c == chrEOF -> do
         parseError x "eof-in-comment"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCommentAppend chrHyphen lexerToken
         tokenCommentAppend chrHyphen lexerToken
         backWord x
         doComment x

-- 12.2.5.52 Comment end bang state
doCommentEndBang :: Lexer s -> ST s ()
doCommentEndBang x@Lexer {..} = do
  c <- nextWord x
  if | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrHyphen -> do
         tokenCommentAppend chrHyphen lexerToken
         tokenCommentAppend chrHyphen lexerToken
         tokenCommentAppend chrExclamation lexerToken
         doCommentEndDash x
     | c == chrGreater -> do
         parseError x "incorrectly-closed-comment"
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-comment"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenCommentAppend chrHyphen lexerToken
         tokenCommentAppend chrHyphen lexerToken
         tokenCommentAppend chrExclamation lexerToken
         backWord x
         doComment x

-- 12.2.5.53 DOCTYPE state
doDoctype :: Lexer s -> ST s ()
doDoctype x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeDoctypeName x
     | c == chrGreater -> do
         backWord x
         doBeforeDoctypeName x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeInit lexerToken
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "missing-whitespace-before-doctype-name"
         backWord x
         doBeforeDoctypeName x

-- 12.2.5.54 Before DOCTYPE name state
doBeforeDoctypeName :: Lexer s -> ST s ()
doBeforeDoctypeName x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeDoctypeName x
     | chrASCIIUpperAlpha c -> do
         tokenDoctypeInit lexerToken
         tokenDoctypeNameAppend (chrToLower c) lexerToken
         doDoctypeName x
     | c == chrGreater -> do
         parseError x "missing-doctype-name"
         tokenDoctypeInit lexerToken
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeInit lexerToken
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenDoctypeInit lexerToken
         tokenDoctypeNameAppend c lexerToken
         doDoctypeName x

-- 12.2.5.55 DOCTYPE name state
doDoctypeName :: Lexer s -> ST s ()
doDoctypeName x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doAfterDoctypeName x
     | c == chrGreater -> do
         state x StateData
         emit x
     | chrASCIIUpperAlpha c -> do
         tokenDoctypeNameAppend (chrToLower c) lexerToken
         doDoctypeName x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenDoctypeNameAppend c lexerToken
         doDoctypeName x

-- 12.2.5.56 After DOCTYPE name state
doAfterDoctypeName :: Lexer s -> ST s ()
doAfterDoctypeName x@Lexer {..} = do
  c <- nextWord x
  -- Get data indexer after the character.
  f <- dataIndexer x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doAfterDoctypeName x
     | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | (c == 0x50 || c == 0x70) &&        -- P or p
       (f 0 == 0x55 || f 0 == 0x75) &&    -- U or u
       (f 1 == 0x42 || f 1 == 0x62) &&    -- B or b
       (f 2 == 0x4C || f 2 == 0x6C) &&    -- L or l
       (f 3 == 0x49 || f 3 == 0x69) &&    -- I or i
       (f 4 == 0x43 || f 4 == 0x63) -> do -- C or c
         skipWord x 5
         doAfterDoctypePublicKeyword x
     | (c == 0x53 || c == 0x73) &&        -- S or s
       (f 0 == 0x59 || f 0 == 0x79) &&    -- Y or y
       (f 1 == 0x53 || f 1 == 0x73) &&    -- S or s
       (f 2 == 0x54 || f 2 == 0x74) &&    -- T or t
       (f 3 == 0x45 || f 3 == 0x65) &&    -- E or e
       (f 4 == 0x4D || f 4 == 0x6D) -> do -- M or m
         skipWord x 5
         doAfterDoctypeSystemKeyword x
     | otherwise -> do
         parseError x "invalid-character-sequence-after-doctype-name"
         tokenDoctypeSetForceQuirks lexerToken
         doBogusDoctype x

-- 12.2.5.57 After DOCTYPE public keyword state
doAfterDoctypePublicKeyword :: Lexer s -> ST s ()
doAfterDoctypePublicKeyword x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeDoctypePublicId x
     | c == chrQuote -> do
         parseError x "missing-whitespace-after-doctype-public-keyword"
         tokenDoctypePublicIdInit lexerToken
         doDoctypePublicIdDoubleQuoted x
     | c == chrApostrophe -> do
         parseError x "missing-whitespace-after-doctype-public-keyword"
         tokenDoctypePublicIdInit lexerToken
         doDoctypePublicIdSingleQuoted x
     | c == chrGreater -> do
         parseError x "missing-doctype-public-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "missing-quote-before-doctype-public-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         doBogusDoctype x

-- 12.2.5.58 Before DOCTYPE public identifier state
doBeforeDoctypePublicId :: Lexer s -> ST s ()
doBeforeDoctypePublicId x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeDoctypePublicId x
     | c == chrQuote -> do
         tokenDoctypePublicIdInit lexerToken
         doDoctypePublicIdDoubleQuoted x
     | c == chrApostrophe -> do
         tokenDoctypePublicIdInit lexerToken
         doDoctypePublicIdSingleQuoted x
     | c == chrGreater -> do
         parseError x "missing-doctype-public-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "missing-quote-before-doctype-public-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         doBogusDoctype x

-- 12.2.5.59 DOCTYPE public identifier (double-quoted) state
doDoctypePublicIdDoubleQuoted :: Lexer s -> ST s ()
doDoctypePublicIdDoubleQuoted x@Lexer {..} = do
  c <- nextWord x
  if | c == chrQuote -> do
         doAfterDoctypePublicId x
     | c == chrGreater -> do
         parseError x "abrupt-doctype-public-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenDoctypePublicIdAppend c lexerToken
         doDoctypePublicIdDoubleQuoted x

-- 12.2.5.60 DOCTYPE public identifier (single-quoted) state
doDoctypePublicIdSingleQuoted :: Lexer s -> ST s ()
doDoctypePublicIdSingleQuoted x@Lexer {..} = do
  c <- nextWord x
  if | c == chrApostrophe -> do
         doAfterDoctypePublicId x
     | c == chrGreater -> do
         parseError x "abrupt-doctype-public-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenDoctypePublicIdAppend c lexerToken
         doDoctypePublicIdSingleQuoted x

-- 12.2.5.61 After DOCTYPE public identifier state
doAfterDoctypePublicId :: Lexer s -> ST s ()
doAfterDoctypePublicId x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBetweenDoctypePublicAndSystem x
     | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrQuote -> do
         parseError x "missing-whitespace-between-doctype-public-and-system-identifiers"
         tokenDoctypeSystemIdInit lexerToken
         doDoctypeSystemIdDoubleQuoted x
     | c == chrApostrophe -> do
         parseError x "missing-whitespace-between-doctype-public-and-system-identifiers"
         tokenDoctypeSystemIdInit lexerToken
         doDoctypeSystemIdSingleQuoted x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "missing-quote-before-doctype-system-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         doBogusDoctype x

-- 12.2.5.62 Between DOCTYPE public and system identifiers state
doBetweenDoctypePublicAndSystem :: Lexer s -> ST s ()
doBetweenDoctypePublicAndSystem x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBetweenDoctypePublicAndSystem x
     | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrQuote -> do
         tokenDoctypeSystemIdInit lexerToken
         doDoctypeSystemIdDoubleQuoted x
     | c == chrApostrophe -> do
         tokenDoctypeSystemIdInit lexerToken
         doDoctypeSystemIdSingleQuoted x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "missing-quote-before-doctype-system-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         doBogusDoctype x

-- 12.2.5.63 After DOCTYPE system keyword state
doAfterDoctypeSystemKeyword :: Lexer s -> ST s ()
doAfterDoctypeSystemKeyword x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeDoctypeSystemId x
     | c == chrQuote -> do
         parseError x "missing-whitespace-after-doctype-system-keyword"
         tokenDoctypeSystemIdInit lexerToken
         doDoctypeSystemIdDoubleQuoted x
     | c == chrApostrophe -> do
         parseError x "missing-whitespace-after-doctype-system-keyword"
         tokenDoctypeSystemIdInit lexerToken
         doDoctypeSystemIdSingleQuoted x
     | c == chrGreater -> do
         parseError x "missing-doctype-system-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "missing-quote-before-doctype-system-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         doBogusDoctype x

-- 12.2.5.64 Before DOCTYPE system identifier state
doBeforeDoctypeSystemId :: Lexer s -> ST s ()
doBeforeDoctypeSystemId x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doBeforeDoctypeSystemId x
     | c == chrQuote -> do
         tokenDoctypeSystemIdInit lexerToken
         doDoctypeSystemIdDoubleQuoted x
     | c == chrApostrophe -> do
         tokenDoctypeSystemIdInit lexerToken
         doDoctypeSystemIdSingleQuoted x
     | c == chrGreater -> do
         parseError x "missing-doctype-system-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "missing-quote-before-doctype-system-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         doBogusDoctype x

-- 12.2.5.65 DOCTYPE system identifier (double-quoted) state
doDoctypeSystemIdDoubleQuoted :: Lexer s -> ST s ()
doDoctypeSystemIdDoubleQuoted x@Lexer {..} = do
  c <- nextWord x
  if | c == chrQuote -> do
         doAfterDoctypeSystemId x
     | c == chrGreater -> do
         parseError x "abrupt-doctype-system-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenDoctypeSystemIdAppend c lexerToken
         doDoctypeSystemIdDoubleQuoted x

-- 12.2.5.66 DOCTYPE system identifier (single-quoted) state
doDoctypeSystemIdSingleQuoted :: Lexer s -> ST s ()
doDoctypeSystemIdSingleQuoted x@Lexer {..} = do
  c <- nextWord x
  if | c == chrApostrophe -> do
         doAfterDoctypeSystemId x
     | c == chrGreater -> do
         parseError x "abrupt-doctype-system-identifier"
         tokenDoctypeSetForceQuirks lexerToken
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         tokenDoctypeSystemIdAppend c lexerToken
         doDoctypeSystemIdSingleQuoted x

-- 12.2.5.67 After DOCTYPE system identifier state
doAfterDoctypeSystemId :: Lexer s -> ST s ()
doAfterDoctypeSystemId x@Lexer {..} = do
  c <- nextWord x
  if | c == chrTab ||
       c == chrLF ||
       c == chrFF ||
       c == chrSpace -> do
         doAfterDoctypeSystemId x
     | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrEOF -> do
         parseError x "eof-in-doctype"
         tokenDoctypeSetForceQuirks lexerToken
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         parseError x "unexpected-character-after-doctype-system-identifier"
         doBogusDoctype x

-- 12.2.5.68 Bogus DOCTYPE state
doBogusDoctype :: Lexer s -> ST s ()
doBogusDoctype x@Lexer {..} = do
  c <- nextWord x
  if | c == chrGreater -> do
         state x StateData
         emit x
     | c == chrEOF -> do
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         doBogusDoctype x

-- 12.2.5.69 CDATA section state
doCDATASection :: Lexer s -> ST s ()
doCDATASection x@Lexer {..} = do
  c <- nextWord x
  if | c == chrBracketRight -> do
         doCDATASectionBracket x
     | c == chrEOF -> do
         parseError x "eof-in-cdata"
         tokenEOFInit lexerToken
         emit x
     | otherwise -> do
         emitChar x c
         doCDATASection x

-- 12.2.5.70 CDATA section bracket state
doCDATASectionBracket :: Lexer s -> ST s ()
doCDATASectionBracket x@Lexer {..} = do
  c <- nextWord x
  if | c == chrBracketRight -> do
         doCDATASectionEnd x
     | otherwise -> do
         tokenCharInit chrBracketRight lexerToken
         backWord x
         doCDATASection x

-- 12.2.5.71 CDATA section end state
doCDATASectionEnd :: Lexer s -> ST s ()
doCDATASectionEnd x@Lexer {..} = do
  c <- nextWord x
  if | c == chrBracketRight -> do
         emitChar x c
         doCDATASectionEnd x
     | c == chrGreater -> do
         doData x
     | otherwise -> do
         emitChar x chrBracketRight
         emitChar x chrBracketRight
         backWord x
         doCDATASection x

-- 12.2.5.72 Character reference state
doCharacterReference :: Lexer s -> ST s ()
doCharacterReference x@Lexer {..} = do
  bufferReset lexerBuffer
  bufferAppend chrAmpersand lexerBuffer
  c <- nextWord x
  if | lexerIgnore -> do
         flushCodePoints x
         backWord x
         returnState x
     | chrASCIIAlphanumeric c -> do
         backWord x
         doNamedCharacterReference x
     | c == chrNumberSign -> do
         bufferAppend c lexerBuffer
         doNumericCharacterReference x
     | otherwise -> do
         flushCodePoints x
         backWord x
         returnState x

-- 12.2.5.73 Named character reference state
doNamedCharacterReference :: Lexer s -> ST s ()
doNamedCharacterReference x@Lexer {..} = do
  o <- rref lexerOffset
  case entityMatch (bsDrop o lexerData) of
    Just (p, v, _) -> do
      skipWord x $ bsLen p
      forM_ (bsUnpack p) $
        flip bufferAppend lexerBuffer
      attr <- consumingAttibute x
      semi <- pure $ bsLast p == Just chrSemicolon
      c    <- peekWord x
      if | attr
         , not semi
         , c == chrSemicolon || chrASCIIAlphanumeric c -> do
             flushCodePoints x
             returnState x
         | otherwise -> do
             when (not semi) $
               parseError x "missing-semicolon-after-character-reference"
             bufferReset lexerBuffer
             forM_ (bsUnpack v) $
               flip bufferAppend lexerBuffer
             flushCodePoints x
             returnState x
    Nothing -> do
      flushCodePoints x
      doAmbiguousAmpersand x

-- 12.2.5.74 Ambiguous ampersand state
doAmbiguousAmpersand :: Lexer s -> ST s ()
doAmbiguousAmpersand x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIAlphanumeric c -> do
         consumingAttibute x >>= \case
           True  -> tokenAttrValAppend c lexerToken
           False -> emitChar x c
         doAmbiguousAmpersand x
     | c == chrSemicolon -> do
         parseError x "unknown-named-character-reference"
         backWord x
         returnState x
     | otherwise -> do
         backWord x
         returnState x

-- 12.2.5.75 Numeric character reference state
doNumericCharacterReference :: Lexer s -> ST s ()
doNumericCharacterReference x@Lexer {..} = do
  wref lexerCode 0
  c <- nextWord x
  if | c == chrUpperX || c == chrLowerX -> do
         bufferAppend c lexerBuffer
         doHexCharacterReferenceStart x
     | otherwise -> do
         backWord x
         doDecimalCharacterReference x

-- 12.2.5.76 Hexademical character reference start state
doHexCharacterReferenceStart :: Lexer s -> ST s ()
doHexCharacterReferenceStart x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIHexDigit c -> do
         backWord x
         doHexCharacterReference x
     | otherwise -> do
         parseError x "absence-of-digits-in-numeric-character-reference"
         flushCodePoints x
         backWord x
         returnState x

-- 12.2.5.77 Decimal character reference start state
doDecimalCharacterReferenceStart :: Lexer s -> ST s ()
doDecimalCharacterReferenceStart x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIDigit c -> do
         backWord x
         doDecimalCharacterReference x
     | otherwise -> do
         parseError x "absence-of-digits-in-numeric-character-reference"
         flushCodePoints x
         backWord x
         returnState x

-- 12.2.5.78 Hexademical character reference state
doHexCharacterReference :: Lexer s -> ST s ()
doHexCharacterReference x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIDigit c -> do
         uref lexerCode $ \y -> 16 * y + (fromIntegral c - 0x30)
         doHexCharacterReference x
     | chrASCIIUpperHexDigit c -> do
         uref lexerCode $ \y -> 16 * y + (fromIntegral c - 0x37)
         doHexCharacterReference x
     | chrASCIILowerHexDigit c -> do
         uref lexerCode $ \y -> 16 * y + (fromIntegral c - 0x57)
         doHexCharacterReference x
     | c == chrSemicolon -> do
         doNumericCharacterReferenceEnd x
     | otherwise -> do
         parseError x "missing-semicolon-after-character-reference"
         backWord x
         doNumericCharacterReferenceEnd x

-- 12.2.5.79 Decimal character reference state
doDecimalCharacterReference :: Lexer s -> ST s ()
doDecimalCharacterReference x@Lexer {..} = do
  c <- nextWord x
  if | chrASCIIDigit c -> do
         uref lexerCode $ \y -> 10 * y + (fromIntegral c - 0x30)
         doDecimalCharacterReference x
     | c == chrSemicolon -> do
         doNumericCharacterReferenceEnd x
     | otherwise -> do
         parseError x "missing-semicolon-after-character-reference"
         backWord x
         doNumericCharacterReferenceEnd x

-- 12.2.5.80 Numeric character reference end state
doNumericCharacterReferenceEnd :: Lexer s -> ST s ()
doNumericCharacterReferenceEnd x@Lexer {..} = do
  c <- rref lexerCode
  let n = fromIntegral c
  if | c == 0 -> do
         parseError x "null-character-reference"
         wref lexerCode 0xFFFD
     | c > 0x10FFFF -> do
         parseError x "character-reference-outside-unicode-range"
         wref lexerCode 0xFFFD
     | chrSurrogate c -> do
         parseError x "surrogate-character-reference"
         wref lexerCode 0xFFFD
     | chrNonCharacter c -> do
         parseError x "noncharacter-character-reference"
     | c == 0x0D || (chrWord8 c && chrControl n && not (chrWhitespace n)) -> do
         parseError x "control-character-reference"
         whenJust (Map.lookup c codeMap) $ wref lexerCode
     | otherwise ->
         pure ()
  bufferReset lexerBuffer
  forM_ (chrUTF8 c) $ flip bufferAppend lexerBuffer
  flushCodePoints x
  returnState x

-- | Character code map.
codeMap :: Map Int Int
codeMap = Map.fromList
  [ (0x80, 0x20AC)
  , (0x82, 0x201A)
  , (0x83, 0x0192)
  , (0x84, 0x201E)
  , (0x85, 0x2026)
  , (0x86, 0x2020)
  , (0x87, 0x2021)
  , (0x88, 0x02C6)
  , (0x89, 0x2030)
  , (0x8A, 0x0160)
  , (0x8B, 0x2039)
  , (0x8C, 0x0152)
  , (0x8E, 0x017D)
  , (0x91, 0x2018)
  , (0x92, 0x2019)
  , (0x93, 0x201C)
  , (0x94, 0x201D)
  , (0x95, 0x2022)
  , (0x96, 0x2013)
  , (0x97, 0x2014)
  , (0x98, 0x02DC)
  , (0x99, 0x2122)
  , (0x9A, 0x0161)
  , (0x9B, 0x203A)
  , (0x9C, 0x0153)
  , (0x9E, 0x017E)
  , (0x9F, 0x0178)
  ]