{-# LANGUAGE Trustworthy #-}

{-|
Description:    Tokenization rules for characters comprising character references.

Copyright:      (c) 2020-2021 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      stable
Portability:    portable

The monadic parser construction and prefix-preferring 'String' concatenation
mean that character references are resolved opposite the algorithm described in
the __[HTML](https://html.spec.whatwg.org/multipage/parsing.html)__
specification.  That is most notable in the numeric references, where the spec
recommends multiplying an accumulator by the numeric base before adding each
digit, while this implementation multiplies each digit by the positional value
before adding the accumulator.  The reversal in named references is more
conceptual and subtle, but the spec is worded to read a name from the input
while the implementation reads input 'Char's according to the list of reference
names; this is likely how most implementations accomplish it.
-}
module Web.Mangrove.Parse.Tokenize.Character
    ( flushCharRef
    , tokenCharacterReference
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad as N

import qualified Data.Bifunctor as F.B
import qualified Data.Char as C
import qualified Data.HashMap.Strict as M
import qualified Data.Maybe as Y
import qualified Data.Vector as V
import qualified Data.Word as W

import qualified Numeric.Natural as Z

import Data.Functor ( ($>) )
import Data.Vector ( (!?) )

import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Common.Character
import Web.Mangrove.Parse.Tokenize.Common
import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Util

import {-# SOURCE #-} Web.Mangrove.Parse.Tokenize.Dispatcher


-- | __HTML:__
--      @[flush code points consumed as a character reference]
--      (https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference)@
-- 
-- Transform a wrapped 'Char' sequence into a sequence of wrapped 'Character'
-- tokens, one per 'Char'.
-- 
-- Any errors carried by the input are attached to the first token emitted, and
-- the input's decoder state is kept only on the last one; every earlier token
-- has its state cleared to 'Nothing'.  If the input holds no characters but
-- does carry errors, the errors are instead passed, via 'consTokenErrorsList',
-- to whatever the @dispatcher@ produces next (or to an empty list, if the
-- @dispatcher@ fails).
flushCharRef :: TokenizerOutput String -> Tokenizer [TokenizerOutput Token]
flushCharRef out = case tokenizedOut out of
    "" -> case tokenizedErrs out of
        -- Nothing to emit, and nothing to report.
        [] -> return []
        -- No character is available to carry the errors, so they are handed
        -- to the output of the next dispatch instead of being dropped.
        errs -> consTokenErrorsList errs . Y.fromMaybe [] <$> A.optional dispatcher
    -- Final (or only) character: retains every other field of the wrapper,
    -- including the decoder state.
    [c] -> return [out { tokenizedOut = Character c }]
    (c:_) -> do
        let c' = out
                { tokenizedOut = Character c
                , tokenizedState = Nothing
                }
        cs' <- flushCharRef out'
        return $ c' : cs'
  where -- The remaining characters, with the errors stripped so that they are
        -- only reported once (on the leading token).
        out' = out
            { tokenizedErrs = []
            , tokenizedOut = drop 1 $ tokenizedOut out
            }


-- | __HTML:__
--      @[character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state)@
-- 
-- The parsing instructions for after reading @"&"@ in a section of the state
-- machine which allows character references.
-- 
-- The wrapped 'String' is either the resolved replacement text, or the literal
-- characters (beginning with @"&"@) which should be emitted unchanged because
-- no reference could be resolved.
tokenCharacterReference
    :: Bool
        -- ^ Whether this parser was called from within a markup tag's attribute.
    -> Tokenizer (TokenizerOutput String)
tokenCharacterReference inAttribute = tokenizer (Just ([], "&"))
    [ ifPush_ isAsciiAlphaNum $ do
        ref' <- tokenNamedCharacterReference inAttribute characterReferences
        case tokenizedOut ref' of
            -- No name matched: emit the ampersand followed by whatever the
            -- ambiguous ampersand state reads, keeping the lookup's errors.
            NotFound ->
                consTokenErrors (tokenizedErrs ref') . consOut '&'
                    <$> tokenAmbiguousAmpersand
            -- A name matched but must be emitted literally.
            Flush name -> return ref'
                { tokenizedOut = '&' : name
                }
            -- A name matched and was resolved to its replacement text.
            Found ref -> return ref'
                { tokenizedOut = ref
                }
    , if_ (== '#') tokenNumericCharacterReference
    , elsePush_ $ packToken ([], "&")
    ]

-- | The result of looking up a named character reference.
data CharacterReference
    = NotFound
        -- ^ No matching name found.
    | Flush String
        -- ^ A named reference was found, but historical reasons require
        -- emitting the name as characters anyway.  The payload holds the
        -- characters of the name /without/ the leading @"&"@, which
        -- 'tokenCharacterReference' prepends.
    | Found String
        -- ^ A named reference was found and successfully resolved.  The
        -- payload is the replacement text to emit in place of the reference.

-- | __HTML:__
--      @[named character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state)@
-- 
-- The parsing instructions for after reading @"&"@ followed by letters and/or
-- numbers in a section of the state machine which allows character references.
-- 
-- Each call consumes at most one character of the name (plus, possibly, a
-- terminating semicolon) and recurses on the subtree selected by that
-- character, so that the longest matching name takes precedence.  Whenever the
-- result is 'NotFound', the character read by this call has been pushed back
-- onto the input.
tokenNamedCharacterReference
    :: Bool
        -- ^ Whether this parser was called from within a markup tag's attribute.
    -> CharacterReferenceTree
        -- ^ The list of reference names, filtered according to what prefix has
        -- been already encountered.
    -> Tokenizer (TokenizerOutput CharacterReference)
tokenNamedCharacterReference inAttribute (CharacterReferenceTree refs) = do
    cm' <- A.optional next
    let -- Decoder state to report when there was no character to read.
        state1 = endState
        -- Decoder state as of the character just read, if there was one.
        state2 = do
            cm <- cm'
            decodedState cm
    case cm' of
        Nothing -> packState ([], NotFound) state1
        Just c -> case M.lookup (decodedOut c) refs of
            -- No reference found, and no further references possible.
            Nothing -> push c *> packState ([], NotFound) state2
            -- No reference found, but longer potential reference names exist.
            Just (Nothing, refs') -> do
                ref' <- tokenNamedCharacterReference inAttribute refs'
                let errs' = decodedErrs c ++ tokenizedErrs ref'
                case tokenizedOut ref' of
                    -- None of those longer names match.
                    NotFound -> push c *> packState ([], NotFound) state2
                    -- Fall back on the longer name.
                    Flush name -> return ref'
                        { tokenizedErrs = errs'
                        , tokenizedOut = Flush $ decodedOut c : name
                        }
                    Found ref -> return ref'
                        { tokenizedErrs = errs'
                        , tokenizedOut = Found ref
                        }
            -- Reference found at the current name.
            Just (Just found, refs') -> do
                semicolon <- A.optional $ next >>= satisfying (\d -> decodedOut d == ';')
                let -- Prefer the state following the semicolon, if one was read.
                    state3 = maybe state2 decodedState semicolon
                if not (isSemicolonOptional found) && Y.isNothing semicolon
                    -- This name is only valid with a semicolon, which is
                    -- missing; the only remaining hope is a longer name.
                    then tokenNamedCharacterReference inAttribute refs' >>= \ref ->
                        case tokenizedOut ref of
                            NotFound -> push c $> ref
                            _ -> return ref
                    -- The name is usable as it stands, but a longer match
                    -- still takes precedence if one exists.
                    else tokenNamedCharacterReference inAttribute refs' >>=
                        foundNamedCharacterReference
                            inAttribute
                            (Y.isJust semicolon)
                            (decodedOut c)
                            state3
                            (referenceValue found)

-- | Perform the logistics around determining what 'Character'(s) should be
-- emitted, based on the current environment and any longer match.
-- 
-- A longer match always wins: if the continued lookup resolved to 'Found' it
-- is returned untouched, and if it resolved to 'Flush' the current character
-- is prepended to the flushed name.  Only when the continued lookup is
-- 'NotFound' is the reference ending at the current character used.
foundNamedCharacterReference
    :: Bool
        -- ^ Whether the reference is being evaluated as part of an attribute value.
    -> Bool
        -- ^ Whether the last character matched is a semicolon.
    -> Char
        -- ^ The last character of the reference, /excluding/ the semicolon.
    -> DecoderOutputState
        -- ^ Remainder of the binary document stream after the last character matched.
    -> String
        -- ^ The evaluated sequence represented by the name.
    -> TokenizerOutput CharacterReference
        -- ^ The value returned by continuing the lookup on a longer string.
    -> Tokenizer (TokenizerOutput CharacterReference)
foundNamedCharacterReference attribute semicolon char state found ref = case tokenizedOut ref of
    NotFound -> if attribute && not semicolon
        then do
            -- Within an attribute, an unterminated reference directly followed
            -- by @'='@ or an ASCII alphanumeric is left as literal text.
            c' <- A.optional $ next >>= satisfying
                (\c -> decodedOut c == '=' || isAsciiAlphaNum (decodedOut c))
            case c' of
                Nothing -> packState ([MissingSemicolonAfterCharacterReference], Found found) state
                -- The lookahead character is not part of the reference, so it
                -- is returned to the input before flushing.
                Just c -> push c *> packState ([], Flush [char]) state
        else packState (errs', Found found) state
    Flush name -> return $ ref
        { tokenizedOut = Flush $ char : name
        }
    _ -> return ref
  where -- A resolved reference is only an error if the semicolon was omitted.
        errs'
            | semicolon = []
            | otherwise = [MissingSemicolonAfterCharacterReference]

-- | __HTML:__
--      @[ambiguous ampersand state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state)@
--
-- The parsing instructions for after reading @"&"@ followed by a string which
-- does not correspond to any known reference name, in a section of the state
-- machine which allows character references.
--
-- The output is the run of ASCII alphanumeric characters found at the head of
-- the input, in their original order; it is empty if the very first character
-- is not alphanumeric.
tokenAmbiguousAmpersand :: Tokenizer (TokenizerOutput String)
tokenAmbiguousAmpersand = tokenizer (Just ([], ""))
    -- NOTE(review): the first argument to 'tokenizer' looks like the result to
    -- use when no further input is available -- confirm against its definition.
    [ -- Keep the alphanumeric character, and prepend it to whatever the
      -- recursive call collects from the rest of the input.
      ifChar isAsciiAlphaNum $ \c -> consOut c <$> tokenAmbiguousAmpersand
      -- A semicolon closing an unrecognized name is reported as an error, but
      -- contributes nothing to the output; the @Push@ variants presumably
      -- return the character to the input for the next state to reconsume.
    , ifPush_ (== ';') $ packToken ([UnknownNamedCharacterReference], "")
      -- Anything else silently ends the run.
    , elsePush_ $ packToken ([], "")
    ]

-- | __HTML:__
--      @[numeric character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state)@
--
-- The parsing instructions for after reading @"&#"@ in a section of the state
-- machine which allows character references.
--
-- The inner states report either the characters consumed while failing to
-- read a number ('Left') or the numeric value that was read ('Right').  A
-- failure is returned to the output stream as literal text by restoring the
-- @"&#"@ prefix (and the @"x"@ or @"X"@ marker, where one was consumed), while
-- a success is passed through 'tokenNumericCharacterReferenceEnd' and emitted
-- as a single-character 'String'.
tokenNumericCharacterReference :: Tokenizer (TokenizerOutput String)
tokenNumericCharacterReference = fmap (either ("&#" ++) id) <$> tokenizer
        (Just ([AbsenceOfDigitsInNumericCharacterReference], Left ""))
    [ hexadecimal 'x'
    , hexadecimal 'X'
      -- No base marker, so the character is handed back (presumably pushed
      -- onto the input again) for the decimal states to examine.
    , elsePush_ $ flatten . fmap (F.B.second tokenNumericCharacterReferenceEnd) <$>
        tokenDecimalCharacterReferenceStart
    ]
  where -- Both spellings of the hexadecimal marker behave identically, except
        -- that a failed reference needs to reproduce the exact character which
        -- was consumed.
        hexadecimal marker = if_ (== marker) $
            flatten . fmap (F.B.bimap (marker :) tokenNumericCharacterReferenceEnd) <$>
                tokenHexadecimalCharacterReferenceStart
        -- Move the errors generated while resolving the code point out of the
        -- payload and into the token wrapper, leaving just the resolved
        -- character (as a singleton list) behind.
        flatten tok = case tokenizedOut tok of
            Left str -> tok { tokenizedOut = Left str }
            Right (errs, c) -> consTokenErrors errs $ tok { tokenizedOut = Right [c] }

-- | __HTML:__
--      @[hexadecimal character reference start state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state)@
--
-- The parsing instructions for after reading @"&#x"@ or @"&#X"@ in a section
-- of the state machine which allows character references.
--
-- This state only checks that at least one hexadecimal digit follows; the
-- digits themselves are read by 'tokenHexadecimalCharacterReference'.
tokenHexadecimalCharacterReferenceStart
    :: Tokenizer (TokenizerOutput (Either String Z.Natural))
        -- ^ The inner data contains a theoretically-Unicode code point
        -- ('Right', though it may exceed the upper bound) or the characters
        -- consumed in reading something invalid ('Left'), as relevant.
tokenHexadecimalCharacterReferenceStart = tokenizer
        (Just ([AbsenceOfDigitsInNumericCharacterReference], Left ""))
    [ -- The digit is only tested here; the @Push@ variant presumably returns
      -- it to the input so that the digit loop can consume it.
      ifPush_ C.isHexDigit $ fmap packReference tokenHexadecimalCharacterReference
      -- Nothing beyond the marker was consumed by this state, so the list of
      -- characters to restore is empty.
    , elsePush_ $ packToken ([AbsenceOfDigitsInNumericCharacterReference], Left "")
    ]

-- | __HTML:__
--      @[decimal character reference start state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state)@
--
-- The parsing instructions for after reading @"&#"@ which is not then followed
-- by @"x"@ or @"X"@, in a section of the state machine which allows character
-- references.
--
-- This state only checks that at least one digit from @0@ to @9@ follows; the
-- digits themselves are read by 'tokenDecimalCharacterReference'.
tokenDecimalCharacterReferenceStart
    :: Tokenizer (TokenizerOutput (Either String Z.Natural))
        -- ^ The inner data contains a theoretically-Unicode code point
        -- ('Right', though it may exceed the upper bound) or the characters
        -- consumed in reading something invalid ('Left'), as relevant.
tokenDecimalCharacterReferenceStart = tokenizer
        (Just ([AbsenceOfDigitsInNumericCharacterReference], Left ""))
    [ -- The digit is only tested here; the @Push@ variant presumably returns
      -- it to the input so that the digit loop can consume it.
      ifPush_ C.isDigit $ fmap packReference tokenDecimalCharacterReference
      -- Nothing was consumed by this state, so the list of characters to
      -- restore is empty.
    , elsePush_ $ packToken ([AbsenceOfDigitsInNumericCharacterReference], Left "")
    ]

-- | Extract the Unicode-ish code point calculated by the numeric character
-- reference loops, and repack it as output by the respective initializer.
--
-- The digit count ('fst') is only needed while the number is being
-- accumulated, and so is dropped; the value itself ('snd') is always wrapped
-- as a success ('Right').
packReference :: TokenizerOutput (Z.Natural, Z.Natural) -> TokenizerOutput (Either String Z.Natural)
packReference = fmap $ Right . snd

-- | __HTML:__
--      @[hexadecimal character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state)@
--
-- The parsing instructions for after reading @"&#x"@ or @"&#X"@ followed by a
-- hexadecimal digit in a section of the state machine which allows character
-- references.
--
-- The number is assembled from the right: the recursive call first resolves
-- everything following the current digit, and that digit is then scaled by
-- its positional value and added to the result.
tokenHexadecimalCharacterReference
    :: Tokenizer (TokenizerOutput (Z.Natural, Z.Natural))
        -- ^ The inner data contains the number of valid numeric digits to the
        -- right ('fst') and the number composed from those digits ('snd').
        --
        -- The datatype has been chosen so that even extremely long references
        -- (over a gigabyte of digits /at minimum/ with 'Word') have no chance
        -- of causing system-dependent behaviour; @&#x20;@ and @&#x20..0;@ being
        -- equivalent when the second has 2^30 @0@s (among other alignments)
        -- could potentially allow some obscure attack.
tokenHexadecimalCharacterReference = tokenizer
        (Just ([MissingSemicolonAfterCharacterReference], (0, 0)))
    [ -- Each offset is the code point which maps onto a digit value of zero:
      -- @0x30@ is @\'0\'@, while @0x37@ and @0x57@ sit ten below @\'A\'@
      -- (@0x41@) and @\'a\'@ (@0x61@) respectively.
      ifChar C.isDigit $ \c -> increment 0x30 c <$> tokenHexadecimalCharacterReference
    , ifChar (range 'A' 'F') $ \c -> increment 0x37 c <$> tokenHexadecimalCharacterReference
    , ifChar (range 'a' 'f') $ \c -> increment 0x57 c <$> tokenHexadecimalCharacterReference
      -- The end of the number provides the base case for the accumulation: no
      -- digits to the right, and so a value of zero.
    , if_ (== ';') $ packToken ([], (0, 0))
    , elsePush_ $ packToken ([MissingSemicolonAfterCharacterReference], (0, 0))
    ]
  where -- @pos@ counts the digits already read to the right of @c@, and so is
        -- also the power of the base by which the value of @c@ must be scaled.
        increment offset c = fmap $ \(pos, accum) ->
            (pos + 1, fromIntegral (fromEnum c - offset) * (16 ^ pos) + accum)

-- | __HTML:__
--      @[decimal character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state)@
-- 
-- The parsing instructions for after reading @"&#"@ followed by a digit from
-- @0@ through @9@ in a section of the state machine which allows character
-- references.
-- 
-- The digits are consumed by direct recursion: the remainder of the reference
-- is parsed first, and only then is the current digit folded into that result,
-- weighted by its positional value.  A terminating @;@ yields an empty result
-- without any errors, while any other character yields an empty result tagged
-- with 'MissingSemicolonAfterCharacterReference'.
tokenDecimalCharacterReference
    :: Tokenizer (TokenizerOutput (Z.Natural, Z.Natural))
        -- ^ The inner data contains the number of valid numeric digits to the
        -- right ('fst') and the number composed from those digits ('snd').
        -- 
        -- The datatype has been chosen so that even extremely long references
        -- (over a gigabyte of digits /at minimum/ with 'Word') have no chance
        -- of causing system-dependent behaviour; @&#10;@ and @&#10..0;@ being
        -- equivalent when the second has 2^30 @0@s (among other alignments)
        -- could potentially allow some obscure attack.
tokenDecimalCharacterReference = tokenizer
        -- NOTE(review): presumably the fallback result when no further input
        -- is available -- confirm against the definition of 'tokenizer'.
        (Just ([MissingSemicolonAfterCharacterReference], (0, 0)))
    [ ifChar C.isDigit $ \c -> increment c <$> tokenDecimalCharacterReference
    , if_ (== ';') $ packToken ([], (0, 0))
      -- NOTE(review): 'elsePush_' presumably returns the offending character
      -- to the input stream -- confirm against its definition.
    , elsePush_ $ packToken ([MissingSemicolonAfterCharacterReference], (0, 0))
    ]
  where -- Fold a single digit into the result obtained from the digits to its
        -- right: @pos@ counts those digits and so doubles as the exponent of
        -- the current digit's positional value.  @0x30@ is the code point of
        -- @'0'@, turning the character into its numeric value.
        increment c = fmap $ \(pos, accum) ->
            (pos + 1, fromIntegral (fromEnum c - 0x30) * (10 ^ pos) + accum)

-- | __HTML:__
--      @[numeric character reference end state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state)@
-- 
-- The instructions for processing a theoretically-Unicode code point into an
-- actually-legal 'Char', after reading a non-digit character following a
-- @"&#"@ in a section of the state machine which allows character references.
-- 
-- The result pairs any parse errors raised by the code point with the
-- character to emit in its place.  The checks are order-sensitive: the null
-- code point is handled by its own equation, so the later C0 control guard
-- never sees it, and the out-of-range guard precedes every conversion to
-- 'Char'.
tokenNumericCharacterReferenceEnd :: Z.Natural -> ([ParseError], Char)
tokenNumericCharacterReferenceEnd 0x00 = ([NullCharacterReference], replacementChar)
tokenNumericCharacterReferenceEnd code
    -- Beyond the last Unicode code point; no 'Char' exists for the value.
    | code > 0x10FFFF =
        ([CharacterReferenceOutsideUnicodeRange code], replacementChar)
    -- Surrogates are reported with the original value but never emitted.
    | range 0xD800 0xDFFF code =
        ([SurrogateCharacterReference char], replacementChar)
    -- Noncharacters are reported but still passed through unchanged.
    | range 0xFDD0 0xFDEF code =
        ([NoncharacterCharacterReference char], char)
    | cMod == 0xFFFE || cMod == 0xFFFF =
        ([NoncharacterCharacterReference char], char)
    -- C0 controls other than tab, line feed, and form feed; the carriage
    -- return (@0x0D@) is therefore reported as well.
    | range 0x00 0x1F code && notElem code [0x09, 0x0A, 0x0C] =
        ([ControlCharacterReference], char)
    -- DEL and the C1 controls: substitute the entry from 'controlReplacement'
    -- if there is one, otherwise pass the control through unchanged.
    | range 0x7F 0x9F code = ([ControlCharacterReference],
        Y.fromMaybe char . N.join $
        controlReplacement !? (fromIntegral code - 0x7F))
    | otherwise = ([], char)
  where -- The code point as a 'Char'.  Bound lazily, and only referenced by
        -- guards following the @0x10FFFF@ check, so the conversion is never
        -- forced for a value outside the range of 'Char'.
        char :: Char
        char = toEnum $ fromIntegral code
        -- The low sixteen bits of the code point, i.e. its position within
        -- whichever plane it belongs to.
        cMod = fromIntegral code :: W.Word16

-- | The specification-defined replacements for the C1 control characters, plus
-- @0x7F@ (DEL).  'Nothing' placeholders are used for controls without a
-- replacement, to allow indexing by the code point minus 0x7F.
controlReplacement :: V.Vector (Maybe Char)
controlReplacement = V.fromList
    [ Nothing       -- 0x7F
    , Just '\x20AC' -- 0x80
    , Nothing       -- 0x81
    , Just '\x201A' -- 0x82
    , Just '\x0192' -- 0x83
    , Just '\x201E' -- 0x84
    , Just '\x2026' -- 0x85
    , Just '\x2020' -- 0x86
    , Just '\x2021' -- 0x87
    , Just '\x02C6' -- 0x88
    , Just '\x2030' -- 0x89
    , Just '\x0160' -- 0x8A
    , Just '\x2039' -- 0x8B
    , Just '\x0152' -- 0x8C
    , Nothing       -- 0x8D
    , Just '\x017D' -- 0x8E
    , Nothing       -- 0x8F
    , Nothing       -- 0x90
    , Just '\x2018' -- 0x91
    , Just '\x2019' -- 0x92
    , Just '\x201C' -- 0x93
    , Just '\x201D' -- 0x94
    , Just '\x2022' -- 0x95
    , Just '\x2013' -- 0x96
    , Just '\x2014' -- 0x97
    , Just '\x02DC' -- 0x98
    , Just '\x2122' -- 0x99
    , Just '\x0161' -- 0x9A
    , Just '\x203A' -- 0x9B
    , Just '\x0153' -- 0x9C
    , Nothing       -- 0x9D
    , Just '\x017E' -- 0x9E
    , Just '\x0178' -- 0x9F
    ]