{-# LANGUAGE Trustworthy #-} {-| Description: Tokenization rules for characters comprising character references. Copyright: (c) 2020-2021 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: stable Portability: portable The monadic parser construction and prefix-preferring 'String' concatenation mean that character references are resolved opposite the algorithm described in the __[HTML](https://html.spec.whatwg.org/multipage/parsing.html)__ specification. That is most notable in the numeric references, where the spec recommends multiplying an accumulator by the numeric base before adding each digit, while this implementation multiplies each digit by the positional value before adding the accumulator. The reversal in named references is more conceptual and subtle, but the spec is worded to read a name from the input while the implementation reads input 'Char's according to the list of reference names; this is likely how most implementations accomplish it. -} module Web.Mangrove.Parse.Tokenize.Character ( flushCharRef , tokenCharacterReference ) where import qualified Control.Applicative as A import qualified Control.Monad as N import qualified Data.Bifunctor as F.B import qualified Data.Char as C import qualified Data.HashMap.Strict as M import qualified Data.Maybe as Y import qualified Data.Vector as V import qualified Data.Word as W import qualified Numeric.Natural as Z import Data.Functor ( ($>) ) import Data.Vector ( (!?) ) import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Common.Character import Web.Mangrove.Parse.Tokenize.Common import Web.Willow.Common.Encoding.Character import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Util import {-# SOURCE #-} Web.Mangrove.Parse.Tokenize.Dispatcher -- | __HTML:__ -- @[flush code points consumed as a character reference] -- (https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference)@ -- -- Transform a wrapped 'Char' sequence into a sequence of wrapped 'Character' -- tokens. flushCharRef :: TokenizerOutput String -> Tokenizer [TokenizerOutput Token] flushCharRef out = case tokenizedOut out of "" -> case tokenizedErrs out of [] -> return [] errs -> consTokenErrorsList errs . Y.fromMaybe [] <$> A.optional dispatcher [c] -> return [out { tokenizedOut = Character c }] (c:_) -> do let c' = out { tokenizedOut = Character c , tokenizedState = Nothing } cs' <- flushCharRef out' return $ c' : cs' where out' = out { tokenizedErrs = [] , tokenizedOut = drop 1 $ tokenizedOut out } -- | __HTML:__ -- @[character reference state] -- (https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state)@ -- -- The parsing instructions for after reading @"&"@ in a section of the state -- machine which allows character references. tokenCharacterReference :: Bool -- ^ Whether this parser was called from within a markup tag's attribute. -> Tokenizer (TokenizerOutput String) tokenCharacterReference inAttribute = tokenizer (Just ([], "&")) [ ifPush_ isAsciiAlphaNum $ do ref' <- tokenNamedCharacterReference inAttribute characterReferences case tokenizedOut ref' of NotFound -> consTokenErrors (tokenizedErrs ref') . consOut '&' <$> tokenAmbiguousAmpersand Flush name -> return ref' { tokenizedOut = '&' : name } Found ref -> return ref' { tokenizedOut = ref } , if_ (== '#') tokenNumericCharacterReference , elsePush_ $ packToken ([], "&") ] -- | The result of looking up a named character reference. data CharacterReference = NotFound -- ^ No matching name found. | Flush String -- ^ A named reference was found, but historical reasons require -- emitting the name as characters anyway. | Found String -- ^ A named reference was found and successfully resolved. -- | __HTML:__ -- @[named character reference state] -- (https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state)@ -- -- The parsing instructions for after reading @"&"@ followed by letters and/or -- numbers in a section of the state machine which allows character references. tokenNamedCharacterReference :: Bool -- ^ Whether this parser was called from within a markup tag's attribute. -> CharacterReferenceTree -- ^ The list of reference names, filtered according to what prefix has -- been already encountered. -> Tokenizer (TokenizerOutput CharacterReference) tokenNamedCharacterReference inAttribute (CharacterReferenceTree refs) = do cm' <- A.optional next let state1 = endState state2 = do cm <- cm' decodedState cm case cm' of Nothing -> packState ([], NotFound) state1 Just c -> case M.lookup (decodedOut c) refs of -- No reference found, and no further references possible. Nothing -> push c *> packState ([], NotFound) state2 -- No reference found, but longer potential reference names exist. Just (Nothing, refs') -> do ref' <- tokenNamedCharacterReference inAttribute refs' let errs' = decodedErrs c ++ tokenizedErrs ref' case tokenizedOut ref' of -- None of those longer names match. NotFound -> push c *> packState ([], NotFound) state2 -- Fall back on the longer name. Flush name -> return ref' { tokenizedErrs = errs' , tokenizedOut = Flush $ decodedOut c : name } Found ref -> return ref' { tokenizedErrs = errs' , tokenizedOut = Found ref } -- Reference found at the current name. Just (Just found, refs') -> do semicolon <- A.optional $ next >>= satisfying (\d -> decodedOut d == ';') let state3 = maybe state2 decodedState semicolon if not (isSemicolonOptional found) && Y.isNothing semicolon then tokenNamedCharacterReference inAttribute refs' >>= \ref -> case tokenizedOut ref of NotFound -> push c $> ref _ -> return ref else tokenNamedCharacterReference inAttribute refs' >>= foundNamedCharacterReference inAttribute (Y.isJust semicolon) (decodedOut c) state3 (referenceValue found) -- | Perform the logistics around determining what 'Character'(s) should be -- emitted, based on the current environment and any longer match. foundNamedCharacterReference :: Bool -- ^ Whether the reference is being evaluated as part of an attribute value. -> Bool -- ^ Whether the last character matched is a semicolon. -> Char -- ^ The last character of the reference, /excluding/ the semicolon. -> DecoderOutputState -- ^ Remainder of the binary document stream after the last character matched. -> String -- ^ The evaluated sequence represented by the name. -> TokenizerOutput CharacterReference -- ^ The value returned by continuing the lookup on a longer string. -> Tokenizer (TokenizerOutput CharacterReference) foundNamedCharacterReference attribute semicolon char state found ref = case tokenizedOut ref of NotFound -> if attribute && not semicolon then do c' <- A.optional $ next >>= satisfying (\c -> decodedOut c == '=' || isAsciiAlphaNum (decodedOut c)) case c' of Nothing -> packState ([MissingSemicolonAfterCharacterReference], Found found) state Just c -> push c *> packState ([], Flush [char]) state else packState (errs', Found found) state Flush name -> return $ ref { tokenizedOut = Flush $ char : name } _ -> return ref where errs' | semicolon = [] | otherwise = [MissingSemicolonAfterCharacterReference] -- | __HTML:__ -- @[ambiguous ampersand state] -- (https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state)@ -- -- The parsing instructions for after reading @"&"@ followed by a string which -- does not correspond to any known reference name, in a section of the state -- machine which allows character references. tokenAmbiguousAmpersand :: Tokenizer (TokenizerOutput String) tokenAmbiguousAmpersand = tokenizer (Just ([], "")) [ ifChar isAsciiAlphaNum $ \c -> consOut c <$> tokenAmbiguousAmpersand , ifPush_ (== ';') $ packToken ([UnknownNamedCharacterReference], "") , elsePush_ $ packToken ([], "") ] -- | __HTML:__ -- @[numeric character reference state] -- (https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state)@ -- -- The parsing instructions for after reading @"&#"@ in a section of the state -- machine which allows character references. tokenNumericCharacterReference :: Tokenizer (TokenizerOutput String) tokenNumericCharacterReference = fmap (either ("&#" ++) id) <$> tokenizer (Just ([AbsenceOfDigitsInNumericCharacterReference], Left "")) [ if_ (== 'x') $ flatten . fmap (F.B.bimap ('x' :) tokenNumericCharacterReferenceEnd) <$> tokenHexadecimalCharacterReferenceStart , if_ (== 'X') $ flatten . fmap (F.B.bimap ('X' :) tokenNumericCharacterReferenceEnd) <$> tokenHexadecimalCharacterReferenceStart , elsePush_ $ flatten . fmap (F.B.second tokenNumericCharacterReferenceEnd) <$> tokenDecimalCharacterReferenceStart ] where flatten tok = case tokenizedOut tok of Left str -> tok { tokenizedOut = Left str } Right (errs, c) -> consTokenErrors errs $ tok { tokenizedOut = Right [c] } -- | __HTML:__ -- @[hexadecimal character reference start state] -- (https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state)@ -- -- The parsing instructions for after reading @"&#x"@ or @"&#X"@ in a section -- of the state machine which allows character references. tokenHexadecimalCharacterReferenceStart :: Tokenizer (TokenizerOutput (Either String Z.Natural)) -- ^ The inner data contains a theoretically-Unicode code point -- ('Right', though it may exceed the upper bound) or the characters -- consumed in reading something invalid ('Left'), as relevant. tokenHexadecimalCharacterReferenceStart = tokenizer (Just ([AbsenceOfDigitsInNumericCharacterReference], Left "")) [ ifPush_ C.isHexDigit $ fmap packReference tokenHexadecimalCharacterReference , elsePush_ $ packToken ([AbsenceOfDigitsInNumericCharacterReference], Left "") ] -- | __HTML:__ -- @[decimal character reference start state] -- (https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state)@ -- -- The parsing instructions for after reading @"&#"@ followed by digit from @0@ -- to @9@ in a section of the state machine which allows character references. tokenDecimalCharacterReferenceStart :: Tokenizer (TokenizerOutput (Either String Z.Natural)) -- ^ The inner data contains a theoretically-Unicode code point -- ('Right', though it may exceed the upper bound) or the characters -- consumed in reading something invalid ('Left'), as relevant. tokenDecimalCharacterReferenceStart = tokenizer (Just ([AbsenceOfDigitsInNumericCharacterReference], Left "")) [ ifPush_ C.isDigit $ fmap packReference tokenDecimalCharacterReference , elsePush_ $ packToken ([AbsenceOfDigitsInNumericCharacterReference], Left "") ] -- | Extract the Unicode-ish code point calculated by the numeric character -- reference loops, and repack it as output by the respective initializer. packReference :: TokenizerOutput (Z.Natural, Z.Natural) -> TokenizerOutput (Either String Z.Natural) packReference = fmap $ Right . snd -- | __HTML:__ -- @[hexadecimal character reference state] -- (https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state)@ -- -- The parsing instructions for after reading @"&#x"@ or @"&#X"@ followed by a -- hexadecimal digit in a section of the state machine which allows character -- references. tokenHexadecimalCharacterReference :: Tokenizer (TokenizerOutput (Z.Natural, Z.Natural)) -- ^ The inner data contains the number of valid numeric digits to the -- right ('fst') and the number composed from those digits ('snd'). -- -- The datatype has been chosen so that even extremely long references -- (over a gigabyte of digits /at minimum/ with 'Word') have no chance -- of causing system-dependant behaviour; @ @ and @ ..0;@ being -- equivalent when the second has 2^30 @0@s (among other alignments) -- could potentially allow some obscure attack. tokenHexadecimalCharacterReference = tokenizer (Just ([MissingSemicolonAfterCharacterReference], (0, 0))) [ ifChar C.isDigit $ \c -> increment 0x30 c <$> tokenHexadecimalCharacterReference , ifChar (range 'A' 'F') $ \c -> increment 0x37 c <$> tokenHexadecimalCharacterReference , ifChar (range 'a' 'f') $ \c -> increment 0x57 c <$> tokenHexadecimalCharacterReference , if_ (== ';') $ packToken ([], (0, 0)) , elsePush_ $ packToken ([MissingSemicolonAfterCharacterReference], (0, 0)) ] where increment offset c = fmap $ \(pos, accum) -> (pos + 1, fromIntegral (fromEnum c - offset) * (16 ^ pos) + accum) -- | __HTML:__ -- @[decimal character reference state] -- (https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state)@ -- -- The parsing instructions for after reading @"&#"@ followed by a digit from -- @0@ through @9@ in a section of the state machine which allows character -- references. tokenDecimalCharacterReference :: Tokenizer (TokenizerOutput (Z.Natural, Z.Natural)) -- ^ The inner data contains the number of valid numeric digits to the -- right ('fst') and the number composed from those digits ('snd'). -- -- The datatype has been chosen so that even extremely long references -- (over a gigabyte of digits /at minimum/ with 'Word') have no chance -- of causing system-dependant behaviour; @ @ and @ ..0;@ being -- equivalent when the second has 2^30 @0@s (among other alignments) -- could potentially allow some obscure attack. tokenDecimalCharacterReference = tokenizer (Just ([MissingSemicolonAfterCharacterReference], (0, 0))) [ ifChar C.isDigit $ \c -> increment c <$> tokenDecimalCharacterReference , if_ (== ';') $ packToken ([], (0, 0)) , elsePush_ $ packToken ([MissingSemicolonAfterCharacterReference], (0, 0)) ] where increment c = fmap $ \(pos, accum) -> (pos + 1, fromIntegral (fromEnum c - 0x30) * (10 ^ pos) + accum) -- | __HTML:__ -- @[numeric character reference end state] -- (https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state)@ -- -- The instructions for processing a theoretically-Unicode code point into an -- actually-legal 'Char', after reading a non-digit character following a -- @"&#"@ in a section of the state machine which allows character references. tokenNumericCharacterReferenceEnd :: Z.Natural -> ([ParseError], Char) tokenNumericCharacterReferenceEnd 0x00 = ([NullCharacterReference], replacementChar) tokenNumericCharacterReferenceEnd code | code > 0x10FFFF = ([CharacterReferenceOutsideUnicodeRange code], replacementChar) | range 0xD800 0xDFFF code = ([SurrogateCharacterReference . toEnum $ fromIntegral code], replacementChar) | range 0xFDD0 0xFDEF code = ([NoncharacterCharacterReference . toEnum $ fromIntegral code], toEnum $ fromIntegral code) | cMod == 0xFFFE || cMod == 0xFFFF = ([NoncharacterCharacterReference . toEnum $ fromIntegral code], toEnum $ fromIntegral code) | range 0x00 0x1F code && notElem code [0x09, 0x0A, 0x0C] = ([ControlCharacterReference], toEnum $ fromIntegral code) | range 0x7F 0x9F code = ([ControlCharacterReference], Y.fromMaybe (toEnum $ fromIntegral code) . N.join $ controlReplacement !? (fromIntegral code - 0x7F)) | otherwise = ([], toEnum $ fromIntegral code) where cMod = fromIntegral code :: W.Word16 -- | The specification-defined replacements for the C1 control characters, plus -- @0x7F@ (DEL). 'Nothing' placeholders are used for controls without a -- replacement, to allow indexing by the code point minus 0x7F. controlReplacement :: V.Vector (Maybe Char) controlReplacement = V.fromList [ Nothing -- 0x7F , Just '\x20AC' -- 0x80 , Nothing -- 0x81 , Just '\x201A' -- 0x82 , Just '\x0192' -- 0x83 , Just '\x201E' -- 0x84 , Just '\x2026' -- 0x85 , Just '\x2020' -- 0x86 , Just '\x2021' -- 0x87 , Just '\x02C6' -- 0x88 , Just '\x2030' -- 0x89 , Just '\x0160' -- 0x8A , Just '\x2039' -- 0x8B , Just '\x0152' -- 0x8C , Nothing -- 0x8D , Just '\x017D' -- 0x8E , Nothing -- 0x8F , Nothing -- 0x90 , Just '\x2018' -- 0x91 , Just '\x2019' -- 0x92 , Just '\x201C' -- 0x93 , Just '\x201D' -- 0x94 , Just '\x2022' -- 0x95 , Just '\x2013' -- 0x96 , Just '\x2014' -- 0x97 , Just '\x02DC' -- 0x98 , Just '\x2122' -- 0x99 , Just '\x0161' -- 0x9A , Just '\x203A' -- 0x9B , Just '\x0153' -- 0x9C , Nothing -- 0x9D , Just '\x017E' -- 0x9E , Just '\x0178' -- 0x9F ]