{-# LANGUAGE Trustworthy #-}

{-|
Description:    Tokenization rules for characters comprising character references.

Copyright:      (c) 2020-2021 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      stable
Portability:    portable

The monadic parser construction and prefix-preferring 'String' concatenation
mean that character references are resolved opposite the algorithm described in
the __[HTML](https://html.spec.whatwg.org/multipage/parsing.html)__
specification.  That is most notable in the numeric references, where the spec
recommends multiplying an accumulator by the numeric base before adding each
digit, while this implementation multiplies each digit by the positional value
before adding the accumulator.  The reversal in named references is more
conceptual and subtle, but the spec is worded to read a name from the input
while the implementation reads input 'Char's according to the list of reference
names; this is likely how most implementations accomplish it.
-}
module Web.Mangrove.Parse.Tokenize.Character
    ( flushCharRef
    , tokenCharacterReference
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad as N

import qualified Data.Bifunctor as F.B
import qualified Data.Char as C
import qualified Data.HashMap.Strict as M
import qualified Data.Maybe as Y
import qualified Data.Vector as V
import qualified Data.Word as W

import qualified Numeric.Natural as Z

import Data.Functor ( ($>) )
import Data.Vector ( (!?) )

import Web.Mangrove.Parse.Common.Error
import Web.Mangrove.Parse.Common.Character
import Web.Mangrove.Parse.Tokenize.Common
import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Util

import {-# SOURCE #-} Web.Mangrove.Parse.Tokenize.Dispatcher


-- | __HTML:__
--      @[flush code points consumed as a character reference]
--      (https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference)@
-- 
-- Transform a wrapped 'Char' sequence into a sequence of wrapped 'Character'
-- tokens.
flushCharRef :: TokenizerOutput String -> Tokenizer [TokenizerOutput Token]
flushCharRef out = case tokenizedOut out of
    "" -> case tokenizedErrs out of
        [] -> return []
        errs -> consTokenErrorsList errs . Y.fromMaybe [] <$> A.optional dispatcher
    [c] -> return [out { tokenizedOut = Character c }]
    (c:_) -> do
        let c' = out
                { tokenizedOut = Character c
                , tokenizedState = Nothing
                }
        cs' <- flushCharRef out'
        return $ c' : cs'
  where out' = out
            { tokenizedErrs = []
            , tokenizedOut = drop 1 $ tokenizedOut out
            }


-- | __HTML:__
--      @[character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state)@
-- 
-- The parsing instructions for after reading @"&"@ in a section of the state
-- machine which allows character references.
tokenCharacterReference
    :: Bool
        -- ^ Whether this parser was called from within a markup tag's attribute.
    -> Tokenizer (TokenizerOutput String)
tokenCharacterReference inAttribute = tokenizer (Just ([], "&"))
    [ ifPush_ isAsciiAlphaNum $ do
        ref' <- tokenNamedCharacterReference inAttribute characterReferences
        case tokenizedOut ref' of
            NotFound -> consTokenErrors (tokenizedErrs ref') . consOut '&' <$> tokenAmbiguousAmpersand
            Flush name -> return ref'
                { tokenizedOut = '&' : name
                }
            Found ref -> return ref'
                { tokenizedOut = ref
                }
    , if_ (== '#') tokenNumericCharacterReference
    , elsePush_ $ packToken ([], "&")
    ]

-- | The result of looking up a named character reference.
data CharacterReference
    = NotFound
        -- ^ No matching name found.
    | Flush String
        -- ^ A named reference was found, but historical reasons require
        -- emitting the name as characters anyway.
    | Found String
        -- ^ A named reference was found and successfully resolved.

-- | __HTML:__
--      @[named character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state)@
-- 
-- The parsing instructions for after reading @"&"@ followed by letters and/or
-- numbers in a section of the state machine which allows character references.
tokenNamedCharacterReference
    :: Bool
        -- ^ Whether this parser was called from within a markup tag's attribute.
    -> CharacterReferenceTree
        -- ^ The list of reference names, filtered according to what prefix has
        -- been already encountered.
    -> Tokenizer (TokenizerOutput CharacterReference)
tokenNamedCharacterReference inAttribute (CharacterReferenceTree refs) = do
    cm' <- A.optional next
    let state1 = endState
        state2 = do
            cm <- cm'
            decodedState cm
    case cm' of
        Nothing -> packState ([], NotFound) state1
        Just c -> case M.lookup (decodedOut c) refs of
            -- No reference found, and no further references possible.
            Nothing -> push c *> packState ([], NotFound) state2
            -- No reference found, but longer potential reference names exist.
            Just (Nothing, refs') -> do
                ref' <- tokenNamedCharacterReference inAttribute refs'
                let errs' = decodedErrs c ++ tokenizedErrs ref'
                case tokenizedOut ref' of
                    -- None of those longer names match.
                    NotFound -> push c *> packState ([], NotFound) state2
                    -- Fall back on the longer name.
                    Flush name -> return ref'
                        { tokenizedErrs = errs'
                        , tokenizedOut = Flush $ decodedOut c : name
                        }
                    Found ref -> return ref'
                        { tokenizedErrs = errs'
                        , tokenizedOut = Found ref
                        }
            -- Reference found at the current name.
            Just (Just found, refs') -> do
                semicolon <- A.optional $ next >>= satisfying (\d -> decodedOut d == ';')
                let state3 = maybe state2 decodedState semicolon
                if not (isSemicolonOptional found) && Y.isNothing semicolon
                    then tokenNamedCharacterReference inAttribute refs' >>= \ref ->
                        case tokenizedOut ref of
                            NotFound -> push c $> ref
                            _ -> return ref
                    else tokenNamedCharacterReference inAttribute refs' >>=
                        foundNamedCharacterReference
                            inAttribute
                            (Y.isJust semicolon)
                            (decodedOut c)
                            state3
                            (referenceValue found)

-- | Perform the logistics around determining what 'Character'(s) should be
-- emitted, based on the current environment and any longer match.
foundNamedCharacterReference
    :: Bool
        -- ^ Whether the reference is being evaluated as part of an attribute value.
    -> Bool
        -- ^ Whether the last character matched is a semicolon.
    -> Char
        -- ^ The last character of the reference, /excluding/ the semicolon.
    -> DecoderOutputState
        -- ^ Remainder of the binary document stream after the last character matched.
    -> String
        -- ^ The evaluated sequence represented by the name.
    -> TokenizerOutput CharacterReference
        -- ^ The value returned by continuing the lookup on a longer string.
    -> Tokenizer (TokenizerOutput CharacterReference)
foundNamedCharacterReference attribute semicolon char state found ref = case tokenizedOut ref of
    NotFound -> if attribute && not semicolon
        then do
            c' <- A.optional $ next >>= satisfying
                (\c -> decodedOut c == '=' || isAsciiAlphaNum (decodedOut c))
            case c' of
                Nothing -> packState ([MissingSemicolonAfterCharacterReference], Found found) state
                Just c -> push c *> packState ([], Flush [char]) state
        else packState (errs', Found found) state
    Flush name -> return $ ref
        { tokenizedOut = Flush $ char : name
        }
    _ -> return ref
  where errs'
            | semicolon = []
            | otherwise = [MissingSemicolonAfterCharacterReference]

-- | __HTML:__
--      @[ambiguous ampersand state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state)@
-- 
-- The parsing instructions for after reading @"&"@ followed by a string which
-- does not correspond to any known reference name, in a section of the state
-- machine which allows character references.
tokenAmbiguousAmpersand :: Tokenizer (TokenizerOutput String)
tokenAmbiguousAmpersand = tokenizer (Just ([], ""))
    [ ifChar isAsciiAlphaNum $ \c -> consOut c <$> tokenAmbiguousAmpersand
    , ifPush_ (== ';') $ packToken ([UnknownNamedCharacterReference], "")
    , elsePush_ $ packToken ([], "")
    ]

-- | __HTML:__
--      @[numeric character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state)@
-- 
-- The parsing instructions for after reading @"&#"@ in a section of the state
-- machine which allows character references.
tokenNumericCharacterReference :: Tokenizer (TokenizerOutput String)
tokenNumericCharacterReference = fmap (either ("&#" ++) id) <$> tokenizer
        (Just ([AbsenceOfDigitsInNumericCharacterReference], Left ""))
    [ if_ (== 'x') $ flatten . fmap (F.B.bimap ('x' :) tokenNumericCharacterReferenceEnd) <$>
        tokenHexadecimalCharacterReferenceStart
    , if_ (== 'X') $ flatten . fmap (F.B.bimap ('X' :) tokenNumericCharacterReferenceEnd) <$>
        tokenHexadecimalCharacterReferenceStart
    , elsePush_ $ flatten . fmap (F.B.second tokenNumericCharacterReferenceEnd) <$>
        tokenDecimalCharacterReferenceStart
    ]
  where flatten tok = case tokenizedOut tok of
            Left str -> tok { tokenizedOut = Left str }
            Right (errs, c) -> consTokenErrors errs $ tok { tokenizedOut = Right [c] }

-- | __HTML:__
--      @[hexadecimal character reference start state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state)@
-- 
-- The parsing instructions for after reading @"&#x"@ or @"&#X"@ in a section
-- of the state machine which allows character references.
tokenHexadecimalCharacterReferenceStart
    :: Tokenizer (TokenizerOutput (Either String Z.Natural))
        -- ^ The inner data contains a theoretically-Unicode code point
        -- ('Right', though it may exceed the upper bound) or the characters
        -- consumed in reading something invalid ('Left'), as relevant.
tokenHexadecimalCharacterReferenceStart = tokenizer
        (Just ([AbsenceOfDigitsInNumericCharacterReference], Left ""))
    [ ifPush_ C.isHexDigit $ fmap packReference tokenHexadecimalCharacterReference
    , elsePush_ $ packToken ([AbsenceOfDigitsInNumericCharacterReference], Left "")
    ]

-- | __HTML:__
--      @[decimal character reference start state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state)@
-- 
-- The parsing instructions for after reading @"&#"@ followed by digit from @0@
-- to @9@ in a section of the state machine which allows character references.
tokenDecimalCharacterReferenceStart
    :: Tokenizer (TokenizerOutput (Either String Z.Natural))
        -- ^ The inner data contains a theoretically-Unicode code point
        -- ('Right', though it may exceed the upper bound) or the characters
        -- consumed in reading something invalid ('Left'), as relevant.
tokenDecimalCharacterReferenceStart = tokenizer
        (Just ([AbsenceOfDigitsInNumericCharacterReference], Left ""))
    [ ifPush_ C.isDigit $ fmap packReference tokenDecimalCharacterReference
    , elsePush_ $ packToken ([AbsenceOfDigitsInNumericCharacterReference], Left "")
    ]

-- | Extract the Unicode-ish code point calculated by the numeric character
-- reference loops, and repack it as output by the respective initializer.
packReference :: TokenizerOutput (Z.Natural, Z.Natural) -> TokenizerOutput (Either String Z.Natural)
packReference = fmap $ Right . snd

-- | __HTML:__
--      @[hexadecimal character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state)@
-- 
-- The parsing instructions for after reading @"&#x"@ or @"&#X"@ followed by a
-- hexadecimal digit in a section of the state machine which allows character
-- references.
tokenHexadecimalCharacterReference
    :: Tokenizer (TokenizerOutput (Z.Natural, Z.Natural))
        -- ^ The inner data contains the number of valid numeric digits to the
        -- right ('fst') and the number composed from those digits ('snd').
        -- 
        -- The datatype has been chosen so that even extremely long references
        -- (over a gigabyte of digits /at minimum/ with 'Word') have no chance
        -- of causing system-dependant behaviour; @&#x20;@ and @&#x20..0;@ being
        -- equivalent when the second has 2^30 @0@s (among other alignments)
        -- could potentially allow some obscure attack.
tokenHexadecimalCharacterReference = tokenizer
        (Just ([MissingSemicolonAfterCharacterReference], (0, 0)))
    [ ifChar C.isDigit $ \c -> increment 0x30 c <$> tokenHexadecimalCharacterReference
    , ifChar (range 'A' 'F') $ \c -> increment 0x37 c <$> tokenHexadecimalCharacterReference
    , ifChar (range 'a' 'f') $ \c -> increment 0x57 c <$> tokenHexadecimalCharacterReference
    , if_ (== ';') $ packToken ([], (0, 0))
    , elsePush_ $ packToken ([MissingSemicolonAfterCharacterReference], (0, 0))
    ]
  where increment offset c = fmap $ \(pos, accum) ->
            (pos + 1, fromIntegral (fromEnum c - offset) * (16 ^ pos) + accum)

-- | __HTML:__
--      @[decimal character reference state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state)@
-- 
-- The parsing instructions for after reading @"&#"@ followed by a digit from
-- @0@ through @9@ in a section of the state machine which allows character
-- references.
tokenDecimalCharacterReference
    :: Tokenizer (TokenizerOutput (Z.Natural, Z.Natural))
        -- ^ The inner data contains the number of valid numeric digits to the
        -- right ('fst') and the number composed from those digits ('snd').
        -- 
        -- The datatype has been chosen so that even extremely long references
        -- (over a gigabyte of digits /at minimum/ with 'Word') have no chance
        -- of causing system-dependant behaviour; @&#10;@ and @&#10..0;@ being
        -- equivalent when the second has 2^30 @0@s (among other alignments)
        -- could potentially allow some obscure attack.
tokenDecimalCharacterReference = tokenizer
        (Just ([MissingSemicolonAfterCharacterReference], (0, 0)))
    [ ifChar C.isDigit $ \c -> increment c <$> tokenDecimalCharacterReference
    , if_ (== ';') $ packToken ([], (0, 0))
    , elsePush_ $ packToken ([MissingSemicolonAfterCharacterReference], (0, 0))
    ]
  where increment c = fmap $ \(pos, accum) ->
            (pos + 1, fromIntegral (fromEnum c - 0x30) * (10 ^ pos) + accum)

-- | __HTML:__
--      @[numeric character reference end state]
--      (https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state)@
-- 
-- The instructions for processing a theoretically-Unicode code point into an
-- actually-legal 'Char', after reading a non-digit character following a
-- @"&#"@ in a section of the state machine which allows character references.
tokenNumericCharacterReferenceEnd :: Z.Natural -> ([ParseError], Char)
tokenNumericCharacterReferenceEnd 0x00 = ([NullCharacterReference], replacementChar)
tokenNumericCharacterReferenceEnd code
    | code > 0x10FFFF =
        ([CharacterReferenceOutsideUnicodeRange code], replacementChar)
    | range 0xD800 0xDFFF code =
        ([SurrogateCharacterReference . toEnum $ fromIntegral code], replacementChar)
    | range 0xFDD0 0xFDEF code =
        ([NoncharacterCharacterReference . toEnum $ fromIntegral code], toEnum $ fromIntegral code)
    | cMod == 0xFFFE || cMod == 0xFFFF =
        ([NoncharacterCharacterReference . toEnum $ fromIntegral code], toEnum $ fromIntegral code)
    | range 0x00 0x1F code && notElem code [0x09, 0x0A, 0x0C] =
        ([ControlCharacterReference], toEnum $ fromIntegral code)
    | range 0x7F 0x9F code = ([ControlCharacterReference],
        Y.fromMaybe (toEnum $ fromIntegral code) . N.join $
        controlReplacement !? (fromIntegral code - 0x7F))
    | otherwise = ([], toEnum $ fromIntegral code)
  where cMod = fromIntegral code :: W.Word16

-- | The specification-defined replacements for the C1 control characters, plus
-- @0x7F@ (DEL).  'Nothing' placeholders are used for controls without a
-- replacement, to allow indexing by the code point minus 0x7F.
controlReplacement :: V.Vector (Maybe Char)
controlReplacement = V.fromList
    [ Nothing       -- 0x7F
    , Just '\x20AC' -- 0x80
    , Nothing       -- 0x81
    , Just '\x201A' -- 0x82
    , Just '\x0192' -- 0x83
    , Just '\x201E' -- 0x84
    , Just '\x2026' -- 0x85
    , Just '\x2020' -- 0x86
    , Just '\x2021' -- 0x87
    , Just '\x02C6' -- 0x88
    , Just '\x2030' -- 0x89
    , Just '\x0160' -- 0x8A
    , Just '\x2039' -- 0x8B
    , Just '\x0152' -- 0x8C
    , Nothing       -- 0x8D
    , Just '\x017D' -- 0x8E
    , Nothing       -- 0x8F
    , Nothing       -- 0x90
    , Just '\x2018' -- 0x91
    , Just '\x2019' -- 0x92
    , Just '\x201C' -- 0x93
    , Just '\x201D' -- 0x94
    , Just '\x2022' -- 0x95
    , Just '\x2013' -- 0x96
    , Just '\x2014' -- 0x97
    , Just '\x02DC' -- 0x98
    , Just '\x2122' -- 0x99
    , Just '\x0161' -- 0x9A
    , Just '\x203A' -- 0x9B
    , Just '\x0153' -- 0x9C
    , Nothing       -- 0x9D
    , Just '\x017E' -- 0x9E
    , Just '\x0178' -- 0x9F
    ]