{-| Description: Tokenization rules for characters comprising a @\@ declaration. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: stable Portability: portable -} module Web.Mangrove.Parse.Tokenize.Doctype ( tokenDoctype ) where import qualified Data.Char as C import qualified Data.Text as T import Web.Mangrove.Parse.Common.Error import Web.Mangrove.Parse.Tokenize.Common import Web.Willow.Common.Encoding.Character import Web.Willow.Common.Parser.Util import Control.Applicative ( (<|>) ) -- | An intermediate form of 'DoctypeParams' with unpacked parameter types to -- facilitate construction. Specifically, 'Char'-by-'Char' construction of a -- 'String' followed by a single 'T.pack' is much faster than repeated @O(n)@ -- calls to 'T.cons'. Values may be easily instantiated as updates to -- @emptyDoctypeData@ or @quirksDoctype@. data DoctypeParams' = DoctypeParams' { doctypeName' :: Maybe String -- ^ See 'doctypeName'. , doctypePublicId' :: Maybe String -- ^ See 'doctypePublicId'. , doctypeSystemId' :: Maybe String -- ^ See 'doctypeSystemId'. , doctypeQuirks' :: Bool -- ^ See 'doctypeQuirks'. } deriving ( Eq, Show, Read ) -- | A sane default collection for easy record initialization; namely, -- 'Nothing's and 'False'. See also @quirksDoctype@. emptyDoctypeData :: DoctypeParams' emptyDoctypeData = DoctypeParams' { doctypeName' = Nothing , doctypePublicId' = Nothing , doctypeSystemId' = Nothing , doctypeQuirks' = False } -- | As like @emptyDoctypeData@, but indicating that the document should be -- forced to render in the quirks mode. quirksDoctype :: DoctypeParams' quirksDoctype = emptyDoctypeData { doctypeQuirks' = True } -- | Wrap a given collection of data into the payload of a 'Doctype'. packDoctypeToken :: TokenizerOutput DoctypeParams' -> TokenizerOutput Token packDoctypeToken d' = flip fmap d' $ \d -> Doctype $ emptyDoctypeParams { doctypeName = T.pack <$> doctypeName' d , doctypePublicId = T.pack <$> doctypePublicId' d , doctypeSystemId = T.pack <$> doctypeSystemId' d , doctypeQuirks = doctypeQuirks' d } -- | Mark a collection of data as containing a doctype public identifier, -- without overwriting any data which may already be set. ensurePublicId :: TokenizerOutput DoctypeParams' -> TokenizerOutput DoctypeParams' ensurePublicId = fmap $ \d -> d { doctypePublicId' = doctypePublicId' d <|> Just "" } -- | Mark a collection of data as containing a doctype system identifier, -- without overwriting any data which may already be set. ensureSystemId :: TokenizerOutput DoctypeParams' -> TokenizerOutput DoctypeParams' ensureSystemId = fmap $ \d -> d { doctypeSystemId' = doctypeSystemId' d <|> Just "" } -- | Mark a collection of data as forcing a document to render in quirks mode. ensureQuirks :: TokenizerOutput DoctypeParams' -> TokenizerOutput DoctypeParams' ensureQuirks = fmap $ \d -> d { doctypeQuirks' = True } -- | Prepend a character to the unpacked name of the root element -- or, in -- other words, the markup language used. consDoctypeName :: Char -> TokenizerOutput DoctypeParams' -> TokenizerOutput DoctypeParams' consDoctypeName c = fmap $ \d -> d { doctypeName' = fmap (c :) $ doctypeName' d <|> Just [] } -- | Prepend a character to the unpacked public identifier of a doctype. consPublicId :: Char -> TokenizerOutput DoctypeParams' -> TokenizerOutput DoctypeParams' consPublicId c = fmap $ \d -> d { doctypePublicId' = fmap (c :) $ doctypePublicId' d <|> Just [] } -- | Prepend a character to the unpacked system identifier of a doctype. consSystemId :: Char -> TokenizerOutput DoctypeParams' -> TokenizerOutput DoctypeParams' consSystemId c = fmap $ \d -> d { doctypeSystemId' = fmap (c :) $ doctypeSystemId' d <|> Just [] } -- | __HTML:__ -- @[DOCTYPE state] -- (https://html.spec.whatwg.org/multipage/parsing.html#doctype-state)@ -- -- The parsing instructions for after reading @"\') $ fmap packDoctypeToken tokenBeforeDoctypeName , elsePush_ $ fmap (packDoctypeToken . consTokenError MissingWhitespaceBeforeDoctypeName) tokenBeforeDoctypeName ] where quirksDoctype' = Doctype $ emptyDoctypeParams { doctypeQuirks = True } -- | __HTML:__ -- @[before DOCTYPE name state] -- (https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state)@ -- -- The parsing instructions for after reading @"\ tokenDoctypeName , if_ (== '>') $ packToken ([MissingDoctypeName], quirksDoctype) , elseChar $ \c -> consDoctypeName (toAsciiLower c) <$> tokenDoctypeName ] -- | __HTML:__ -- @[DOCTYPE name state] -- (https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state)@ -- -- The parsing instructions at the initial segment of the doctype declaration -- section of the state machine. tokenDoctypeName :: Tokenizer (TokenizerOutput DoctypeParams') tokenDoctypeName = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenAfterDoctypeName , if_ (== '>') $ changeState DataState *> packToken ([], emptyDoctypeData) , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consDoctypeName replacementChar <$> tokenDoctypeName , elseChar $ \c -> consDoctypeName (toAsciiLower c) <$> tokenDoctypeName ] -- | __HTML:__ -- @[after DOCTYPE name state] -- (https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state)@ -- -- The parsing instructions for after reading the root element in the doctype -- declaration section of the state machine. tokenAfterDoctypeName :: Tokenizer (TokenizerOutput DoctypeParams') tokenAfterDoctypeName = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenAfterDoctypeName , if_ (== '>') $ changeState DataState *> packToken ([], emptyDoctypeData) , elsePush_ tokenAfterDoctypeName' ] -- | Dispatch the parser according to a @"PUBLIC"@ or @"SYSTEM"@ keyword in the -- doctype declaration. tokenAfterDoctypeName' :: Tokenizer (TokenizerOutput DoctypeParams') tokenAfterDoctypeName' = choice [ chunk' C.toUpper "PUBLIC" *> tokenAfterDoctypePublicKeyword , chunk' C.toUpper "SYSTEM" *> tokenAfterDoctypeSystemKeyword , consTokenError InvalidCharacterSequenceAfterDoctypeName . ensureQuirks <$> tokenBogusDoctype ] -- | __HTML:__ -- @[after DOCTYPE public keyword state] -- (https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state)@ -- -- The parsing instructions for after reading @"PUBLIC"@ in the doctype -- declaration section of the state machine. tokenAfterDoctypePublicKeyword :: Tokenizer (TokenizerOutput DoctypeParams') tokenAfterDoctypePublicKeyword = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenBeforeDoctypePublicIdentifier , if_ (== '"') $ consTokenError MissingWhitespaceAfterDoctypePublicKeyword . ensurePublicId <$> tokenDoctypePublicIdentifierDoubleQuoted , if_ (== '\'') $ consTokenError MissingWhitespaceAfterDoctypePublicKeyword . ensurePublicId <$> tokenDoctypePublicIdentifierSingleQuoted , if_ (== '>') $ changeState DataState *> packToken ([MissingDoctypePublicIdentifier], quirksDoctype) , elsePush_ $ consTokenError MissingQuoteBeforeDoctypePublicIdentifier . ensureQuirks <$> tokenBogusDoctype ] -- | __HTML:__ -- @[before DOCTYPE public identifier state] -- (https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state)@ -- -- The parsing instructions for after reading @"PUBLIC "@ in the doctype -- declaration section of the state machine. tokenBeforeDoctypePublicIdentifier :: Tokenizer (TokenizerOutput DoctypeParams') tokenBeforeDoctypePublicIdentifier = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenBeforeDoctypePublicIdentifier , if_ (== '"') $ fmap ensurePublicId tokenDoctypePublicIdentifierDoubleQuoted , if_ (== '\'') $ fmap ensurePublicId tokenDoctypePublicIdentifierSingleQuoted , if_ (== '>') $ changeState DataState *> packToken ([MissingDoctypePublicIdentifier], quirksDoctype) , elsePush_ $ consTokenError MissingQuoteBeforeDoctypePublicIdentifier . ensureQuirks <$> tokenBogusDoctype ] -- | __HTML:__ -- @[DOCTYPE public identifier (double-quoted) state] -- (https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted\)-state)@ -- -- The parsing instructions for after reading @"PUBLIC \\""@ in the doctype -- declaration section of the state machine. tokenDoctypePublicIdentifierDoubleQuoted :: Tokenizer (TokenizerOutput DoctypeParams') tokenDoctypePublicIdentifierDoubleQuoted = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ (== '"') tokenAfterDoctypePublicIdentifier , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consPublicId replacementChar <$> tokenDoctypePublicIdentifierDoubleQuoted , if_ (== '>') $ changeState DataState *> packToken ([AbruptDoctypePublicIdentifier], quirksDoctype) , elseChar $ \c -> consPublicId c <$> tokenDoctypePublicIdentifierDoubleQuoted ] -- | __HTML:__ -- @[DOCTYPE public identifier (single-quoted) state] -- (https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted\)-state)@ -- -- The parsing instructions for after reading @"PUBLIC '"@ in the doctype -- declaration section of the state machine. tokenDoctypePublicIdentifierSingleQuoted :: Tokenizer (TokenizerOutput DoctypeParams') tokenDoctypePublicIdentifierSingleQuoted = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ (== '\'') tokenAfterDoctypePublicIdentifier , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consPublicId replacementChar <$> tokenDoctypePublicIdentifierSingleQuoted , if_ (== '>') $ changeState DataState *> packToken ([AbruptDoctypePublicIdentifier], quirksDoctype) , elseChar $ \c -> consPublicId c <$> tokenDoctypePublicIdentifierSingleQuoted ] -- | __HTML:__ -- @[after DOCTYPE public identifier state] -- (https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state)@ -- -- The parsing instructions for after reading the closing quote of a public -- identifier in the doctype declaration section of the state machine. tokenAfterDoctypePublicIdentifier :: Tokenizer (TokenizerOutput DoctypeParams') tokenAfterDoctypePublicIdentifier = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenBetweenDoctypePublicAndSystemIdentifiers , if_ (== '>') $ changeState DataState *> packToken ([], emptyDoctypeData) , if_ (== '"') $ consTokenError MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers . ensureSystemId <$> tokenDoctypeSystemIdentifierDoubleQuoted , if_ (== '\'') $ consTokenError MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers . ensureSystemId <$> tokenDoctypeSystemIdentifierSingleQuoted , elsePush_ $ consTokenError MissingQuoteBeforeDoctypeSystemIdentifier . ensureQuirks <$> tokenBogusDoctype ] -- | __HTML:__ -- @[between DOCTYPE public and system identifier state] -- (https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state)@ -- -- The parsing instructions for after reading a whitespace character after the -- closing quote of a public identifier in the doctype declaration section of -- the state machine. tokenBetweenDoctypePublicAndSystemIdentifiers :: Tokenizer (TokenizerOutput DoctypeParams') tokenBetweenDoctypePublicAndSystemIdentifiers = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenBetweenDoctypePublicAndSystemIdentifiers , if_ (== '>') $ changeState DataState *> packToken ([], emptyDoctypeData) , if_ (== '"') $ fmap ensureSystemId tokenDoctypeSystemIdentifierDoubleQuoted , if_ (== '\'') $ fmap ensureSystemId tokenDoctypeSystemIdentifierSingleQuoted , elsePush_ $ consTokenError MissingQuoteBeforeDoctypeSystemIdentifier . ensureQuirks <$> tokenBogusDoctype ] -- | __HTML:__ -- @[after DOCTYPE system keyword state] -- (https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state)@ -- -- The parsing instructions for after reading @"SYSTEM"@ in the doctype -- declaration section of the state machine. tokenAfterDoctypeSystemKeyword :: Tokenizer (TokenizerOutput DoctypeParams') tokenAfterDoctypeSystemKeyword = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenBeforeDoctypeSystemIdentifier , if_ (== '"') $ consTokenError MissingWhitespaceAfterDoctypeSystemKeyword . ensureSystemId <$> tokenDoctypeSystemIdentifierDoubleQuoted , if_ (== '\'') $ consTokenError MissingWhitespaceAfterDoctypeSystemKeyword . ensureSystemId <$> tokenDoctypeSystemIdentifierSingleQuoted , if_ (== '>') $ changeState DataState *> packToken ([MissingDoctypeSystemIdentifier], quirksDoctype) , elsePush_ $ consTokenError MissingQuoteBeforeDoctypeSystemIdentifier . ensureQuirks <$> tokenBogusDoctype ] -- | __HTML:__ -- @[before DOCTYPE system identifier state] -- (https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state)@ -- -- The parsing instructions for after reading @"SYSTEM "@ in the doctype -- declaration section of the state machine. tokenBeforeDoctypeSystemIdentifier :: Tokenizer (TokenizerOutput DoctypeParams') tokenBeforeDoctypeSystemIdentifier = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenBeforeDoctypeSystemIdentifier , if_ (== '"') $ fmap ensureSystemId tokenDoctypeSystemIdentifierDoubleQuoted , if_ (== '\'') $ fmap ensureSystemId tokenDoctypeSystemIdentifierSingleQuoted , if_ (== '>') $ changeState DataState *> packToken ([MissingDoctypeSystemIdentifier], quirksDoctype) , elsePush_ $ consTokenError MissingQuoteBeforeDoctypeSystemIdentifier . ensureQuirks <$> tokenBogusDoctype ] -- | __HTML:__ -- @[DOCTYPE system identifier (double-quoted) state] -- (https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted\)-state)@ -- -- The parsing instructions for after reading @"SYSTEM \\""@, or a @'"'@ after -- a public identifier, in the doctype declaration section of the state -- machine. tokenDoctypeSystemIdentifierDoubleQuoted :: Tokenizer (TokenizerOutput DoctypeParams') tokenDoctypeSystemIdentifierDoubleQuoted = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ (== '"') tokenAfterDoctypeSystemIdentifier , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consSystemId replacementChar <$> tokenDoctypeSystemIdentifierDoubleQuoted , if_ (== '>') $ changeState DataState *> packToken ([AbruptDoctypeSystemIdentifier], quirksDoctype) , elseChar $ \c -> consSystemId c <$> tokenDoctypeSystemIdentifierDoubleQuoted ] -- | __HTML:__ -- @[DOCTYPE system identifier (single-quoted) state] -- (https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted\)-state)@ -- -- The parsing instructions for after reading @"SYSTEM '"@, or a @'\''@ after -- a public identifier, in the doctype declaration section of the state -- machine. tokenDoctypeSystemIdentifierSingleQuoted :: Tokenizer (TokenizerOutput DoctypeParams') tokenDoctypeSystemIdentifierSingleQuoted = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ (== '\'') tokenAfterDoctypeSystemIdentifier , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter . consSystemId replacementChar <$> tokenDoctypeSystemIdentifierSingleQuoted , if_ (== '>') $ changeState DataState *> packToken ([AbruptDoctypeSystemIdentifier], quirksDoctype) , elseChar $ \c -> consSystemId c <$> tokenDoctypeSystemIdentifierSingleQuoted ] -- | __HTML:__ -- @[after DOCTYPE system identifier state] -- (https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state)@ -- -- The parsing instructions for after reading the closing quote of a system -- identifier in the doctype declaration section of the state machine. tokenAfterDoctypeSystemIdentifier :: Tokenizer (TokenizerOutput DoctypeParams') tokenAfterDoctypeSystemIdentifier = tokenizer (Just ([EOFInDoctype], quirksDoctype)) [ if_ isAsciiWhitespace tokenAfterDoctypeSystemIdentifier , if_ (== '>') $ changeState DataState *> packToken ([], emptyDoctypeData) , elsePushChar $ \c -> consTokenError (UnexpectedCharacterAfterDoctypeSystemIdentifier c) <$> tokenBogusDoctype ] -- | __HTML:__ -- @[bogus DOCTYPE state] -- (https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state)@ -- -- The parsing instructions for after reading a disallowed character sequence -- in the doctype declaration section of the state machine. tokenBogusDoctype :: Tokenizer (TokenizerOutput DoctypeParams') tokenBogusDoctype = tokenizer (Just ([], emptyDoctypeData)) [ if_ (== '>') $ changeState DataState *> packToken ([], emptyDoctypeData) , if_ (== '\NUL') $ consTokenError UnexpectedNullCharacter <$> tokenBogusDoctype , else_ tokenBogusDoctype ]