{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE LambdaCase #-} -- | This is a performance-oriented HTML tokenizer aim at web-crawling -- applications. It follows the HTML5 parsing specification quite closely, -- so it behaves reasonable well on ill-formed documents from the open Web. -- module Text.HTML.Parser ( -- * Parsing parseTokens , parseTokensLazy , token -- * Types , Token(..) , TagName, AttrName, AttrValue , Attr(..) -- * Rendering, text canonicalization , renderTokens , renderToken , renderAttrs , renderAttr , canonicalizeTokens ) where import Data.Char hiding (isSpace) import Data.List (unfoldr) import GHC.Generics import Control.Applicative import Data.Monoid import Control.Monad (guard) import Control.DeepSeq import Data.Attoparsec.Text import qualified Data.Attoparsec.Text.Lazy as AL import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import Data.Text.Lazy.Builder (Builder) import qualified Data.Text.Lazy.Builder as B import Prelude hiding (take, takeWhile) -- Section numbers refer to W3C HTML 5.2 specification. -- | A tag name (e.g. @body@) type TagName = Text -- | An attribute name (e.g. @href@) type AttrName = Text -- | The value of an attribute type AttrValue = Text -- | An HTML token data Token -- | An opening tag. Attribute ordering is arbitrary. = TagOpen !TagName [Attr] -- | A self-closing tag. | TagSelfClose !TagName [Attr] -- | A closing tag. | TagClose !TagName -- | The content between tags. | ContentText !Text -- | A single character of content | ContentChar !Char -- | Contents of a comment. | Comment !Builder -- | Doctype | Doctype !Text deriving (Show, Ord, Eq, Generic) -- | This is a bit of a hack endOfFileToken :: Token endOfFileToken = ContentText "" -- | An attribute of a tag data Attr = Attr !AttrName !AttrValue deriving (Show, Eq, Ord) instance NFData Token where rnf (Comment b) = rnf $ B.toLazyText b rnf _ = () -- | Parse a single 'Token'. token :: Parser Token token = dataState -- Start in the data state. -- | /§8.2.4.1/: Data state dataState :: Parser Token dataState = do content <- takeWhile (/= '<') if not $ T.null content then return $ ContentText content else char '<' >> tagOpen -- | /§8.2.4.6/: Tag open state tagOpen :: Parser Token tagOpen = (char '!' >> markupDeclOpen) <|> (char '/' >> endTagOpen) <|> (char '?' >> bogusComment mempty) <|> tagNameOpen <|> other where other = do return $ ContentChar '<' -- | /§8.2.4.7/: End tag open state endTagOpen :: Parser Token endTagOpen = tagNameClose -- | /§8.2.4.8/: Tag name state: the open case -- -- deviation: no lower-casing, don't handle NULL characters tagNameOpen :: Parser Token tagNameOpen = do tag <- tagName' id $ (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag []) <|> (char '/' >> selfClosingStartTag tag []) <|> (char '>' >> return (TagOpen tag [])) -- | /§8.2.4.10/: Tag name state: close case tagNameClose :: Parser Token tagNameClose = do tag <- tagName' char '>' >> return (TagClose tag) -- | /§8.2.4.10/: Tag name state: common code -- -- deviation: no lower-casing, don't handle NULL characters tagName' :: Parser Text tagName' = do c <- peekChar' guard $ isAsciiUpper c || isAsciiLower c takeWhile $ notInClass "\x09\x0a\x0c /<>" -- | /§8.2.4.40/: Self-closing start tag state selfClosingStartTag :: TagName -> [Attr] -> Parser Token selfClosingStartTag tag attrs = do (char '>' >> return (TagSelfClose tag attrs)) <|> (endOfInput >> return endOfFileToken) <|> beforeAttrName tag attrs -- | /§8.2.4.32/: Before attribute name state -- -- deviation: no lower-casing beforeAttrName :: TagName -> [Attr] -> Parser Token beforeAttrName tag attrs = do skipWhile $ inClass "\x09\x0a\x0c " id $ (char '/' >> selfClosingStartTag tag attrs) <|> (char '>' >> return (TagOpen tag attrs)) -- <|> (char '\x00' >> attrName tag attrs) -- TODO: NULL <|> attrName tag attrs -- | /§8.2.4.33/: Attribute name state attrName :: TagName -> [Attr] -> Parser Token attrName tag attrs = do name <- takeWhile $ notInClass "\x09\x0a\x0c /=>" id $ (endOfInput >> afterAttrName tag attrs name) <|> (char '=' >> beforeAttrValue tag attrs name) <|> try (do mc <- peekChar case mc of Just c | inClass "\x09\x0a\x0c />" c -> afterAttrName tag attrs name _ -> empty) -- <|> -- TODO: NULL -- | /§8.2.4.34/: After attribute name state afterAttrName :: TagName -> [Attr] -> AttrName -> Parser Token afterAttrName tag attrs name = do skipWhile $ inClass "\x09\x0a\x0c " id $ (char '/' >> selfClosingStartTag tag attrs) <|> (char '=' >> beforeAttrValue tag attrs name) <|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs))) <|> (endOfInput >> return endOfFileToken) <|> attrName tag (Attr name T.empty : attrs) -- not exactly sure this is right -- | /§8.2.4.35/: Before attribute value state beforeAttrValue :: TagName -> [Attr] -> AttrName -> Parser Token beforeAttrValue tag attrs name = do skipWhile $ inClass "\x09\x0a\x0c " id $ (char '"' >> attrValueDQuoted tag attrs name) <|> (char '\'' >> attrValueSQuoted tag attrs name) <|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs))) <|> attrValueUnquoted tag attrs name -- | /§8.2.4.36/: Attribute value (double-quoted) state attrValueDQuoted :: TagName -> [Attr] -> AttrName -> Parser Token attrValueDQuoted tag attrs name = do value <- takeWhile (/= '"') _ <- char '"' afterAttrValueQuoted tag attrs name value -- | /§8.2.4.37/: Attribute value (single-quoted) state attrValueSQuoted :: TagName -> [Attr] -> AttrName -> Parser Token attrValueSQuoted tag attrs name = do value <- takeWhile (/= '\'') _ <- char '\'' afterAttrValueQuoted tag attrs name value -- | /§8.2.4.38/: Attribute value (unquoted) state attrValueUnquoted :: TagName -> [Attr] -> AttrName -> Parser Token attrValueUnquoted tag attrs name = do value <- takeTill (inClass "\x09\x0a\x0c >") id $ (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs) -- unsure: don't emit? <|> (char '>' >> return (TagOpen tag (Attr name value : attrs))) <|> (endOfInput >> return endOfFileToken) -- | /§8.2.4.39/: After attribute value (quoted) state afterAttrValueQuoted :: TagName -> [Attr] -> AttrName -> AttrValue -> Parser Token afterAttrValueQuoted tag attrs name value = (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs') <|> (char '/' >> selfClosingStartTag tag attrs') <|> (char '>' >> return (TagOpen tag attrs')) <|> (endOfInput >> return endOfFileToken) where attrs' = Attr name value : attrs -- | /§8.2.4.41/: Bogus comment state bogusComment :: Builder -> Parser Token bogusComment content = do (char '>' >> return (Comment content)) <|> (endOfInput >> return (Comment content)) <|> (char '\x00' >> bogusComment (content <> "\xfffd")) <|> (anyChar >>= \c -> bogusComment (content <> B.singleton c)) -- | /§8.2.4.42/: Markup declaration open state markupDeclOpen :: Parser Token markupDeclOpen = try comment_ <|> try docType <|> bogusComment mempty where comment_ = string "--" >> commentStart docType = do -- switching this to asciiCI slowed things down by a factor of two s <- take 7 guard $ T.toLower s == "doctype" doctype -- | /§8.2.4.43/: Comment start state commentStart :: Parser Token commentStart = do (char '-' >> commentStartDash) <|> (char '>' >> return (Comment mempty)) <|> comment mempty -- | /§8.2.4.44/: Comment start dash state commentStartDash :: Parser Token commentStartDash = (char '-' >> commentEnd mempty) <|> (char '>' >> return (Comment mempty)) <|> (endOfInput >> return (Comment mempty)) <|> (comment (B.singleton '-')) -- | /§8.2.4.45/: Comment state comment :: Builder -> Parser Token comment content0 = do content <- B.fromText <$> takeWhile (notInClass "-\x00<") id $ (char '<' >> commentLessThan (content0 <> content <> "<")) <|> (char '-' >> commentEndDash (content0 <> content)) <|> (char '\x00' >> comment (content0 <> content <> B.singleton '\xfffd')) <|> (endOfInput >> return (Comment $ content0 <> content)) -- | /§8.2.46/: Comment less-than sign state commentLessThan :: Builder -> Parser Token commentLessThan content = (char '!' >> commentLessThanBang (content <> "!")) <|> (char '<' >> commentLessThan (content <> "<")) <|> comment content -- | /§8.2.47/: Comment less-than sign bang state commentLessThanBang :: Builder -> Parser Token commentLessThanBang content = (char '-' >> commentLessThanBangDash content) <|> comment content -- | /§8.2.48/: Comment less-than sign bang dash state commentLessThanBangDash :: Builder -> Parser Token commentLessThanBangDash content = (char '-' >> commentLessThanBangDashDash content) <|> commentEndDash content -- | /§8.2.49/: Comment less-than sign bang dash dash state commentLessThanBangDashDash :: Builder -> Parser Token commentLessThanBangDashDash content = (char '>' >> comment content) <|> (endOfInput >> comment content) <|> commentEnd content -- | /§8.2.4.50/: Comment end dash state commentEndDash :: Builder -> Parser Token commentEndDash content = do (char '-' >> commentEnd content) <|> (endOfInput >> return (Comment content)) <|> (comment (content <> "-")) -- | /§8.2.4.51/: Comment end state commentEnd :: Builder -> Parser Token commentEnd content = do (char '>' >> return (Comment content)) <|> (char '!' >> commentEndBang content) <|> (char '-' >> commentEnd (content <> "-")) <|> (endOfInput >> return (Comment content)) <|> (comment (content <> "--")) -- | /§8.2.4.52/: Comment end bang state commentEndBang :: Builder -> Parser Token commentEndBang content = do (char '-' >> commentEndDash (content <> "--!")) <|> (char '>' >> return (Comment content)) <|> (endOfInput >> return (Comment content)) <|> (comment (content <> "--!")) -- | /§8.2.4.53/: DOCTYPE state -- FIXME doctype :: Parser Token doctype = do content <- takeTill (=='>') _ <- char '>' return $ Doctype content -- | Parse a lazy list of tokens from strict 'Text'. parseTokens :: Text -> [Token] parseTokens = unfoldr f where f :: Text -> Maybe (Token, Text) f t | T.null t = Nothing | otherwise = case parse token t of Done rest tok -> Just (tok, rest) Partial cont -> case cont mempty of Done rest tok -> Just (tok, rest) _ -> Nothing _ -> Nothing -- | Parse a lazy list of tokens from lazy 'TL.Text'. parseTokensLazy :: TL.Text -> [Token] parseTokensLazy = unfoldr f where f :: TL.Text -> Maybe (Token, TL.Text) f t | TL.null t = Nothing | otherwise = case AL.parse token t of AL.Done rest tok -> Just (tok, rest) _ -> Nothing -- | See 'renderToken'. renderTokens :: [Token] -> TL.Text renderTokens = mconcat . fmap renderToken -- | (Somewhat) canonical string representation of 'Token'. renderToken :: Token -> TL.Text renderToken = TL.fromStrict . mconcat . \case (TagOpen n []) -> ["<", n, ">"] (TagOpen n attrs) -> ["<", n, " ", renderAttrs attrs, ">"] (TagSelfClose n attrs) -> ["<", n, " ", renderAttrs attrs, " />"] (TagClose n) -> [""] (ContentChar c) -> [T.singleton c] (ContentText t) -> [t] (Comment builder) -> [""] (Doctype t) -> [""] -- | See 'renderAttr'. renderAttrs :: [Attr] -> Text renderAttrs = T.unwords . fmap renderAttr . reverse -- | Does not escape quotation in attribute values! renderAttr :: Attr -> Text renderAttr (Attr k v) = mconcat [k, "=\"", v, "\""] -- | Meld neighoring 'ContentChar' and 'ContentText' constructors together and drops empty text -- elements. canonicalizeTokens :: [Token] -> [Token] canonicalizeTokens = filter (/= ContentText "") . meldTextTokens meldTextTokens :: [Token] -> [Token] meldTextTokens = concatTexts . fmap charToText where charToText (ContentChar c) = ContentText (T.singleton c) charToText t = t concatTexts = \case (ContentText t : ContentText t' : ts) -> concatTexts $ ContentText (t <> t') : ts (t : ts) -> t : concatTexts ts [] -> []