-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A high-performance HTML tokenizer -- -- This package provides a fast and reasonably robust HTML5 tokenizer -- built upon the attoparsec library. The parsing strategy is -- based upon the HTML5 parsing specification with few deviations. -- -- For instance, -- --
-- >>> parseTokens "<div><h1 class=widget>Hello World</h1><br/>" -- [TagOpen "div" [], -- TagOpen "h1" [Attr "class" "widget"], -- ContentText "Hello World", -- TagClose "h1", -- TagSelfClose "br" []] ---- -- The package targets similar use-cases to the venerable -- tagsoup library, but is significantly more efficient, -- achieving parsing speeds of over 80 megabytes per second on modern -- hardware and typical web documents. Here are some typical performance -- numbers taken from parsing a Wikipedia article of moderate length: -- --
-- benchmarking Forced/tagsoup fast Text -- time 186.1 ms (175.3 ms .. 194.6 ms) -- 0.999 R² (0.995 R² .. 1.000 R²) -- mean 191.7 ms (188.9 ms .. 198.3 ms) -- std dev 5.053 ms (1.092 ms .. 6.809 ms) -- variance introduced by outliers: 14% (moderately inflated) -- -- benchmarking Forced/tagsoup normal Text -- time 189.7 ms (182.8 ms .. 197.7 ms) -- 0.999 R² (0.998 R² .. 1.000 R²) -- mean 196.5 ms (193.1 ms .. 202.1 ms) -- std dev 5.481 ms (2.141 ms .. 7.383 ms) -- variance introduced by outliers: 14% (moderately inflated) -- -- benchmarking Forced/html-parser -- time 15.81 ms (15.75 ms .. 15.89 ms) -- 1.000 R² (1.000 R² .. 1.000 R²) -- mean 15.72 ms (15.66 ms .. 15.77 ms) -- std dev 140.9 μs (113.6 μs .. 174.5 μs) --@package html-parse @version 0.2.0.2 -- | This is a performance-oriented HTML tokenizer aimed at web-crawling -- applications. It follows the HTML5 parsing specification quite -- closely, so it behaves reasonably well on ill-formed documents from -- the open Web. module Text.HTML.Parser -- | Parse a lazy list of tokens from strict Text. parseTokens :: Text -> [Token] -- | Parse a lazy list of tokens from lazy Text. parseTokensLazy :: Text -> [Token] -- | Parse a single Token. token :: Parser Token -- | An HTML token data Token -- | An opening tag. Attribute ordering is arbitrary. TagOpen :: !TagName -> [Attr] -> Token -- | A self-closing tag. TagSelfClose :: !TagName -> [Attr] -> Token -- | A closing tag. TagClose :: !TagName -> Token -- | The content between tags. ContentText :: !Text -> Token -- | A single character of content ContentChar :: !Char -> Token -- | Contents of a comment. Comment :: !Builder -> Token -- | Doctype Doctype :: !Text -> Token -- | A tag name (e.g. body) type TagName = Text -- | An attribute name (e.g. href) type AttrName = Text -- | The value of an attribute type AttrValue = Text -- | An attribute of a tag data Attr Attr :: !AttrName -> !AttrValue -> Attr -- | See renderToken. 
renderTokens :: [Token] -> Text -- | (Somewhat) canonical string representation of Token. renderToken :: Token -> Text -- | See renderAttr. renderAttrs :: [Attr] -> Text -- | Does not escape quotation in attribute values! renderAttr :: Attr -> Text -- | Melds neighboring ContentChar and ContentText constructors -- together and drops empty text elements. canonicalizeTokens :: [Token] -> [Token] instance GHC.Generics.Generic Text.HTML.Parser.Token instance GHC.Classes.Eq Text.HTML.Parser.Token instance GHC.Classes.Ord Text.HTML.Parser.Token instance GHC.Show.Show Text.HTML.Parser.Token instance GHC.Classes.Ord Text.HTML.Parser.Attr instance GHC.Classes.Eq Text.HTML.Parser.Attr instance GHC.Show.Show Text.HTML.Parser.Attr instance Control.DeepSeq.NFData Text.HTML.Parser.Token module Text.HTML.Tree tokensToForest :: [Token] -> Either ParseTokenForestError (Forest Token) data ParseTokenForestError ParseTokenForestErrorBracketMismatch :: PStack -> (Maybe Token) -> ParseTokenForestError data PStack PStack :: Forest Token -> [(Token, Forest Token)] -> PStack [_pstackToplevelSiblings] :: PStack -> Forest Token [_pstackParents] :: PStack -> [(Token, Forest Token)] nonClosing :: [Text] tokensFromForest :: Forest Token -> [Token] tokensFromTree :: Tree Token -> [Token] instance GHC.Show.Show Text.HTML.Tree.ParseTokenForestError instance GHC.Classes.Eq Text.HTML.Tree.ParseTokenForestError instance GHC.Show.Show Text.HTML.Tree.PStack instance GHC.Classes.Eq Text.HTML.Tree.PStack