html-parse-0.2.0.1: A high-performance HTML tokenizer

Safe Haskell: Safe
Language: Haskell2010

Text.HTML.Parser

Contents

Description

This is a performance-oriented HTML tokenizer aimed at web-crawling applications. It follows the HTML5 parsing specification quite closely, so it behaves reasonably well on ill-formed documents from the open Web.

Synopsis

Parsing

parseTokens :: Text -> [Token] Source #

Parse a lazy list of tokens from strict Text.

parseTokensLazy :: Text -> [Token] Source #

Parse a lazy list of tokens from lazy Text.

token :: Parser Token Source #

Parse a single Token.

Types

data Token Source #

An HTML token

Constructors

TagOpen !TagName [Attr]

An opening tag. Attribute ordering is arbitrary.

TagSelfClose !TagName [Attr]

A self-closing tag.

TagClose !TagName

A closing tag.

ContentText !Text

The content between tags.

ContentChar !Char

A single character of content

Comment !Builder

Contents of a comment.

Doctype !Text

A doctype declaration.

Instances

Eq Token Source # 

Methods

(==) :: Token -> Token -> Bool #

(/=) :: Token -> Token -> Bool #

Ord Token Source # 

Methods

compare :: Token -> Token -> Ordering #

(<) :: Token -> Token -> Bool #

(<=) :: Token -> Token -> Bool #

(>) :: Token -> Token -> Bool #

(>=) :: Token -> Token -> Bool #

max :: Token -> Token -> Token #

min :: Token -> Token -> Token #

Show Token Source # 

Methods

showsPrec :: Int -> Token -> ShowS #

show :: Token -> String #

showList :: [Token] -> ShowS #

Generic Token Source # 

Associated Types

type Rep Token :: * -> * #

Methods

from :: Token -> Rep Token x #

to :: Rep Token x -> Token #

NFData Token Source # 

Methods

rnf :: Token -> () #

type Rep Token Source # 

type TagName = Text Source #

A tag name (e.g. body)

type AttrName = Text Source #

An attribute name (e.g. href)

type AttrValue = Text Source #

The value of an attribute

data Attr Source #

An attribute of a tag

Constructors

Attr !AttrName !AttrValue 

Instances

Eq Attr Source # 

Methods

(==) :: Attr -> Attr -> Bool #

(/=) :: Attr -> Attr -> Bool #

Ord Attr Source # 

Methods

compare :: Attr -> Attr -> Ordering #

(<) :: Attr -> Attr -> Bool #

(<=) :: Attr -> Attr -> Bool #

(>) :: Attr -> Attr -> Bool #

(>=) :: Attr -> Attr -> Bool #

max :: Attr -> Attr -> Attr #

min :: Attr -> Attr -> Attr #

Show Attr Source # 

Methods

showsPrec :: Int -> Attr -> ShowS #

show :: Attr -> String #

showList :: [Attr] -> ShowS #

Rendering, text canonicalization

renderToken :: Token -> Text Source #

(Somewhat) canonical string representation of Token.

renderAttr :: Attr -> Text Source #

Does not escape quotation marks in attribute values!

canonicalizeTokens :: [Token] -> [Token] Source #

Meld neighboring ContentChar and ContentText constructors together and drop empty text elements.