-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Parsing and extracting information from (possibly malformed) HTML/XML documents -- -- TagSoup is a library for parsing HTML/XML. It supports the HTML 5 -- specification, and can be used to parse either well-formed XML, or -- unstructured and malformed HTML from the web. The library also -- provides useful functions to extract information from an HTML -- document, making it ideal for screen-scraping. -- -- Users should start from the Text.HTML.TagSoup module. @package tagsoup @version 0.13 -- | This module converts between HTML/XML entities (i.e. -- &) and the characters they represent. module Text.HTML.TagSoup.Entity -- | Lookup an entity, using lookupNumericEntity if it starts with -- # and lookupNamedEntity otherwise lookupEntity :: String -> Maybe String -- | Lookup a named entity, using htmlEntities -- --
--   lookupNamedEntity "amp" == Just "&"
--   lookupNamedEntity "haskell" == Nothing
--   
lookupNamedEntity :: String -> Maybe String -- | Lookup a numeric entity, the leading '#' must have already -- been removed. -- --
--   lookupNumericEntity "65" == Just "A"
--   lookupNumericEntity "x41" == Just "A"
--   lookupNumericEntity "x4E" == Just "N"
--   lookupNumericEntity "x4e" == Just "N"
--   lookupNumericEntity "Haskell" == Nothing
--   lookupNumericEntity "" == Nothing
--   lookupNumericEntity "89439085908539082" == Nothing
--   
lookupNumericEntity :: String -> Maybe String -- | Escape an XML string. -- --
--   escapeXML "hello world" == "hello world"
--   escapeXML "hello & world" == "hello &amp; world"
--   
escapeXML :: String -> String -- | A table mapping XML entity names to resolved strings. All strings are -- a single character long. Does not include apos as -- Internet Explorer does not know about it. xmlEntities :: [(String, String)] -- | A table mapping HTML entity names to resolved strings. Most resolved -- strings are a single character long, but some (e.g. -- ngeqq) are two characters long. The list is taken from -- http://www.w3.org/TR/html5/syntax.html#named-character-references. htmlEntities :: [(String, String)] -- | WARNING: This module is not intended for use outside the -- TagSoup library. -- -- This module provides an abstraction for String's as used inside -- TagSoup. It allows TagSoup to work with String (list of Char), -- ByteString.Char8, ByteString.Lazy.Char8, Data.Text and Data.Text.Lazy. module Text.StringLike -- | A class to generalise TagSoup parsing over many types of string-like -- types. Examples are given for the String type. class (Typeable a, Eq a) => StringLike a empty :: StringLike a => a cons :: StringLike a => Char -> a -> a uncons :: StringLike a => a -> Maybe (Char, a) toString :: StringLike a => a -> String fromString :: StringLike a => String -> a fromChar :: StringLike a => Char -> a strConcat :: StringLike a => [a] -> a strNull :: StringLike a => a -> Bool append :: StringLike a => a -> a -> a -- | Convert a String from one type to another. castString :: (StringLike a, StringLike b) => a -> b instance StringLike Text instance StringLike Text instance StringLike ByteString instance StringLike ByteString instance StringLike String -- | Combinators to match tags. Some people prefer to use (~==) -- from Text.HTML.TagSoup, others prefer these more structured -- combinators. Which you use is personal preference. 
module Text.HTML.TagSoup.Match -- | match an opening tag tagOpen :: (str -> Bool) -> ([Attribute str] -> Bool) -> Tag str -> Bool -- | match a closing tag tagClose :: (str -> Bool) -> Tag str -> Bool -- | match a text tagText :: (str -> Bool) -> Tag str -> Bool tagComment :: (str -> Bool) -> Tag str -> Bool -- | match an opening tag's name literally tagOpenLit :: Eq str => str -> ([Attribute str] -> Bool) -> Tag str -> Bool -- | match a closing tag's name literally tagCloseLit :: Eq str => str -> Tag str -> Bool tagOpenAttrLit :: Eq str => str -> Attribute str -> Tag str -> Bool -- | Match a tag with given name, that contains an attribute with given -- name, that satisfies a predicate. If an attribute occurs multiple -- times, all occurrences are checked. tagOpenAttrNameLit :: Eq str => str -> str -> (str -> Bool) -> Tag str -> Bool -- | Check if the 'Tag str' is TagOpen and matches the given name tagOpenNameLit :: Eq str => str -> Tag str -> Bool -- | Check if the 'Tag str' is TagClose and matches the given name tagCloseNameLit :: Eq str => str -> Tag str -> Bool anyAttr :: ((str, str) -> Bool) -> [Attribute str] -> Bool anyAttrName :: (str -> Bool) -> [Attribute str] -> Bool anyAttrValue :: (str -> Bool) -> [Attribute str] -> Bool anyAttrLit :: Eq str => (str, str) -> [Attribute str] -> Bool anyAttrNameLit :: Eq str => str -> [Attribute str] -> Bool anyAttrValueLit :: Eq str => str -> [Attribute str] -> Bool getTagContent :: Eq str => str -> ([Attribute str] -> Bool) -> [Tag str] -> [Tag str] -- | NOTE: This module is preliminary and may change at a future -- date. -- -- This module is intended to help converting a list of tags into a tree -- of tags. module Text.HTML.TagSoup.Tree data TagTree str TagBranch :: str -> [Attribute str] -> [TagTree str] -> TagTree str TagLeaf :: (Tag str) -> TagTree str -- | Convert a list of tags into a tree. This version is not lazy at all, -- that is saved for version 2. 
tagTree :: Eq str => [Tag str] -> [TagTree str] flattenTree :: [TagTree str] -> [Tag str] -- | This operation is based on the Uniplate transform function. -- Given a list of trees, it applies the function to every tree in a -- bottom-up manner. This operation is useful for manipulating a tree - -- for example to make all tag names upper case: -- --
--   upperCase = transformTree f
--     where f (TagBranch name atts inner) = [TagBranch (map toUpper name) atts inner]
--           f x = [x]
--   
transformTree :: (TagTree str -> [TagTree str]) -> [TagTree str] -> [TagTree str] -- | This operation is based on the Uniplate universe function. -- Given a list of trees, it returns those trees, and all the children -- trees at any level. For example: -- --
--   universeTree
--      [TagBranch "a" [("href","url")] [TagBranch "b" [] [TagLeaf (TagText "text")]]]
--   == [TagBranch "a" [("href","url")] [TagBranch "b" [] [TagLeaf (TagText "text")]]
--      ,TagBranch "b" [] [TagLeaf (TagText "text")]
--      ,TagLeaf (TagText "text")]
--   
-- -- This operation is particularly useful for queries. To collect all -- "a" tags in a tree, simply do: -- --
--   [x | x@(TagBranch "a" _ _) <- universeTree tree]
--   
universeTree :: [TagTree str] -> [TagTree str] instance Eq str => Eq (TagTree str) instance Ord str => Ord (TagTree str) instance Show str => Show (TagTree str) instance Functor TagTree -- | This module is for working with HTML/XML. It deals with both -- well-formed XML and malformed HTML from the web. It features: -- -- -- -- The standard practice is to parse a String to -- [Tag String] using parseTags, -- then operate upon it to extract the necessary information. module Text.HTML.TagSoup -- | A single HTML element. A whole document is represented by a list of -- Tag. There is no requirement for TagOpen and -- TagClose to match. data Tag str -- | An open tag with Attributes in their original order TagOpen :: str -> [Attribute str] -> Tag str -- | A closing tag TagClose :: str -> Tag str -- | A text node, guaranteed not to be the empty string TagText :: str -> Tag str -- | A comment TagComment :: str -> Tag str -- | Meta: A syntax error in the input file TagWarning :: str -> Tag str -- | Meta: The position of a parsed element TagPosition :: !Row -> !Column -> Tag str -- | The row/line of a position, starting at 1 type Row = Int -- | The column of a position, starting at 1 type Column = Int -- | An HTML attribute id="name" generates ("id","name") type Attribute str = (str, str) -- | Parse a string to a list of tags, using an HTML 5 compliant parser. -- --
--   parseTags "<hello>my&amp;</world>" == [TagOpen "hello" [],TagText "my&",TagClose "world"]
--   
parseTags :: StringLike str => str -> [Tag str] -- | Parse a string to a list of tags, using settings supplied by the -- ParseOptions parameter, eg. to output position information: -- --
--   parseTagsOptions parseOptions{optTagPosition = True} "<hello>my&amp;</world>" ==
--      [TagPosition 1 1,TagOpen "hello" [],TagPosition 1 8,TagText "my&",TagPosition 1 15,TagClose "world"]
--   
parseTagsOptions :: StringLike str => ParseOptions str -> str -> [Tag str] -- | These options control how parseTags works. The -- ParseOptions type is usually generated by one of -- parseOptions, parseOptionsFast or -- parseOptionsEntities, then selected fields may be overridden. -- -- The options optTagPosition and optTagWarning specify -- whether to generate TagPosition or TagWarning elements -- respectively. Usually these options should be set to False to -- simplify future stages, unless you rely on position information or -- want to give malformed HTML messages to the end user. -- -- The options optEntityData and optEntityAttrib control -- how entities, for example &nbsp; are handled. Both take a -- string, and a boolean, where True indicates that the entity -- ended with a semi-colon ;. Inside normal text -- optEntityData will be called, and the results will be inserted -- in the tag stream. Inside a tag attribute optEntityAttrib will -- be called, and the first component of the result will be used in the -- attribute, and the second component will be appended after the -- TagOpen value (usually the second component is []). As -- an example, to not decode any entities, pass: -- --
--   parseOptions
--       {optEntityData=\(str,b) -> [TagText $ "&" ++ str ++ [';' | b]]
--       ,optEntityAttrib=\(str,b) -> ("&" ++ str ++ [';' | b], [])}
--   
data ParseOptions str ParseOptions :: Bool -> Bool -> ((str, Bool) -> [Tag str]) -> ((str, Bool) -> (str, [Tag str])) -> Bool -> ParseOptions str -- | Should TagPosition values be given before some items -- (default=False,fast=False). optTagPosition :: ParseOptions str -> Bool -- | Should TagWarning values be given (default=False,fast=False) optTagWarning :: ParseOptions str -> Bool -- | How to lookup an entity (Bool = has ending ';') optEntityData :: ParseOptions str -> (str, Bool) -> [Tag str] -- | How to lookup an entity in an attribute (Bool = has ending -- ';'?) optEntityAttrib :: ParseOptions str -> (str, Bool) -> (str, [Tag str]) -- | Require no adjacent TagText values (default=True,fast=False) optTagTextMerge :: ParseOptions str -> Bool -- | The default parse options value, described in ParseOptions. -- Equivalent to parseOptionsEntities -- lookupEntity. parseOptions :: StringLike str => ParseOptions str -- | A ParseOptions structure optimised for speed, following the -- fast options. parseOptionsFast :: StringLike str => ParseOptions str -- | A ParseOptions structure using a custom function to look up -- entities. Any entity that is not found will be left intact, and a -- TagWarning given (if optTagWarning is set). -- -- If you do not want to resolve any entities, simply pass const -- Nothing for the lookup function. parseOptionsEntities :: StringLike str => (str -> Maybe str) -> ParseOptions str -- | Show a list of tags, as they might have been parsed, using the default -- settings given in RenderOptions. -- --
--   renderTags [TagOpen "hello" [],TagText "my&",TagClose "world"] == "<hello>my&amp;</world>"
--   
renderTags :: StringLike str => [Tag str] -> str -- | Show a list of tags using settings supplied by the -- RenderOptions parameter, eg. to avoid escaping any characters -- one could do: -- --
--   renderTagsOptions renderOptions{optEscape = id} [TagText "my&"] == "my&"
--   
renderTagsOptions :: StringLike str => RenderOptions str -> [Tag str] -> str -- | Replace the four characters &"<> with their HTML -- entities (escapeXML lifted to StringLike). escapeHTML :: StringLike str => str -> str -- | These options control how renderTags works. -- -- The strange quirk of only minimizing <br> tags is due -- to Internet Explorer treating <br></br> as -- <br><br>. data RenderOptions str RenderOptions :: (str -> str) -> (str -> Bool) -> (str -> Bool) -> RenderOptions str -- | Escape a piece of text (default = escape the four characters -- &"<>) optEscape :: RenderOptions str -> str -> str -- | Minimise <b></b> -> <b/> (default = minimise only -- <br> tags) optMinimize :: RenderOptions str -> str -> Bool -- | Should a tag be output with no escaping (default = true only for -- script) optRawTag :: RenderOptions str -> str -> Bool -- | The default render options value, described in RenderOptions. renderOptions :: StringLike str => RenderOptions str -- | Turns all tag names and attributes to lower case and converts DOCTYPE -- to upper case. canonicalizeTags :: StringLike str => [Tag str] -> [Tag str] -- | Test if a Tag is a TagOpen isTagOpen :: Tag str -> Bool -- | Test if a Tag is a TagClose isTagClose :: Tag str -> Bool -- | Test if a Tag is a TagText isTagText :: Tag str -> Bool -- | Test if a Tag is a TagWarning isTagWarning :: Tag str -> Bool -- | Test if a Tag is a TagPosition isTagPosition :: Tag str -> Bool -- | Returns True if the Tag is TagOpen and matches the given -- name isTagOpenName :: Eq str => str -> Tag str -> Bool -- | Returns True if the Tag is TagClose and matches the -- given name isTagCloseName :: Eq str => str -> Tag str -> Bool -- | Extract the string from within TagText, crashes if not a -- TagText fromTagText :: Show str => Tag str -> str -- | Extract an attribute, crashes if not a TagOpen. Returns -- "" if no attribute present. 
fromAttrib :: (Show str, Eq str, StringLike str) => str -> Tag str -> str -- | Extract the string from within TagText, otherwise -- Nothing maybeTagText :: Tag str -> Maybe str -- | Extract the string from within TagWarning, otherwise -- Nothing maybeTagWarning :: Tag str -> Maybe str -- | Extract all text content from tags (similar to Verbatim found in -- HaXml) innerText :: StringLike str => [Tag str] -> str -- | This function takes a list, and returns all suffixes whose first item -- matches the predicate. sections :: (a -> Bool) -> [a] -> [[a]] -- | This function is similar to sections, but splits the list so no -- element appears in any two partitions. partitions :: (a -> Bool) -> [a] -> [[a]] -- | Define a class to allow String's or Tag str's to be used as matches class TagRep a toTagRep :: (TagRep a, StringLike str) => a -> Tag str -- | Performs an inexact match, the first item should be the thing to -- match. If the second item is a blank string, that is considered to -- match anything. For example: -- --
--   (TagText "test" ~== TagText ""    ) == True
--   (TagText "test" ~== TagText "test") == True
--   (TagText "test" ~== TagText "soup") == False
--   
-- -- For TagOpen missing attributes on the right are allowed. (~==) :: (StringLike str, TagRep t) => Tag str -> t -> Bool -- | Negation of ~== (~/=) :: (StringLike str, TagRep t) => Tag str -> t -> Bool instance TagRep String instance StringLike str => TagRep (Tag str)