-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Parsing and extracting information from (possibly malformed) HTML/XML documents -- -- TagSoup is a library for parsing HTML/XML. It supports the HTML 5 -- specification, and can be used to parse either well-formed XML, or -- unstructured and malformed HTML from the web. The library also -- provides useful functions to extract information from an HTML -- document, making it ideal for screen-scraping. -- -- Users should start from the Text.HTML.TagSoup module. @package tagsoup @version 0.13 -- | This module converts between HTML/XML entities (i.e. -- &) and the characters they represent. module Text.HTML.TagSoup.Entity -- | Lookup an entity, using lookupNumericEntity if it starts with -- # and lookupNamedEntity otherwise lookupEntity :: String -> Maybe String -- | Lookup a named entity, using htmlEntities -- --
-- lookupNamedEntity "amp" == Just "&" -- lookupNamedEntity "haskell" == Nothing --lookupNamedEntity :: String -> Maybe String -- | Lookup a numeric entity, the leading '#' must have already -- been removed. -- --
-- lookupNumericEntity "65" == Just "A" -- lookupNumericEntity "x41" == Just "A" -- lookupNumericEntity "x4E" == Just "N" -- lookupNumericEntity "x4e" == Just "N" -- lookupNumericEntity "Haskell" == Nothing -- lookupNumericEntity "" == Nothing -- lookupNumericEntity "89439085908539082" == Nothing --lookupNumericEntity :: String -> Maybe String -- | Escape an XML string. -- --
-- escapeXML "hello world" == "hello world" -- escapeXML "hello & world" == "hello &amp; world" --escapeXML :: String -> String -- | A table mapping XML entity names to resolved strings. All strings are -- a single character long. Does not include apos as -- Internet Explorer does not know about it. xmlEntities :: [(String, String)] -- | A table mapping HTML entity names to resolved strings. Most resolved -- strings are a single character long, but some (e.g. -- ngeqq) are two characters long. The list is taken from -- http://www.w3.org/TR/html5/syntax.html#named-character-references. htmlEntities :: [(String, String)] -- | WARNING: This module is not intended for use outside the -- TagSoup library. -- -- This module provides an abstraction for String's as used inside -- TagSoup. It allows TagSoup to work with String (list of Char), -- ByteString.Char8, ByteString.Lazy.Char8, Data.Text and Data.Text.Lazy. module Text.StringLike -- | A class to generalise TagSoup parsing over many types of string-like -- types. Examples are given for the String type. class (Typeable a, Eq a) => StringLike a empty :: StringLike a => a cons :: StringLike a => Char -> a -> a uncons :: StringLike a => a -> Maybe (Char, a) toString :: StringLike a => a -> String fromString :: StringLike a => String -> a fromChar :: StringLike a => Char -> a strConcat :: StringLike a => [a] -> a strNull :: StringLike a => a -> Bool append :: StringLike a => a -> a -> a -- | Convert a String from one type to another. castString :: (StringLike a, StringLike b) => a -> b instance StringLike Text instance StringLike Text instance StringLike ByteString instance StringLike ByteString instance StringLike String -- | Combinators to match tags. Some people prefer to use (~==) -- from Text.HTML.TagSoup, others prefer these more structured -- combinators. Which you use is personal preference. 
module Text.HTML.TagSoup.Match -- | match an opening tag tagOpen :: (str -> Bool) -> ([Attribute str] -> Bool) -> Tag str -> Bool -- | match an closing tag tagClose :: (str -> Bool) -> Tag str -> Bool -- | match a text tagText :: (str -> Bool) -> Tag str -> Bool tagComment :: (str -> Bool) -> Tag str -> Bool -- | match a opening tag's name literally tagOpenLit :: Eq str => str -> ([Attribute str] -> Bool) -> Tag str -> Bool -- | match a closing tag's name literally tagCloseLit :: Eq str => str -> Tag str -> Bool tagOpenAttrLit :: Eq str => str -> Attribute str -> Tag str -> Bool -- | Match a tag with given name, that contains an attribute with given -- name, that satisfies a predicate. If an attribute occurs multiple -- times, all occurrences are checked. tagOpenAttrNameLit :: Eq str => str -> str -> (str -> Bool) -> Tag str -> Bool -- | Check if the 'Tag str' is TagOpen and matches the given name tagOpenNameLit :: Eq str => str -> Tag str -> Bool -- | Check if the 'Tag str' is TagClose and matches the given name tagCloseNameLit :: Eq str => str -> Tag str -> Bool anyAttr :: ((str, str) -> Bool) -> [Attribute str] -> Bool anyAttrName :: (str -> Bool) -> [Attribute str] -> Bool anyAttrValue :: (str -> Bool) -> [Attribute str] -> Bool anyAttrLit :: Eq str => (str, str) -> [Attribute str] -> Bool anyAttrNameLit :: Eq str => str -> [Attribute str] -> Bool anyAttrValueLit :: Eq str => str -> [Attribute str] -> Bool getTagContent :: Eq str => str -> ([Attribute str] -> Bool) -> [Tag str] -> [Tag str] -- | NOTE: This module is preliminary and may change at a future -- date. -- -- This module is intended to help converting a list of tags into a tree -- of tags. module Text.HTML.TagSoup.Tree data TagTree str TagBranch :: str -> [Attribute str] -> [TagTree str] -> TagTree str TagLeaf :: (Tag str) -> TagTree str -- | Convert a list of tags into a tree. This version is not lazy at all, -- that is saved for version 2. 
tagTree :: Eq str => [Tag str] -> [TagTree str] flattenTree :: [TagTree str] -> [Tag str] -- | This operation is based on the Uniplate transform function. -- Given a list of trees, it applies the function to every tree in a -- bottom-up manner. This operation is useful for manipulating a tree - -- for example to make all tag names upper case: -- --
-- upperCase = transformTree f -- where f (TagBranch name atts inner) = [TagBranch (map toUpper name) atts inner] -- f x = [x] --transformTree :: (TagTree str -> [TagTree str]) -> [TagTree str] -> [TagTree str] -- | This operation is based on the Uniplate universe function. -- Given a list of trees, it returns those trees, and all the children -- trees at any level. For example: -- --
-- universeTree
-- [TagBranch "a" [("href","url")] [TagBranch "b" [] [TagLeaf (TagText "text")]]]
-- == [TagBranch "a" [("href","url")] [TagBranch "b" [] [TagLeaf (TagText "text")]]
-- ,TagBranch "b" [] [TagLeaf (TagText "text")]]
--
--
-- This operation is particularly useful for queries. To collect all
-- "a" tags in a tree, simply do:
--
-- -- [x | x@(TagBranch "a" _ _) <- universeTree tree] --universeTree :: [TagTree str] -> [TagTree str] instance Eq str => Eq (TagTree str) instance Ord str => Ord (TagTree str) instance Show str => Show (TagTree str) instance Functor TagTree -- | This module is for working with HTML/XML. It deals with both -- well-formed XML and malformed HTML from the web. It features: -- --
-- parseTags "<hello>my&amp;</world>" == [TagOpen "hello" [],TagText "my&",TagClose "world"] --parseTags :: StringLike str => str -> [Tag str] -- | Parse a string to a list of tags, using settings supplied by the -- ParseOptions parameter, eg. to output position information: -- --
-- parseTagsOptions parseOptions{optTagPosition = True} "<hello>my&amp;</world>" ==
-- [TagPosition 1 1,TagOpen "hello" [],TagPosition 1 8,TagText "my&",TagPosition 1 15,TagClose "world"]
--
parseTagsOptions :: StringLike str => ParseOptions str -> str -> [Tag str]
-- | These options control how parseTags works. The
-- ParseOptions type is usually generated by one of
-- parseOptions, parseOptionsFast or
-- parseOptionsEntities, then selected fields may be overridden.
--
-- The options optTagPosition and optTagWarning specify
-- whether to generate TagPosition or TagWarning elements
-- respectively. Usually these options should be set to False to
-- simplify future stages, unless you rely on position information or
-- want to give malformed HTML messages to the end user.
--
-- The options optEntityData and optEntityAttrib control
-- how entities, for example &nbsp;, are handled. Both take a
-- string, and a boolean, where True indicates that the entity
-- ended with a semi-colon ;. Inside normal text
-- optEntityData will be called, and the results will be inserted
-- in the tag stream. Inside a tag attribute optEntityAttrib will
-- be called, and the first component of the result will be used in the
-- attribute, and the second component will be appended after the
-- TagOpen value (usually the second component is []). As
-- an example, to not decode any entities, pass:
--
--
-- parseOptions
-- {optEntityData=\(str,b) -> [TagText $ "&" ++ str ++ [';' | b]]
-- ,optEntityAttrib=\(str,b) -> ("&" ++ str ++ [';' | b], [])}
--
data ParseOptions str
ParseOptions :: Bool -> Bool -> ((str, Bool) -> [Tag str]) -> ((str, Bool) -> (str, [Tag str])) -> Bool -> ParseOptions str
-- | Should TagPosition values be given before some items
-- (default=False,fast=False).
optTagPosition :: ParseOptions str -> Bool
-- | Should TagWarning values be given (default=False,fast=False)
optTagWarning :: ParseOptions str -> Bool
-- | How to lookup an entity (Bool = has ending ';')
optEntityData :: ParseOptions str -> (str, Bool) -> [Tag str]
-- | How to lookup an entity in an attribute (Bool = has ending
-- ';'?)
optEntityAttrib :: ParseOptions str -> (str, Bool) -> (str, [Tag str])
-- | Require no adjacent TagText values (default=True,fast=False)
optTagTextMerge :: ParseOptions str -> Bool
-- | The default parse options value, described in ParseOptions.
-- Equivalent to parseOptionsEntities
-- lookupEntity.
parseOptions :: StringLike str => ParseOptions str
-- | A ParseOptions structure optimised for speed, following the
-- fast options.
parseOptionsFast :: StringLike str => ParseOptions str
-- | A ParseOptions structure using a custom function to look up
-- entities. Any entity that is not found will be left intact, and a
-- TagWarning given (if optTagWarning is set).
--
-- If you do not want to resolve any entities, simply pass const
-- Nothing for the lookup function.
parseOptionsEntities :: StringLike str => (str -> Maybe str) -> ParseOptions str
-- | Show a list of tags, as they might have been parsed, using the default
-- settings given in RenderOptions.
--
-- -- renderTags [TagOpen "hello" [],TagText "my&",TagClose "world"] == "<hello>my&amp;</world>" --renderTags :: StringLike str => [Tag str] -> str -- | Show a list of tags using settings supplied by the -- RenderOptions parameter, eg. to avoid escaping any characters -- one could do: -- --
-- renderTagsOptions renderOptions{optEscape = id} [TagText "my&"] == "my&"
--
renderTagsOptions :: StringLike str => RenderOptions str -> [Tag str] -> str
-- | Replace the four characters &"<> with their HTML
-- entities (escapeXML lifted to StringLike).
escapeHTML :: StringLike str => str -> str
-- | These options control how renderTags works.
--
-- The strange quirk of only minimizing <br> tags is due
-- to Internet Explorer treating <br></br> as
-- <br><br>.
data RenderOptions str
RenderOptions :: (str -> str) -> (str -> Bool) -> (str -> Bool) -> RenderOptions str
-- | Escape a piece of text (default = escape the four characters
-- &"<>)
optEscape :: RenderOptions str -> str -> str
-- | Minimise <b></b> -> <b/> (default = minimise only
-- <br> tags)
optMinimize :: RenderOptions str -> str -> Bool
-- | Should a tag be output with no escaping (default = true only for
-- script)
optRawTag :: RenderOptions str -> str -> Bool
-- | The default render options value, described in RenderOptions.
renderOptions :: StringLike str => RenderOptions str
-- | Turns all tag names and attributes to lower case and converts DOCTYPE
-- to upper case.
canonicalizeTags :: StringLike str => [Tag str] -> [Tag str]
-- | Test if a Tag is a TagOpen
isTagOpen :: Tag str -> Bool
-- | Test if a Tag is a TagClose
isTagClose :: Tag str -> Bool
-- | Test if a Tag is a TagText
isTagText :: Tag str -> Bool
-- | Test if a Tag is a TagWarning
isTagWarning :: Tag str -> Bool
-- | Test if a Tag is a TagPosition
isTagPosition :: Tag str -> Bool
-- | Returns True if the Tag is TagOpen and matches the given
-- name
isTagOpenName :: Eq str => str -> Tag str -> Bool
-- | Returns True if the Tag is TagClose and matches the
-- given name
isTagCloseName :: Eq str => str -> Tag str -> Bool
-- | Extract the string from within TagText, crashes if not a
-- TagText
fromTagText :: Show str => Tag str -> str
-- | Extract an attribute, crashes if not a TagOpen. Returns
-- "" if no attribute present.
fromAttrib :: (Show str, Eq str, StringLike str) => str -> Tag str -> str
-- | Extract the string from within TagText, otherwise
-- Nothing
maybeTagText :: Tag str -> Maybe str
-- | Extract the string from within TagWarning, otherwise
-- Nothing
maybeTagWarning :: Tag str -> Maybe str
-- | Extract all text content from tags (similar to Verbatim found in
-- HaXml)
innerText :: StringLike str => [Tag str] -> str
-- | This function takes a list, and returns all suffixes whose first item
-- matches the predicate.
sections :: (a -> Bool) -> [a] -> [[a]]
-- | This function is similar to sections, but splits the list so no
-- element appears in any two partitions.
partitions :: (a -> Bool) -> [a] -> [[a]]
-- | Define a class to allow String's or Tag str's to be used as matches
class TagRep a
toTagRep :: (TagRep a, StringLike str) => a -> Tag str
-- | Performs an inexact match, the first item should be the thing to
-- match. If the second item is a blank string, that is considered to
-- match anything. For example:
--
-- -- (TagText "test" ~== TagText "" ) == True -- (TagText "test" ~== TagText "test") == True -- (TagText "test" ~== TagText "soup") == False ---- -- For TagOpen missing attributes on the right are allowed. (~==) :: (StringLike str, TagRep t) => Tag str -> t -> Bool -- | Negation of ~== (~/=) :: (StringLike str, TagRep t) => Tag str -> t -> Bool instance TagRep String instance StringLike str => TagRep (Tag str)