{-# LANGUAGE OverloadedStrings #-} -- | An integration of the /tagsoup/ and /hexpat/ packages, allowing HTML to be parsed to a -- hexpat tree, tolerant of errors. -- -- The real work is done by Neil Mitchell's /tagsoup/ package. module Text.XML.Expat.TagSoup ( parseTags, parseTagsOptions, TS.ParseOptions(..), parseOptions, parseOptionsFast ) where import Control.Arrow ((***)) import Data.Maybe (mapMaybe) import Text.XML.Expat.SAX import Text.XML.Expat.Tree import qualified Text.HTML.TagSoup as TS import Text.HTML.TagSoup (Tag(..), ParseOptions(..), parseOptions, parseOptionsFast, canonicalizeTags) import Text.StringLike (StringLike, toString) -- | Parse tags using TagSoup, invoke canonicalizeTags to convert them all to -- lower case, automatically self-close tags like @img@ and @input@, then -- convert to a hexpat tree. parseTags :: (StringLike s, GenericXMLString text) => s -> UNode text parseTags = tagsToTree . TS.parseTags -- | Variant that accepts options. parseTagsOptions :: (StringLike s, GenericXMLString text) => TS.ParseOptions s -> s -> UNode text parseTagsOptions opts = tagsToTree . TS.parseTagsOptions opts tagsToTree :: (StringLike s, GenericXMLString text) => [Tag s] -> UNode text tagsToTree = fst . saxToTree . selfClose . mapMaybe tag2sax . canonicalizeTags where tag2sax (TagOpen str attrs) = Just $ StartElement (toText str) (map (toText *** toText) attrs) tag2sax (TagClose str) = Just $ EndElement (toText str) tag2sax (TagText str) = Just $ CharacterData (toText str) tag2sax _ = Nothing toText = gxFromString . toString -- | To do - this list is not very authoritative. -- Also, more efficient would be nice. isSelfClosing :: String -> Bool isSelfClosing "area" = True isSelfClosing "base" = True isSelfClosing "br" = True isSelfClosing "col" = True isSelfClosing "hr" = True isSelfClosing "img" = True isSelfClosing "input" = True isSelfClosing "link" = True isSelfClosing "meta" = True isSelfClosing "param" = True isSelfClosing _ = False selfClose :: GenericXMLString text => [SAXEvent text text] -> [SAXEvent text text] selfClose (start@(StartElement name1 atts) : end@(EndElement name2) : rem) | name1 == name2 = start : end : selfClose rem selfClose (start@(StartElement name atts) : rem) | isSelfClosing (gxToString name) = start : EndElement name : selfClose rem selfClose (tag : rem) = tag : selfClose rem selfClose [] = []