module Text.XML.Expat.TagSoup (
parseTags,
parseTagsOptions,
TS.ParseOptions(..),
parseOptions,
parseOptionsFast
) where
import Control.Arrow ((***))
import Data.Maybe (mapMaybe)
import Text.XML.Expat.SAX
import Text.XML.Expat.Tree
import qualified Text.HTML.TagSoup as TS
import Text.HTML.TagSoup (Tag(..), ParseOptions(..),
parseOptions, parseOptionsFast, canonicalizeTags)
import Text.StringLike (StringLike, toString)
parseTags :: (StringLike s, GenericXMLString text) => s -> UNode text
parseTags = tagsToTree . TS.parseTags
parseTagsOptions :: (StringLike s, GenericXMLString text) =>
TS.ParseOptions s -> s -> UNode text
parseTagsOptions opts = tagsToTree . TS.parseTagsOptions opts
tagsToTree :: (StringLike s, GenericXMLString text) => [Tag s] -> UNode text
tagsToTree = fst . saxToTree . selfClose . mapMaybe tag2sax . canonicalizeTags
where
tag2sax (TagOpen str attrs) = Just $ StartElement (toText str) (map (toText *** toText) attrs)
tag2sax (TagClose str) = Just $ EndElement (toText str)
tag2sax (TagText str) = Just $ CharacterData (toText str)
tag2sax _ = Nothing
toText = gxFromString . toString
isSelfClosing :: String -> Bool
isSelfClosing "area" = True
isSelfClosing "base" = True
isSelfClosing "br" = True
isSelfClosing "col" = True
isSelfClosing "hr" = True
isSelfClosing "img" = True
isSelfClosing "input" = True
isSelfClosing "link" = True
isSelfClosing "meta" = True
isSelfClosing "param" = True
isSelfClosing _ = False
selfClose :: GenericXMLString text => [SAXEvent text text] -> [SAXEvent text text]
selfClose (start@(StartElement name1 atts) : end@(EndElement name2) : rem) | name1 == name2 =
start : end : selfClose rem
selfClose (start@(StartElement name atts) : rem) | isSelfClosing (gxToString name) =
start : EndElement name : selfClose rem
selfClose (tag : rem) = tag : selfClose rem
selfClose [] = []