{-# LANGUAGE OverloadedStrings #-}
-- | An integration of the /tagsoup/ and /hexpat/ packages, allowing HTML to be parsed to a
-- hexpat tree, tolerant of errors.
--
-- The real work is done by Neil Mitchell's /tagsoup/ package.
module Text.XML.Expat.TagSoup (
        parseTags,
        parseTagsOptions,
        TS.ParseOptions(..),
        parseOptions,
        parseOptionsFast
    ) where

import Control.Arrow ((***))
import Data.Maybe (mapMaybe)
import Text.XML.Expat.SAX
import Text.XML.Expat.Tree
import qualified Text.HTML.TagSoup as TS
import Text.HTML.TagSoup (Tag(..), ParseOptions(..),
    parseOptions, parseOptionsFast, canonicalizeTags)
import Text.StringLike (StringLike, toString)


-- | Parse tags using TagSoup, invoke canonicalizeTags to convert them all to
-- lower case, automatically self-close tags like @img@ and @input@, then
-- convert to a hexpat tree.
parseTags :: (StringLike s, GenericXMLString text) => s -> UNode text
parseTags = tagsToTree . TS.parseTags

-- | Variant that accepts options.
parseTagsOptions :: (StringLike s, GenericXMLString text) =>
                    TS.ParseOptions s -> s -> UNode text
parseTagsOptions opts = tagsToTree . TS.parseTagsOptions opts

tagsToTree :: (StringLike s, GenericXMLString text) => [Tag s] -> UNode text
tagsToTree = fst . saxToTree . selfClose . mapMaybe tag2sax . canonicalizeTags
  where
    tag2sax (TagOpen str attrs) = Just $ StartElement (toText str) (map (toText *** toText) attrs)
    tag2sax (TagClose str)      = Just $ EndElement (toText str)
    tag2sax (TagText str)       = Just $ CharacterData (toText str)
    tag2sax _                   = Nothing

    toText = gxFromString . toString

-- | To do - this list is not very authoritative.
-- Also, more efficient would be nice.
isSelfClosing :: String -> Bool
isSelfClosing "area" = True
isSelfClosing "base" = True
isSelfClosing "br" = True
isSelfClosing "col" = True
isSelfClosing "hr" = True
isSelfClosing "img" = True
isSelfClosing "input" = True
isSelfClosing "link" = True
isSelfClosing "meta" = True
isSelfClosing "param" = True
isSelfClosing _ = False

selfClose :: GenericXMLString text => [SAXEvent text text] -> [SAXEvent text text]
selfClose (start@(StartElement name1 atts) : end@(EndElement name2) : rem) | name1 == name2 =
    start : end : selfClose rem
selfClose (start@(StartElement name atts) : rem) | isSelfClosing (gxToString name) =
    start : EndElement name : selfClose rem
selfClose (tag : rem) = tag : selfClose rem
selfClose [] = []