module Text.HTML.WraXML.Tree.HXT
(tidy, format, fromHTMLString, fromHTMLStringMetaEncoding,
errorAnnFromHTMLStringMetaEncoding,
errorAnnFromHTMLStringOpt, ErrorMsg,
getMetaHTTPHeaders, ) where
import qualified Text.XML.HXT.DOM.TypeDefs as HXT
import qualified Data.Tree.NTree.TypeDefs as HXTTree
import qualified Text.XML.WraXML.Tree.HXT as WraHXT
import qualified Text.HTML.WraXML.Tree as HTMLTree
import qualified Text.XML.WraXML.Tree as XmlTree
import qualified Text.XML.Basic.Name.LowerCase as NameLC
import qualified Text.XML.Basic.Name as Name
import qualified Text.XML.Basic.Format as Format
import Text.XML.HXT.DOM.XmlTree hiding (cmt, formatTree, )
import Text.XML.HXT.DOM.XmlState
(XmlStateFilter, liftF, run',
setSystemParams, setSysErrorHandler, getErrorMsg,
errorMsgLogging, errorMsgToStderr, )
import Text.XML.HXT.Parser.XmlParser (parseXmlDocEncodingSpec, )
import Text.XML.HXT.Parser.XmlInput (guessDocEncoding, )
import Text.XML.HXT.Parser.XmlOutput (traceTree, traceSource, traceMsg, )
import Text.XML.HXT.Parser.HtmlParser (parseHtmlDoc, )
import Text.XML.HXT.Parser.HtmlParsec (isEmptyHtmlTag, )
import Text.XML.HXT.DOM.EditFilters (escapeXmlDoc, )
import Text.XML.HXT.DOM.ShowXml (xshow, )
import Text.XML.HXT.DOM.QualifiedName (qualifiedName, )
import Data.Tree.NTree.Filter (liftMf, )
import Control.Monad (msum, )
import Data.Maybe (fromMaybe, )
tidy :: String -> IO String
tidy input =
fmap (($"") . formatTrees . (escapeXmlDoc .> getChildren))
(fromHTMLString input)
format :: HXT.XmlTree -> String
format leaf = formatTrees (HXTTree.getChildren leaf) ""
formatTrees :: HXT.XmlTrees -> ShowS
formatTrees = foldr (.) id . map formatTree
formatTree :: HXT.XmlTree -> ShowS
formatTree leaf =
case leaf of
(NTree (XPi n al) _) ->
showString "<?"
.
formatQName n
.
(foldr (.) id . map showPiAttr) al
.
showString "?>"
where
showPiAttr :: HXT.XmlTree -> String -> String
showPiAttr a@(NTree (XAttr an) cs) =
if aName an == a_value
then Format.blank . formatTrees cs
else formatTree a
showPiAttr _ = id
(NTree (XTag t al) cs) ->
if null cs && isEmptyHtmlTag (HXT.qualifiedName t)
then Format.lt . formatQName t . formatTrees al . Format.gt
else Format.lt . formatQName t . formatTrees al . Format.gt
. formatTrees cs
. Format.lt . Format.slash . formatQName t . Format.gt
(NTree (XAttr an) cs) ->
Format.blank . formatQName an . Format.eq . Format.stringQuoted (formatTrees cs "")
(NTree (XError l e) _) ->
showString "<!-- ERROR (" . showString (show l) . showString "):\n"
. showString e . showString "\n-->"
_ -> (xshow [leaf] ++)
formatQName :: QName -> ShowS
formatQName = showString . qualifiedName
findMetaEncoding :: String -> IO (Maybe String)
findMetaEncoding str =
do htmlTrees <- xmlTreesFromHTMLString str
return (msum (map HTMLTree.findMetaEncoding (htmlTrees :: [XmlTree.T () NameLC.T String])))
getMetaHTTPHeaders :: String -> IO [(String, String)]
getMetaHTTPHeaders str =
do htmlTrees <- xmlTreesFromHTMLString str
return (concatMap HTMLTree.getMetaHTTPHeaders (htmlTrees :: [XmlTree.T () NameLC.T String]))
xmlTreesFromHTMLString ::
(Name.Tag name, Name.Attribute name) =>
String -> IO [XmlTree.T () name String]
xmlTreesFromHTMLString str =
do hxtTree <- fromHTMLString str
return $
map (XmlTree.unescape . WraHXT.toXmlTree)
(filter (WraHXT.checkTagName "html" . HXTTree.getNode)
(HXTTree.getChildren hxtTree))
fromHTMLStringMetaEncoding :: String -> IO HXT.XmlTree
fromHTMLStringMetaEncoding str =
do enc <- findMetaEncoding str
fromHTMLStringOpt [(a_encoding, fromMaybe isoLatin1 enc)] str
fromHTMLString :: String -> IO HXT.XmlTree
fromHTMLString = fromHTMLStringOpt [(a_encoding, isoLatin1)]
fromHTMLStringOpt :: Attributes -> String -> IO HXT.XmlTree
fromHTMLStringOpt options input =
do (tree,_,_) <- errorAnnFromHTMLStringOpt options input
return tree
type ErrorMsg = (Int,String)
errorAnnFromHTMLStringMetaEncoding ::
String -> IO (HXT.XmlTree, [ErrorMsg], Int)
errorAnnFromHTMLStringMetaEncoding str =
do enc <- findMetaEncoding str
errorAnnFromHTMLStringOpt [(a_encoding, fromMaybe isoLatin1 enc)] str
errorAnnFromHTMLStringOpt ::
Attributes -> String -> IO (HXT.XmlTree, [ErrorMsg], Int)
errorAnnFromHTMLStringOpt options contents =
do
let options' =
(a_collect_errors, v_1) :
(a_issue_errors, v_0) :
options
(root:errs) <- run' $ parseDocument options' contents emptyRoot
let elvl = intValueOf a_status root
let errMsgs = map ((\(XError level msg) -> (level, msg)) . getNode) errs
return (root, errMsgs, elvl)
getStringContents :: String -> XmlStateFilter a
getStringContents contents =
return . replaceChildren (xtext contents)
.>>
liftF parseXmlDocEncodingSpec
.>>
guessDocEncoding
parseDocument :: Attributes -> String -> XmlStateFilter state
parseDocument userOptions contents
= processDocument userOptions defaultOptions
( traceMsg 1 "parseDocument: options added, start processing"
.>>
traceTree
.>>
getStringContents contents
.>>
parseHtmlDoc
.>>
traceMsg 1 "parseDocument: document processed"
.>>
traceSource
.>>
traceTree
)
where
defaultOptions
= [ ( a_parse_html, v_1 )
, ( a_validate, v_1 )
, ( a_issue_errors, v_1 )
, ( a_issue_warnings, v_1 )
, ( a_check_namespaces, v_0 )
, ( a_canonicalize, v_1 )
, ( a_preserve_comment, v_0 )
, ( a_remove_whitespace, v_0 )
]
addOptions :: Attributes -> XmlFilter
addOptions
= seqF . map (uncurry addAttr)
addDefaultOptions :: Attributes -> XmlFilter
addDefaultOptions
= seqF . map (\ (n,v) -> addAttr n v `whenNot` hasAttr n)
processDocument :: Attributes -> Attributes -> XmlStateFilter state -> XmlStateFilter state
processDocument userOptions defaultOptions processFilter
= liftMf isRoot
.>>
liftMf (addOptions userOptions
.>
addOptions [(a_status, show c_ok)]
)
.>>
liftMf (addDefaultOptions defaultOptions)
.>>
setSystemParams
.>>
choiceM
[ hasOption a_propagate_errors
:-> thisM
, hasOption a_collect_errors
.>
hasOption a_issue_errors
:-> performAction (\ _ -> setSysErrorHandler (errorMsgLogging.>> errorMsgToStderr) )
, hasOption a_collect_errors
:-> performAction (\ _ -> setSysErrorHandler errorMsgLogging )
, hasOption a_issue_errors
:-> performAction (\ _ -> setSysErrorHandler errorMsgToStderr )
, this
:-> performAction (\ _ -> setSysErrorHandler noneM )
]
.>>
processFilter
.>>
( thisM
+++>>
( hasOption a_collect_errors
`guardsM`
getErrorMsg
)
)