module Text.XML.HXT.Parser.HtmlParser
( getHtmlDoc
, parseHtmlDoc
, runHtmlParser
, substHtmlEntities
, module Text.XML.HXT.Parser.HtmlParsec
)
where
import Text.XML.HXT.DOM.XmlTree
import Text.XML.HXT.DOM.XmlState
import Text.XML.HXT.Parser.HtmlParsec
import Text.XML.HXT.Parser.XmlInput
( getXmlContents
)
import Text.XML.HXT.Parser.XmlOutput
( traceTree
, traceSource
, traceMsg
)
import Text.XML.HXT.Parser.XhtmlEntities
( xhtmlEntities
)
getHtmlDoc :: XmlStateFilter state
getHtmlDoc
= setSystemParams
.>>
getXmlContents
.>>
parseHtmlDoc
parseHtmlDoc :: XmlStateFilter a
parseHtmlDoc
= parseDoc
`whenM` ( isRoot .> getChildren .> isXText )
where
parseDoc t'
= ( traceMsg 2 ("parseHtmlDoc: parse HTML document " ++ show loc)
.>>
runHtmlParser
.>>
liftMf (processTopDown substHtmlEntities)
.>>
removeWarnings
.>>
traceTree
.>>
traceSource
) $ t'
where
loc = valueOf a_source t'
removeWarnings :: XmlStateFilter a
removeWarnings t'
= let
noWarnings = not (satisfies (hasOption a_issue_warnings) t')
selWarnings = deep
( choice [ isWarning :-> this
, isXTag :-> (getAttrl .> selWarnings)
]
)
remWarnings = processTopDown
( choice [ isWarning :-> none
, isXTag :-> (processAttrl (remWarnings $$))
, this :-> this
]
)
warnings = selWarnings t'
in do
if null warnings
then thisM t'
else do
if noWarnings
then return []
else do
issueError $$< warnings
return (remWarnings t')
runHtmlParser :: XmlStateFilter a
runHtmlParser t
= if null errs
then do
return (replaceChildren res t)
else do
issueError $$< errs
return (setStatus c_err "parsing HTML" t)
where
res = getChildren .> parseHtmlText loc $ t
errs = isXError .> neg isWarning $$ res
loc = valueOf a_source t
substHtmlEntities :: XmlTree -> XmlTrees
substHtmlEntities
= choice [ isXEntityRef :-> substEntity
, isXTag :-> processAttr (processChildren substHtmlEntities)
, this :-> this
]
where
substEntity t'@(NTree (XEntityRef en) _)
= case (lookup en xhtmlEntities) of
Just i
-> [mkXCharRefTree i]
Nothing
-> xwarn ("no XHTML entity found for reference: \"&" ++ en ++ ";\"")
++
(xmlTreesToText [t'])
substEntity _
= error "substHtmlEntities: illegal argument"