-- |
-- HTML Parser
--
-- Version : $Id: HtmlParser.hs,v 1.4 2006/11/12 14:53:00 hxml Exp $
--
-- This parser tries to interprete everything as HTML
-- no errors are emitted during parsing. If something looks
-- weired, warning messages are inserted in the document tree
--
-- module contains state filter for easy parsing and error handling
-- real work is done in 'Text.XML.HXT.Parser.HtmlParsec'
module Text.XML.HXT.Parser.HtmlParser
( getHtmlDoc
, parseHtmlDoc
, runHtmlParser
, module Text.XML.HXT.Parser.HtmlParsec
)
where
import Text.XML.HXT.DOM.XmlTree
import Text.XML.HXT.DOM.XmlState
import Text.XML.HXT.Parser.HtmlParsec
import Text.XML.HXT.Parser.XmlInput
( getXmlContents
)
import Text.XML.HXT.Parser.XmlOutput
( traceTree
, traceSource
, traceMsg
)
-- ------------------------------------------------------------
-- |
-- read a document and parse it with 'parseHtmlDoc'. The main entry point of this module
--
-- The input tree must be a root tree like in ' Text.XML.HXT.Parser.MainFunctions.getXmlDoc'. The content is read with 'Text.XML.HXT.Parser.XmlInput.getXmlContents',
-- is parsed with 'parseHtmlDoc' and canonicalized (char refs are substituted in content and attributes,
-- but comment is preserved)
--
-- see also : 'Text.XML.HXT.Parser.DTDProcessing.getWellformedDoc'
getHtmlDoc :: XmlStateFilter state
getHtmlDoc
= setSystemParams
.>>
getXmlContents
.>>
parseHtmlDoc
-- | The HTML parsing filter
--
-- The input is parsed with 'runHtmlParser', everything is interpreted as HTML,
-- if errors ocuur, the parser will try to do some meaningfull action and continues
-- parsing. Afterwards the entitiy references for defined for XHTML are resovled,
-- any unresolved reference is transformed into plain text.
--
-- Error messages
-- during parsing and entity resolving are added as warning nodes into the resulting tree.
--
-- The warnings are issued, if the 1. parameter noWarnings is set to True,
-- afterwards all are removed from the resulting tree.
parseHtmlDoc :: XmlStateFilter a
parseHtmlDoc
= parseDoc
`whenM` ( isRoot .> getChildren .> isXText )
where
parseDoc t'
= ( traceMsg 2 ("parseHtmlDoc: parse HTML document " ++ show loc)
.>>
runHtmlParser
.>>
liftMf (processTopDown substHtmlEntities)
.>>
removeWarnings
.>>
traceTree
.>>
traceSource
) $ t'
where
loc = valueOf a_source t' -- get document source uri
removeWarnings :: XmlStateFilter a
removeWarnings t'
= let
noWarnings = not (satisfies (hasOption a_issue_warnings) t')
selWarnings = deep
( choice [ isWarning :-> this
, isXTag :-> (getAttrl .> selWarnings)
]
)
remWarnings = processTopDown
( choice [ isWarning :-> none
, isXTag :-> (processAttrl (remWarnings $$))
, this :-> this
]
)
warnings = selWarnings t'
in do
if null warnings
then thisM t'
else do
if noWarnings
then return []
else do
issueError $$< warnings
return (remWarnings t')
-- | The pure HTML parser, usually called via 'parseHtmlDoc'.
--
runHtmlParser :: XmlStateFilter a
runHtmlParser t
= if null errs
then do
return (replaceChildren res t)
else do
issueError $$< errs
return (setStatus c_err "parsing HTML" t)
where
res = getChildren .> parseHtmlText loc $ t
errs = isXError .> neg isWarning $$ res
loc = valueOf a_source t
-- ------------------------------------------------------------