Haskell Code by HsColour

-- |
-- HTML Parser
--
-- Version : $Id: HtmlParser.hs,v 1.4 2006/11/12 14:53:00 hxml Exp $
--
-- This parser tries to interprete everything as HTML
-- no errors are emitted during parsing. If something looks
-- weired, warning messages are inserted in the document tree
--
-- module contains state filter for easy parsing and error handling
-- real work is done in 'Text.XML.HXT.Parser.HtmlParsec'

module Text.XML.HXT.Parser.HtmlParser
    ( getHtmlDoc
    , parseHtmlDoc
    , runHtmlParser
    , module Text.XML.HXT.Parser.HtmlParsec
    )

where

import Text.XML.HXT.DOM.XmlTree
import Text.XML.HXT.DOM.XmlState
import Text.XML.HXT.Parser.HtmlParsec

import Text.XML.HXT.Parser.XmlInput
    ( getXmlContents
    )

import Text.XML.HXT.Parser.XmlOutput
    ( traceTree
    , traceSource
    , traceMsg
    )

-- ------------------------------------------------------------

-- |
-- read a document and parse it with 'parseHtmlDoc'. The main entry point of this module
--
-- The input tree must be a root tree like in '	Text.XML.HXT.Parser.MainFunctions.getXmlDoc'. The content is read with 'Text.XML.HXT.Parser.XmlInput.getXmlContents',
-- is parsed with 'parseHtmlDoc' and canonicalized (char refs are substituted in content and attributes,
-- but comment is preserved)
--
-- see also : 'Text.XML.HXT.Parser.DTDProcessing.getWellformedDoc'

getHtmlDoc	:: XmlStateFilter state
getHtmlDoc
    = setSystemParams
      .>>
      getXmlContents
      .>>
      parseHtmlDoc

-- | The HTML parsing filter
--
-- The input is parsed with 'runHtmlParser', everything is interpreted as HTML,
-- if errors ocuur, the parser will try to do some meaningfull action and continues
-- parsing. Afterwards the entitiy references for defined for XHTML are resovled,
-- any unresolved reference is transformed into plain text.
--
-- Error messages
-- during parsing and entity resolving are added as warning nodes into the resulting tree.
--
-- The warnings are issued, if the 1. parameter noWarnings is set to True,
-- afterwards all are removed from the resulting tree.

parseHtmlDoc	:: XmlStateFilter a
parseHtmlDoc
    = parseDoc
      `whenM` ( isRoot .> getChildren .> isXText )
      where
      parseDoc t'
	  = ( traceMsg 2 ("parseHtmlDoc: parse HTML document " ++ show loc)
	      .>>
	      runHtmlParser
	      .>>
	      liftMf (processTopDown substHtmlEntities)
	      .>>
	      removeWarnings
	      .>>
	      traceTree
	      .>>
	      traceSource
              ) $ t'
	  where
	  loc    = valueOf a_source t'			-- get document source uri

      removeWarnings	:: XmlStateFilter a
      removeWarnings t'
	  = let
	    noWarnings  = not (satisfies (hasOption a_issue_warnings) t')
	    selWarnings = deep
			  ( choice [ isWarning :-> this
				   , isXTag    :-> (getAttrl .> selWarnings)
				   ]
			  )
	    remWarnings = processTopDown
			  ( choice [ isWarning :-> none
				   , isXTag    :-> (processAttrl (remWarnings $$))
				   , this      :-> this
				   ]
			  )
	    warnings = selWarnings t'
	    in do 
	       if null warnings
		  then thisM t'
		  else do
		       if noWarnings
			  then return []
			  else do
			       issueError $$< warnings
		       return (remWarnings t')

-- | The pure HTML parser, usually called via 'parseHtmlDoc'.
--

runHtmlParser	:: XmlStateFilter a
runHtmlParser t
    = if null errs
      then do
	   return (replaceChildren res t)
      else do
	   issueError $$< errs
	   return (setStatus c_err "parsing HTML" t)
      where
      res  = getChildren .> parseHtmlText loc $ t
      errs = isXError .> neg isWarning $$ res
      loc  = valueOf a_source t


-- ------------------------------------------------------------