module Text.HTML.WraXML.Tree.HXT
   (tidy, format, fromHTMLString, fromHTMLStringMetaEncoding,
    errorAnnFromHTMLStringMetaEncoding,
    errorAnnFromHTMLStringOpt, ErrorMsg,
    getMetaHTTPHeaders, ) where

import qualified Text.XML.HXT.DOM.TypeDefs as HXT
-- import qualified Text.XML.HXT.DOM.XmlTreeFunctions as HXTTreeFunc
import qualified Data.Tree.NTree.TypeDefs  as HXTTree

import qualified Text.XML.WraXML.Tree.HXT as WraHXT

import qualified Text.HTML.WraXML.Tree   as HTMLTree
import qualified Text.XML.WraXML.Tree    as XmlTree

import qualified Text.XML.Basic.Name.LowerCase as NameLC
import qualified Text.XML.Basic.Name as Name

import qualified Text.XML.Basic.Format    as Format

-- needed for code copies from HXT at the of the file
import Text.XML.HXT.DOM.XmlTree hiding (cmt, formatTree, )
import Text.XML.HXT.DOM.XmlState
   (XmlStateFilter, liftF, run',
    setSystemParams, setSysErrorHandler, getErrorMsg,
    errorMsgLogging, errorMsgToStderr, )
import Text.XML.HXT.Parser.XmlParser  (parseXmlDocEncodingSpec, )
import Text.XML.HXT.Parser.XmlInput   (guessDocEncoding, )
import Text.XML.HXT.Parser.XmlOutput  (traceTree, traceSource, traceMsg, )
import Text.XML.HXT.Parser.HtmlParser (parseHtmlDoc, )
import Text.XML.HXT.Parser.HtmlParsec (isEmptyHtmlTag, )
import Text.XML.HXT.DOM.EditFilters   (escapeXmlDoc, )
import Text.XML.HXT.DOM.ShowXml       (xshow, {- showXmlTree, showQName, -})
import Text.XML.HXT.DOM.QualifiedName (qualifiedName, )
import Data.Tree.NTree.Filter         (liftMf, )

import Control.Monad (msum, )
import Data.Maybe (fromMaybe, )



{- |
Tidy a piece of HTML code.
&   ->   &
<   ->   &lt;
unquoted tag attribute values: size=-1   ->   size="-1"
insert omitted closing tags
-}
tidy :: String -> IO String
tidy input =
   -- escapeXmlDoc replaces < by &lt; and so on
   -- XmlParsec.substXmlEntities similar to escapeXmlDoc ?
   fmap (($"") . formatTrees . (escapeXmlDoc .> getChildren))
        (fromHTMLString input)


-- * formatting

{- |
Like 'Text.XML.HXT.DOM.XmlTreeFunctions.xshow'
but it shows empty tags the HTML way.
E.g. it emits @<br>@ instead of @<br\/>@,
@<noscript><\/noscript>@ instead of @<noscript\/>@.
Many browsers prefer that.
-}
format :: HXT.XmlTree -> String
format leaf = formatTrees (HXTTree.getChildren leaf) ""

formatTrees	:: HXT.XmlTrees -> ShowS
formatTrees	= foldr (.) id . map formatTree


-- cf. src/Text/XML/HXT/DOM/XmlTreeFunctions.hs
formatTree	:: HXT.XmlTree -> ShowS
formatTree leaf =
   case leaf of
      (NTree (XPi n al) _) ->
         showString "<?"
         .
         formatQName n
         .
         (foldr (.) id . map showPiAttr) al
         .
         showString "?>"
           where
             showPiAttr :: HXT.XmlTree -> String -> String
             showPiAttr a@(NTree (XAttr an) cs) =
	         if aName an == a_value
	           then Format.blank . formatTrees cs
	           else formatTree a
             showPiAttr _ = id
      (NTree (XTag t al) cs) ->
         if null cs && isEmptyHtmlTag (HXT.qualifiedName t)
           then Format.lt . formatQName t . formatTrees al . Format.gt
           else Format.lt . formatQName t . formatTrees al . Format.gt
                 . formatTrees cs
                 . Format.lt . Format.slash . formatQName t . Format.gt
      (NTree (XAttr an) cs) ->
         Format.blank . formatQName an . Format.eq . Format.stringQuoted (formatTrees cs "")
      (NTree (XError l e) _) ->
         showString "<!-- ERROR (" . showString (show l) . showString "):\n"
          . showString e . showString "\n-->"
      _ -> (xshow [leaf] ++) -- showXmlTree leaf


formatQName :: QName -> ShowS
formatQName = showString . qualifiedName


-- * parsing and encoding

{- |
Search for a META tag for the encoding of the HTML text.
-}
findMetaEncoding :: String -> IO (Maybe String)
findMetaEncoding str =
   do htmlTrees <- xmlTreesFromHTMLString str
      return (msum (map HTMLTree.findMetaEncoding (htmlTrees :: [XmlTree.T () NameLC.T String])))

getMetaHTTPHeaders :: String -> IO [(String, String)]
getMetaHTTPHeaders str =
   do htmlTrees <- xmlTreesFromHTMLString str
      return (concatMap HTMLTree.getMetaHTTPHeaders (htmlTrees :: [XmlTree.T () NameLC.T String]))

xmlTreesFromHTMLString ::
   (Name.Tag name, Name.Attribute name) =>
   String -> IO [XmlTree.T () name String]
xmlTreesFromHTMLString str =
   do hxtTree <- fromHTMLString str
      -- it will hopefully be only one HTML tree
      return $
         map (XmlTree.unescape . WraHXT.toXmlTree)
             (filter (WraHXT.checkTagName "html" . HXTTree.getNode)
                     (HXTTree.getChildren hxtTree))


{- |
Guess the encoding from the META-HTTP-EQUIV attribute, if available.
Otherwise fall back to ISO-Latin-1.
-}
fromHTMLStringMetaEncoding :: String -> IO HXT.XmlTree
fromHTMLStringMetaEncoding str =
   do enc <- findMetaEncoding str
      fromHTMLStringOpt [(a_encoding, fromMaybe isoLatin1 enc)] str

{-
With no encoding option given,
utf8ToUnicode fails when trying to interpret ISO-Latin characters as UTF-8 characters
-}
fromHTMLString :: String -> IO HXT.XmlTree
fromHTMLString = fromHTMLStringOpt [(a_encoding, isoLatin1)]

fromHTMLStringOpt :: Attributes -> String -> IO HXT.XmlTree
fromHTMLStringOpt options input =
   do (tree,_,_) <- errorAnnFromHTMLStringOpt options input
      return tree

type ErrorMsg = (Int,String)

errorAnnFromHTMLStringMetaEncoding ::
   String -> IO (HXT.XmlTree, [ErrorMsg], Int)
errorAnnFromHTMLStringMetaEncoding str =
   do enc <- findMetaEncoding str
      errorAnnFromHTMLStringOpt [(a_encoding, fromMaybe isoLatin1 enc)] str

{- |
Adaption of Text.XML.HXT.Parser.MainFunctions.getXmlDocument
-}
errorAnnFromHTMLStringOpt ::
   Attributes -> String -> IO (HXT.XmlTree, [ErrorMsg], Int)
errorAnnFromHTMLStringOpt options contents =
   do
      let options' =
             (a_collect_errors, v_1) :	-- collect errors
	     (a_issue_errors,   v_0) :	-- but don't issue
	    				-- can be overwritten by supporting other values in options
             options
      (root:errs) <- run' $ parseDocument options' contents emptyRoot
      let elvl = intValueOf a_status root
      let errMsgs = map ((\(XError level msg) -> (level, msg)) . getNode) errs
      return (root, errMsgs, elvl)




{- |
An input handler alternative
to Text.XML.HXT.Parser.ProtocolHandlerFile.
It returns the argument string as something
that can be further processed by HXT functions.
-}
getStringContents :: String -> XmlStateFilter a
getStringContents contents =
   return . replaceChildren (xtext contents)
   .>>
   {- We must attach some encoding info to the tree
      which is processed by the HTML parser.
      Using this encoding information
      the HTML parser converts e.g. UTF-8 characters to Unicode characters.
      Cf. Parser.XmlInput.getXmlContents -}
   liftF parseXmlDocEncodingSpec
   .>>
   guessDocEncoding


{- |
Adaption of Text.XML.HXT.Parser.MainFunctions.parseDocument
-}
parseDocument :: Attributes -> String -> XmlStateFilter state
parseDocument userOptions contents
    = processDocument userOptions defaultOptions
      ( traceMsg 1 "parseDocument: options added, start processing"
	.>>
	traceTree
	.>>
	getStringContents contents		-- get the content as text
	.>>
	parseHtmlDoc				-- parse everything as HTML
	.>>
--        liftMf canonicalizeForXPath
--	.>>
	traceMsg 1 "parseDocument: document processed"	-- trace output
	.>>
	traceSource
	.>>
	traceTree
      )
    where
    defaultOptions
	= [ ( a_parse_html,		v_1 )
	  , ( a_validate,		v_1 )
	  , ( a_issue_errors,		v_1 )
	  , ( a_issue_warnings,		v_1 )
	  , ( a_check_namespaces,	v_0 )
	  , ( a_canonicalize,		v_1 )
	  , ( a_preserve_comment,	v_0 )
	  , ( a_remove_whitespace,	v_0 )
	  ]


{- |
Copy of Text.XML.HXT.Parser.MainFunctions.addOptions add addDefaultOptions
-}
addOptions	  :: Attributes -> XmlFilter
addOptions
    = seqF . map (uncurry addAttr)

addDefaultOptions :: Attributes -> XmlFilter
addDefaultOptions
    = seqF . map (\ (n,v) -> addAttr n v `whenNot` hasAttr n)


{- |
Copy of Text.XML.HXT.Parser.MainFunctions.processDocument
-}
processDocument	:: Attributes -> Attributes -> XmlStateFilter state -> XmlStateFilter state
processDocument userOptions defaultOptions processFilter
    = liftMf isRoot
      .>>
      liftMf (addOptions userOptions
	      .>
	      addOptions [(a_status, show c_ok)]
	     )
      .>>
      liftMf (addDefaultOptions defaultOptions)
      .>>
      setSystemParams							-- store options in system state
      .>>
      choiceM
      [ hasOption a_propagate_errors					-- error handling is set by calling environment
		:-> thisM

      , hasOption a_collect_errors
	.>
	hasOption a_issue_errors
		:-> performAction (\ _ -> setSysErrorHandler (errorMsgLogging.>> errorMsgToStderr)	)

      , hasOption a_collect_errors
		:-> performAction (\ _ -> setSysErrorHandler errorMsgLogging	)

      , hasOption a_issue_errors
		:-> performAction (\ _ -> setSysErrorHandler errorMsgToStderr	)

      , this
		:-> performAction (\ _ -> setSysErrorHandler noneM		)
      ]
      .>>
      processFilter
      .>>
      ( thisM
        +++>>
	( hasOption a_collect_errors
          `guardsM`
	  getErrorMsg
	)
      )


{-
putStr . xshow =<< run' (Text.XML.HXT.Parser.MainFunctions.parseDocument [(a_source,"lousy.html"), (a_parse_html,v_1)] emptyRoot)

readFile "lousy.html" >>= tidy >>= putStr
-}