module Text.HTML.WraXML.Tree.HXT
   (tidy, format, fromHTMLString, fromHTMLStringMetaEncoding,
    errorAnnFromHTMLStringMetaEncoding,
    errorAnnFromHTMLStringOpt, ErrorMsg,
    getMetaHTTPHeaders, ) where


import qualified Text.XML.HXT.DOM.XmlNode as XmlNode
import qualified Text.XML.HXT.Arrow.XmlState.ErrorHandling as HXTError
import qualified Text.XML.HXT.Core as HXT
import Text.XML.HXT.Core (withInputEncoding, isoLatin1, yes)
import Text.XML.HXT.Arrow.XmlState (runX)

import Text.XML.HXT.DOM.QualifiedName (QName, qualifiedName)
import Text.XML.HXT.DOM.TypeDefs (XNode(..))
import Text.XML.HXT.DOM.ShowXml (xshow)

import qualified Data.Tree.Class as HXTTree
import Data.Tree.NTree.TypeDefs (NTree(NTree))

import qualified Text.XML.WraXML.Tree.HXT as WraHXT

import qualified Text.HTML.WraXML.Tree   as HtmlTree
import qualified Text.XML.WraXML.Tree    as XmlTree

import qualified Text.XML.Basic.Name.LowerCase as NameLC
import qualified Text.XML.Basic.Name as Name

import qualified Text.XML.Basic.Format    as Format

import Text.XML.HXT.Parser.HtmlParsec (isEmptyHtmlTag)

import Control.Category ((>>>))
import Control.Monad (msum)
import Data.Maybe (fromMaybe)



{- |
Tidy a piece of HTML code.
&   ->   &
<   ->   &lt;
unquoted tag attribute values: size=-1   ->   size="-1"
insert omitted closing tags
-}
tidy :: String -> IO String
tidy :: String -> IO String
tidy String
input =
   forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\[String
str] -> String
str) forall a b. (a -> b) -> a -> b
$
   forall c. IOSArrow XmlTree c -> IO [c]
runX forall a b. (a -> b) -> a -> b
$
      forall b s a. IO b -> IOSLA s a b
ioSLAFromIO (String -> IO XmlTree
fromHTMLString String
input)
      forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>>
      forall (a :: * -> * -> *).
ArrowXml a =>
SysConfigList -> a XmlTree String
HXT.writeDocumentToString [SysConfig
HXT.withOutputHTML]

ioSLAFromIO :: IO b -> HXT.IOSLA s a b
ioSLAFromIO :: forall b s a. IO b -> IOSLA s a b
ioSLAFromIO IO b
act = forall s a b. (s -> a -> IO (s, [b])) -> IOSLA s a b
HXT.IOSLA forall a b. (a -> b) -> a -> b
$ \s
s a
_ -> forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\b
b -> (s
s,[b
b])) IO b
act


-- * formatting

{- |
Like 'Text.XML.HXT.DOM.XmlTreeFunctions.xshow'
but it shows empty tags the HTML way.
E.g. it emits @<br>@ instead of @<br\/>@,
@<noscript><\/noscript>@ instead of @<noscript\/>@.
Many browsers prefer that.
-}
format :: HXT.XmlTree -> String
format :: XmlTree -> String
format XmlTree
leaf = XmlTrees -> ShowS
formatTrees (forall (t :: * -> *) a. Tree t => t a -> [t a]
HXTTree.getChildren XmlTree
leaf) String
""

formatTrees :: HXT.XmlTrees -> ShowS
formatTrees :: XmlTrees -> ShowS
formatTrees = forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr forall b c a. (b -> c) -> (a -> b) -> a -> c
(.) forall a. a -> a
id forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map XmlTree -> ShowS
formatTree


-- cf. src/Text/XML/HXT/DOM/XmlTreeFunctions.hs
formatTree :: HXT.XmlTree -> ShowS
formatTree :: XmlTree -> ShowS
formatTree XmlTree
leaf =
   case XmlTree
leaf of
      (NTree (XPi QName
n XmlTrees
al) XmlTrees
_) ->
         String -> ShowS
showString String
"<?"
         forall b c a. (b -> c) -> (a -> b) -> a -> c
.
         QName -> ShowS
formatQName QName
n
         forall b c a. (b -> c) -> (a -> b) -> a -> c
.
         (forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr forall b c a. (b -> c) -> (a -> b) -> a -> c
(.) forall a. a -> a
id forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map XmlTree -> ShowS
showPiAttr) XmlTrees
al
         forall b c a. (b -> c) -> (a -> b) -> a -> c
.
         String -> ShowS
showString String
"?>"
           where
             showPiAttr :: HXT.XmlTree -> String -> String
             showPiAttr :: XmlTree -> ShowS
showPiAttr a :: XmlTree
a@(NTree (XAttr QName
an) XmlTrees
cs) =
                 if QName -> String
qualifiedName QName
an forall a. Eq a => a -> a -> Bool
== String
HXT.a_value
                   then ShowS
Format.blank forall b c a. (b -> c) -> (a -> b) -> a -> c
. XmlTrees -> ShowS
formatTrees XmlTrees
cs
                   else XmlTree -> ShowS
formatTree XmlTree
a
             showPiAttr XmlTree
_ = forall a. a -> a
id
      (NTree (XTag QName
t XmlTrees
al) XmlTrees
cs) ->
         if forall (t :: * -> *) a. Foldable t => t a -> Bool
null XmlTrees
cs Bool -> Bool -> Bool
&& String -> Bool
isEmptyHtmlTag (QName -> String
qualifiedName QName
t)
           then ShowS
Format.lt forall b c a. (b -> c) -> (a -> b) -> a -> c
. QName -> ShowS
formatQName QName
t forall b c a. (b -> c) -> (a -> b) -> a -> c
. XmlTrees -> ShowS
formatTrees XmlTrees
al forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS
Format.gt
           else ShowS
Format.lt forall b c a. (b -> c) -> (a -> b) -> a -> c
. QName -> ShowS
formatQName QName
t forall b c a. (b -> c) -> (a -> b) -> a -> c
. XmlTrees -> ShowS
formatTrees XmlTrees
al forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS
Format.gt
                 forall b c a. (b -> c) -> (a -> b) -> a -> c
. XmlTrees -> ShowS
formatTrees XmlTrees
cs
                 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS
Format.lt forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS
Format.slash forall b c a. (b -> c) -> (a -> b) -> a -> c
. QName -> ShowS
formatQName QName
t forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS
Format.gt
      (NTree (XAttr QName
an) XmlTrees
cs) ->
         ShowS
Format.blank forall b c a. (b -> c) -> (a -> b) -> a -> c
. QName -> ShowS
formatQName QName
an forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS
Format.eq forall b c a. (b -> c) -> (a -> b) -> a -> c
.
         String -> ShowS
Format.stringQuoted (XmlTrees -> ShowS
formatTrees XmlTrees
cs String
"")
      (NTree (XError Int
l String
e) XmlTrees
_) ->
         String -> ShowS
showString String
"<!-- ERROR (" forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> ShowS
showString (forall a. Show a => a -> String
show Int
l) forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> ShowS
showString String
"):\n"
          forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> ShowS
showString String
e forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> ShowS
showString String
"\n-->"
      XmlTree
_ -> (XmlTrees -> String
xshow [XmlTree
leaf] forall a. [a] -> [a] -> [a]
++) -- showXmlTree leaf


formatQName :: QName -> ShowS
formatQName :: QName -> ShowS
formatQName = String -> ShowS
showString forall b c a. (b -> c) -> (a -> b) -> a -> c
. QName -> String
qualifiedName


-- * parsing and encoding

{- |
Search for a META tag for the encoding of the HTML text.
-}
findMetaEncoding :: String -> IO (Maybe String)
findMetaEncoding :: String -> IO (Maybe String)
findMetaEncoding String
str =
   do [T () T String]
htmlTrees <- forall name.
(Tag name, Attribute name) =>
String -> IO [T () name String]
xmlTreesFromHTMLString String
str
      forall (m :: * -> *) a. Monad m => a -> m a
return (forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum (forall a b. (a -> b) -> [a] -> [b]
map forall name i.
(Tag name, Attribute name) =>
T i name String -> Maybe String
HtmlTree.findMetaEncoding
                           ([T () T String]
htmlTrees :: [XmlTree.T () NameLC.T String])))

getMetaHTTPHeaders :: String -> IO [(String, String)]
getMetaHTTPHeaders :: String -> IO [(String, String)]
getMetaHTTPHeaders String
str =
   do [T () T String]
htmlTrees <- forall name.
(Tag name, Attribute name) =>
String -> IO [T () name String]
xmlTreesFromHTMLString String
str
      forall (m :: * -> *) a. Monad m => a -> m a
return (forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap forall name i.
(Tag name, Attribute name) =>
T i name String -> [(String, String)]
HtmlTree.getMetaHTTPHeaders
                           ([T () T String]
htmlTrees :: [XmlTree.T () NameLC.T String]))

xmlTreesFromHTMLString ::
   (Name.Tag name, Name.Attribute name) =>
   String -> IO [XmlTree.T () name String]
xmlTreesFromHTMLString :: forall name.
(Tag name, Attribute name) =>
String -> IO [T () name String]
xmlTreesFromHTMLString String
str =
   do XmlTree
hxtTree <- String -> IO XmlTree
fromHTMLString String
str
      -- it will hopefully be only one HTML tree
      forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
         forall a b. (a -> b) -> [a] -> [b]
map (forall i name. T i name T -> T i name String
XmlTree.unescape forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall name. (Tag name, Attribute name) => XmlTree -> T () name T
WraHXT.toXmlTree)
             (forall a. (a -> Bool) -> [a] -> [a]
filter (String -> XNode -> Bool
WraHXT.checkTagName String
"html" forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a. Tree t => t a -> a
HXTTree.getNode)
                     (forall (t :: * -> *) a. Tree t => t a -> [t a]
HXTTree.getChildren XmlTree
hxtTree))


{- |
Guess the encoding from the META-HTTP-EQUIV attribute, if available.
Otherwise fall back to ISO-Latin-1.
-}
fromHTMLStringMetaEncoding :: String -> IO HXT.XmlTree
fromHTMLStringMetaEncoding :: String -> IO XmlTree
fromHTMLStringMetaEncoding String
str =
   do Maybe String
enc <- String -> IO (Maybe String)
findMetaEncoding String
str
      SysConfigList -> String -> IO XmlTree
fromHTMLStringOpt [String -> SysConfig
withInputEncoding forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a -> a
fromMaybe String
isoLatin1 Maybe String
enc] String
str

{-
With no encoding option given,
utf8ToUnicode fails when trying
to interpret ISO-Latin characters as UTF-8 characters.
-}
fromHTMLString :: String -> IO HXT.XmlTree
fromHTMLString :: String -> IO XmlTree
fromHTMLString = SysConfigList -> String -> IO XmlTree
fromHTMLStringOpt [String -> SysConfig
withInputEncoding String
isoLatin1]

fromHTMLStringOpt :: HXT.SysConfigList -> String -> IO HXT.XmlTree
fromHTMLStringOpt :: SysConfigList -> String -> IO XmlTree
fromHTMLStringOpt SysConfigList
options String
input =
   do (XmlTree
tree,[ErrorMsg]
_,Maybe Int
_) <- SysConfigList -> String -> IO (XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringOpt SysConfigList
options String
input
      forall (m :: * -> *) a. Monad m => a -> m a
return XmlTree
tree

type ErrorMsg = (Int,String)

{-# WARNING errorAnnFromHTMLStringMetaEncoding
      "error collection does not work currently" #-}
errorAnnFromHTMLStringMetaEncoding ::
   String -> IO (HXT.XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringMetaEncoding :: String -> IO (XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringMetaEncoding String
str = do
   Maybe String
enc <- String -> IO (Maybe String)
findMetaEncoding String
str
   SysConfigList -> String -> IO (XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringOpt [String -> SysConfig
withInputEncoding forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a -> a
fromMaybe String
isoLatin1 Maybe String
enc] String
str


{- |
In earlier version I managed to obtain the error messages.
This does not work anymore and do not know how to achieve this.
-}
{-# WARNING errorAnnFromHTMLStringOpt
      "error collection does not work currently" #-}
errorAnnFromHTMLStringOpt ::
   HXT.SysConfigList -> String -> IO (HXT.XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringOpt :: SysConfigList -> String -> IO (XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringOpt = SysConfigList -> String -> IO (XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringInternal

{- |
Adaption of Text.XML.HXT.Parser.MainFunctions.getXmlDocument
-}
errorAnnFromHTMLStringInternal ::
   HXT.SysConfigList -> String -> IO (HXT.XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringInternal :: SysConfigList -> String -> IO (XmlTree, [ErrorMsg], Maybe Int)
errorAnnFromHTMLStringInternal SysConfigList
options String
contents = do
   (XmlTree
root_:XmlTrees
errs) <-
      forall c. IOSArrow XmlTree c -> IO [c]
runX forall a b. (a -> b) -> a -> b
$
         forall s b. IOStateArrow s b b
HXT.errorMsgCollect
         forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>>
         forall s b. SysConfigList -> String -> IOStateArrow s b XmlTree
HXT.readString (Bool -> SysConfig
HXT.withParseHTML Bool
yes forall a. a -> [a] -> [a]
: SysConfigList
options) String
contents
         forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>>
         forall s. IOStateArrow s XmlTree XmlTree
HXTError.filterErrorMsg
   let elvl :: Maybe Int
elvl = forall a. XmlNode a => a -> Maybe Int
XmlNode.getErrorLevel XmlTree
root_
   let errMsgs :: [ErrorMsg]
errMsgs =
         forall a b. (a -> b) -> [a] -> [b]
map ((\(XError Int
level String
msg) -> (Int
level, String
msg)) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a. Tree t => t a -> a
HXTTree.getNode) XmlTrees
errs
   forall (m :: * -> *) a. Monad m => a -> m a
return (XmlTree
root_, [ErrorMsg]
errMsgs, Maybe Int
elvl)


{-
putStr . xshow =<< run' (Text.XML.HXT.Parser.MainFunctions.parseDocument [(a_source,"lousy.html"), (a_parse_html,v_1)] emptyRoot)

readFile "lousy.html" >>= tidy >>= putStr
-}