{- |
Very lazy HTML tree parser using TagSoup as lexer.
-}
module Text.XML.WraXML.Tree.Tagchup (
   toXmlTrees,
   fromXmlTree,
   fromElement, fromLeaf,
   liftElementFilter, liftElementCheck,
   example,
   ) where

import qualified Text.HTML.Tagchup.Parser      as TagParser
import qualified Text.HTML.Tagchup.Tag         as Tag
import qualified Text.HTML.Tagchup.PositionTag as PosTag
import qualified Text.HTML.Tagchup.Tag.Match   as MatchTag

import qualified Text.XML.WraXML.Element          as Elem
import qualified Text.XML.WraXML.Tree             as XmlTree
import qualified Data.Tree.BranchLeafLabel        as Tree

import qualified Text.HTML.Basic.Tag as TagH

import qualified Text.XML.Basic.Position    as Position
import qualified Text.XML.Basic.Name.LowerCase as NameLC
import qualified Text.XML.Basic.Name           as Name

-- import Control.Monad.Trans.State (evalState, put, get, gets, )
import Control.Monad.Trans.Writer (runWriter, writer, )

import Data.Monoid (Last(Last, getLast), )
import Data.Maybe (fromMaybe, )
import Data.Bool.HT (select, )

import qualified Text.XML.WraXML.Tree.LazyParser as ParserU
import qualified Text.ParserCombinators.Poly.Lazy as Parser

import Control.Monad (liftM, )


-- we don't use XmlTree synonym in the exported functions for documentation purposes
type XmlTree name string = XmlTree.T Position.T name string


{-
-- FIXME: how to get rid of 'error' ?
fromXmlTree' ::
   (Name.Tag name, Name.Attribute name) =>
   XmlTree.T Position.T name string -> [PosTag.T name string]
fromXmlTree' =
   flip evalState (error "position not initialised, yet") .
   Tree.fold
      (\pos x -> put pos >> x)
      (\branch subTreesM ->
          do pos <- get
             subTrees <- sequence subTreesM
             lastPos <- get
             
             let elm = XmlTree.getElement branch
                 openTag =
                    PosTag.cons pos $ fromElement elm
                 closeTag =
                    PosTag.cons lastPos $
                    Tag.Close $ Elem.name_ elm
             return $ openTag : concat subTrees ++ [closeTag])

      (\leaf ->
          gets (\pos -> [PosTag.cons pos $ fromLeaf leaf]))
   .
   XmlTree.unwrap
-}

fromXmlTree ::
   (Name.Tag name, Name.Attribute name) =>
   XmlTree.T Position.T name string -> [PosTag.T name string]
fromXmlTree :: forall name string.
(Tag name, Attribute name) =>
T T name string -> [T name string]
fromXmlTree =
   forall a b. (a, b) -> a
fst forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall w a. Writer w a -> (a, w)
runWriter forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall i branch b leaf.
(i -> branch -> [b] -> b)
-> (i -> leaf -> b) -> T i branch leaf -> b
Tree.foldLabel
      (\T
pos Branch name string
branch [Writer (Last T) [T name string]]
subTreesM ->
          let ([[T name string]]
subTrees, Last T
lastSubPos) = forall w a. Writer w a -> (a, w)
runWriter forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) (m :: * -> *) a.
(Traversable t, Monad m) =>
t (m a) -> m (t a)
sequence [Writer (Last T) [T name string]]
subTreesM
              lastPos :: T
lastPos = forall a. a -> Maybe a -> a
fromMaybe T
pos forall a b. (a -> b) -> a -> b
$ forall a. Last a -> Maybe a
getLast Last T
lastSubPos
              elm :: T name string
elm = forall name str. Branch name str -> T name str
XmlTree.getElement Branch name string
branch
              openTag :: T name string
openTag =
                 forall name string. T -> T name string -> T name string
PosTag.cons T
pos forall a b. (a -> b) -> a -> b
$ forall name string.
(Tag name, Attribute name) =>
T name string -> T name string
fromElement T name string
elm
              closeTag :: T name string
closeTag =
                 forall name string. T -> T name string -> T name string
PosTag.cons T
lastPos forall a b. (a -> b) -> a -> b
$
                 forall name string. Name name -> T name string
Tag.Close forall a b. (a -> b) -> a -> b
$ forall name str. T name str -> Name name
Elem.name_ T name string
elm
          in  forall (m :: * -> *) a w. Monad m => (a, w) -> WriterT w m a
writer (T name string
openTag forall a. a -> [a] -> [a]
: forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[T name string]]
subTrees forall a. [a] -> [a] -> [a]
++ [forall {string}. T name string
closeTag],
                      forall a. Maybe a -> Last a
Last forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a
Just T
lastPos))
      (\T
pos Leaf name string
leaf ->
          forall (m :: * -> *) a w. Monad m => (a, w) -> WriterT w m a
writer forall a b. (a -> b) -> a -> b
$ forall a b c. (a -> b -> c) -> b -> a -> c
flip (,) (forall a. Maybe a -> Last a
Last forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a
Just T
pos) forall a b. (a -> b) -> a -> b
$
          [forall name string. T -> T name string -> T name string
PosTag.cons T
pos forall a b. (a -> b) -> a -> b
$ forall name string.
(Tag name, Attribute name) =>
Leaf name string -> T name string
fromLeaf Leaf name string
leaf])
   forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall i name str.
T i name str -> T i (Branch name str) (Leaf name str)
XmlTree.unwrap

fromElement ::
   (Name.Tag name, Name.Attribute name) =>
   Elem.T name string -> Tag.T name string
fromElement :: forall name string.
(Tag name, Attribute name) =>
T name string -> T name string
fromElement T name string
elm =
   forall name string. Name name -> [T name string] -> T name string
Tag.Open (forall name str. T name str -> Name name
Elem.name_ T name string
elm) (forall name str. T name str -> [T name str]
Elem.attributes_ T name string
elm)

fromLeaf ::
   (Name.Tag name, Name.Attribute name) =>
   XmlTree.Leaf name string -> Tag.T name string
fromLeaf :: forall name string.
(Tag name, Attribute name) =>
Leaf name string -> T name string
fromLeaf Leaf name string
leaf =
   case Leaf name string
leaf of
      XmlTree.Text  Bool
_ string
text  -> forall string name. string -> T name string
Tag.text string
text
      XmlTree.Comment String
text  -> forall name string. String -> T name string
Tag.comment String
text
      XmlTree.CData   String
text  -> forall name string. Tag name => String -> T name string
Tag.cdata String
text
      XmlTree.PI Name name
name T name string
instr -> forall name string. Name name -> T name string -> T name string
Tag.processing Name name
name T name string
instr
      XmlTree.Warning String
text  -> forall name string. String -> T name string
Tag.warning String
text


liftElementFilter ::
   (Name.Tag name, Name.Attribute name) =>
   (Elem.T name str -> Elem.T name str) ->
   Tag.T name str -> Tag.T name str
liftElementFilter :: forall name str.
(Tag name, Attribute name) =>
(T name str -> T name str) -> T name str -> T name str
liftElementFilter T name str -> T name str
f T name str
tag =
   forall a b c. (a -> b -> c) -> b -> a -> c
flip (forall b a. b -> (a -> b) -> Maybe a -> b
maybe T name str
tag) (forall name string.
T name string -> Maybe (Name name, [T name string])
Tag.maybeOpen T name str
tag) forall a b. (a -> b) -> a -> b
$
   forall name string.
(Tag name, Attribute name) =>
T name string -> T name string
fromElement forall b c a. (b -> c) -> (a -> b) -> a -> c
. T name str -> T name str
f forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name str. Name name -> [T name str] -> T name str
Elem.Cons

liftElementCheck :: (Elem.T name string -> Bool) -> (Tag.T name string -> Bool)
liftElementCheck :: forall name string.
(T name string -> Bool) -> T name string -> Bool
liftElementCheck T name string -> Bool
f T name string
tag =
   forall a b c. (a -> b -> c) -> b -> a -> c
flip (forall b a. b -> (a -> b) -> Maybe a -> b
maybe Bool
False) (forall name string.
T name string -> Maybe (Name name, [T name string])
Tag.maybeOpen T name string
tag) forall a b. (a -> b) -> a -> b
$
   T name string -> Bool
f forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name str. Name name -> [T name str] -> T name str
Elem.Cons



{- |
A TagSoup could represent multiple HTML trees,
e.g. with some introducing comments.
-}
toXmlTrees ::
   (Name.Tag name, Name.Attribute name) =>
   [PosTag.T name string] -> [XmlTree.T Position.T name string]
toXmlTrees :: forall name string.
(Tag name, Attribute name) =>
[T name string] -> [T T name string]
toXmlTrees =
   forall a b. (a, b) -> a
fst forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall t a. Parser t a -> [t] -> (a, [t])
Parser.runParser (forall t a. Parser t a -> Parser t [a]
ParserU.manyLazy forall name string.
(Tag name, Attribute name) =>
Parser name string (XmlTree name string)
parseBranch)


type Parser name string a = Parser.Parser (PosTag.T name string) a


parseBranch ::
   (Name.Tag name, Name.Attribute name) =>
   Parser name string (XmlTree name string)
parseBranch :: forall name string.
(Tag name, Attribute name) =>
Parser name string (XmlTree name string)
parseBranch =
   do T name string
nt <- forall t. Parser t t
Parser.next
      let ti :: T
ti = forall name string. T name string -> T
PosTag.position_ T name string
nt
      case forall name string. T name string -> T name string
PosTag.tag_ T name string
nt of
         Tag.Text string
text -> forall (m :: * -> *) a. Monad m => a -> m a
return (forall i str name. i -> str -> T i name str
XmlTree.literalIndex T
ti string
text)
         Tag.Open Name name
name [T name string]
attrs ->
            forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM
               (forall i name str.
i -> Name name -> [T name str] -> [T i name str] -> T i name str
XmlTree.tagIndexAttr T
ti Name name
name [T name string]
attrs)
               (forall name string.
(Tag name, Attribute name) =>
XmlTree name string
-> Name name -> Parser name string [XmlTree name string]
parseSubTrees
                  (forall i name str. i -> String -> T i name str
XmlTree.warningIndex T
ti
                      String
"unexpected end of file") Name name
name)
         Tag.Close Name name
name ->
            forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall i name str. i -> String -> T i name str
XmlTree.warningIndex T
ti forall a b. (a -> b) -> a -> b
$
            if forall name. Tag name => Name name -> Bool
TagH.isSloppy Name name
name
              then String
"misplaced sloppy closing tag </" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++ String
">"
              else String
"isolated closing tag </" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++ String
">"
         Tag.Comment String
cmt ->
            forall (m :: * -> *) a. Monad m => a -> m a
return (forall i name str. i -> String -> T i name str
XmlTree.commentIndex T
ti String
cmt)
         Tag.Warning String
msg ->
            forall (m :: * -> *) a. Monad m => a -> m a
return (forall i name str. i -> String -> T i name str
XmlTree.warningIndex T
ti String
msg)
         Tag.Processing Name name
target T name string
p ->
            forall (m :: * -> *) a. Monad m => a -> m a
return (forall i name str. i -> Name name -> T name str -> T i name str
XmlTree.processingIndex T
ti Name name
target T name string
p)
         Tag.Special Name name
name String
str ->
            forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
            if forall name. C name => String -> name -> Bool
Name.match String
TagH.cdataString Name name
name
              then forall i name str. i -> String -> T i name str
XmlTree.cdataIndex T
ti String
str
              else forall i name str. i -> String -> T i name str
XmlTree.warningIndex T
ti forall a b. (a -> b) -> a -> b
$
                      String
"Special tag " forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++
                      String
" not allowed within a HTML document"


parseSubTrees ::
   (Name.Tag name, Name.Attribute name) =>
   XmlTree name string -> TagH.Name name ->
   Parser name string [XmlTree name string]
parseSubTrees :: forall name string.
(Tag name, Attribute name) =>
XmlTree name string
-> Name name -> Parser name string [XmlTree name string]
parseSubTrees XmlTree name string
warn Name name
name =
   forall t a. Parser t a -> Parser t a
ParserU.force forall a b. (a -> b) -> a -> b
$
   if forall name. Tag name => Name name -> Bool
TagH.isEmpty Name name
name
     then
       (forall t. (t -> Bool) -> Parser t t
Parser.satisfy (forall name string. (Name name -> Bool) -> T name string -> Bool
MatchTag.close (Name name
nameforall a. Eq a => a -> a -> Bool
==) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall name string. T name string -> T name string
PosTag.tag_) forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return [])
         forall t a. Parser t a -> Parser t a -> Parser t a
`Parser.onFail` forall (m :: * -> *) a. Monad m => a -> m a
return []
     else
       forall t a. Parser t [a] -> a -> Parser t a -> Parser t [a]
ParserU.manyFinallyAppend
          (forall name string.
(Tag name, Attribute name) =>
Name name -> Parser name string [XmlTree name string]
parseTerminator Name name
name)
          XmlTree name string
warn forall name string.
(Tag name, Attribute name) =>
Parser name string (XmlTree name string)
parseBranch


parseTerminator ::
   (Name.Tag name, Name.Attribute name) =>
   TagH.Name name -> Parser name string [XmlTree name string]
parseTerminator :: forall name string.
(Tag name, Attribute name) =>
Name name -> Parser name string [XmlTree name string]
parseTerminator Name name
name =
   do T name string
c <- forall t. Parser t t
Parser.next
      let ci :: T
ci = forall name string. T name string -> T
PosTag.position_ T name string
c
          retry :: b -> Parser (T name string) b
retry b
warns = forall t. [t] -> Parser t ()
Parser.reparse [T name string
c] forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return b
warns
      case forall name string. T name string -> T name string
PosTag.tag_ T name string
c of
         Tag.Close Name name
closeName ->
            forall a b c. (a -> b -> c) -> b -> a -> c
flip forall a. a -> [(Bool, a)] -> a
select
               ((forall name. Tag name => Name name -> Bool
TagH.isEmpty Name name
closeName,
                    forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$ String
"the tag <" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
closeName forall a. [a] -> [a] -> [a]
++ String
"> closes nothing.") forall a. a -> [a] -> [a]
:
                (Name name
name forall name. Tag name => Name name -> Name name -> Bool
`TagH.isInnerOf` Name name
closeName, forall {b}. b -> Parser (T name string) b
retry []) forall a. a -> [a] -> [a]
:
                (Name name
nameforall a. Eq a => a -> a -> Bool
==Name name
closeName, forall (m :: * -> *) a. Monad m => a -> m a
return []) forall a. a -> [a] -> [a]
:
                (forall name. Tag name => Name name -> Bool
TagH.isSloppy Name name
name,
                 forall {b}. b -> Parser (T name string) b
retry [forall i name str. i -> String -> T i name str
XmlTree.warningIndex T
ci forall a b. (a -> b) -> a -> b
$
                           String
"sloppy tag <"
                           forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++ String
"> closed by </"
                           forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
closeName forall a. [a] -> [a] -> [a]
++ String
">"]) forall a. a -> [a] -> [a]
:
                (forall name. Tag name => Name name -> Bool
TagH.isSloppy Name name
closeName,
                 forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$ String
"ignore sloppy closing tag </" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++ String
">") forall a. a -> [a] -> [a]
:
                [])
               (forall (m :: * -> *) a. Monad m => a -> m a
return [forall i name str. i -> String -> T i name str
XmlTree.warningIndex T
ci forall a b. (a -> b) -> a -> b
$
                            String
"open tag <" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++
                            String
"> and close tag </" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
closeName forall a. [a] -> [a] -> [a]
++
                            String
"> do not match"])
         Tag.Open Name name
openName [T name string]
_ ->
            if Name name
openName forall name. Tag name => Name name -> Name name -> Bool
`TagH.closes` Name name
name
              then forall {b}. b -> Parser (T name string) b
retry []
              else forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$
                      String
"open tag <" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
openName forall a. [a] -> [a] -> [a]
++
                      String
"> does not close tag <" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++ String
">"
         T name string
_ -> forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$ String
"not a termination of a tag"


example ::
   [XmlTree NameLC.T String]
example :: [XmlTree T String]
example =
   forall name string.
(Tag name, Attribute name) =>
[T name string] -> [T T name string]
toXmlTrees forall a b. (a -> b) -> a -> b
$
   forall source sink name.
(C source, StringType sink, Attribute name, Tag name) =>
source -> [T name sink]
TagParser.runSoupWithPositions forall a b. (a -> b) -> a -> b
$
--   "<html><head>blub<meta/></head><br><body>bla</body></html>"
--   "<html><head>blub<meta/></head><br><body>bla<UL><li>1.<li>2.</OL><TABLE border=1></TABLE></body></html>"
--   "<html><head><meta>too much</meta>blub<meta/></head><br><body>bla<UL><li>1.<li>2.</UL><TABLE border=1></TABLE></bo"++undefined++"dy></html>"
--    "<b><font>test</b></font>"
    String
"<html><b><font>test</b></font></html>"