{- |
Very lazy HTML tree parser using TagSoup as lexer.
-}
module Text.XML.WraXML.Tree.TagSoup where

import qualified Text.HTML.TagSoup as Tag
import qualified Text.HTML.TagSoup as Match -- TagSoup.Match became hidden in tagsoup-0.8
import Text.HTML.TagSoup (Tag(..), )

import qualified Text.XML.WraXML.Tree.Literal as XmlTree

import qualified Text.HTML.Basic.Tag as TagH
import qualified Text.XML.Basic.Position as Position
import qualified Text.XML.Basic.Attribute as Attr
import qualified Text.XML.Basic.ProcessingInstruction as PI

-- import qualified Text.XML.WraXML.String         as XmlString
-- import qualified Text.XML.WraXML.String.TagSoup as XmlStringTagSoup

import qualified Text.XML.Basic.Name.LowerCase as NameLC
import qualified Text.XML.Basic.Name           as Name

import qualified Text.XML.WraXML.Tree.LazyParser as ParserU
import qualified Text.ParserCombinators.Poly.Lazy as Parser

import Data.Bool.HT (select, )
import Control.Monad (liftM, )
import qualified Data.List as List


type PosTag = (Position.T, Tag String)
-- data PosTag = PosTag !Position.T Tag


defaultFilename :: String
defaultFilename :: String
defaultFilename = String
"input"

attachPos :: [Tag String] -> [PosTag]
attachPos :: [Tag String] -> [PosTag]
attachPos =
   forall a b. (a, b) -> b
snd forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall (t :: * -> *) s a b.
Traversable t =>
(s -> a -> (s, b)) -> s -> t a -> (s, t b)
List.mapAccumL
      (\T
pos Tag String
t ->
          let newPos :: T
newPos =
                  case Tag String
t of
                     Tag.TagPosition Row
setRow Row
setColumn ->
                          String -> Row -> Row -> T
Position.new String
defaultFilename Row
setRow Row
setColumn
                     Tag String
_ -> T
pos
          in  (T
newPos, (T
newPos, Tag String
t)))
--          in  (newPos, PosTag newPos t))
      (String -> T
Position.initialize String
defaultFilename)

removeMetaPos :: [PosTag] -> [PosTag]
removeMetaPos :: [PosTag] -> [PosTag]
removeMetaPos =
   forall a. (a -> Bool) -> [a] -> [a]
filter (\(T
_pos,Tag String
tag) -> case Tag String
tag of
               TagPosition Row
_ Row
_ -> Bool
False
               Tag String
_ -> Bool
True)


type XmlTree name string = XmlTree.T Position.T name string

{- |
A TagSoup could represent multiple HTML trees,
e.g. with some introducing comments.
-}
{-
toXmlTrees ::
   [Tag XmlChar] -> [XmlTree name String]
toXmlTrees =
   toXmlTreesAux .
   Tag.canonicalizeTags
-}

toXmlTreesString ::
   (Name.Tag name, Name.Attribute name) =>
   [Tag String] -> [XmlTree name String]
toXmlTreesString :: forall name.
(Tag name, Attribute name) =>
[Tag String] -> [XmlTree name String]
toXmlTreesString =
   forall name.
(Tag name, Attribute name) =>
[PosTag] -> [XmlTree name String]
toXmlTreesAux forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   [PosTag] -> [PosTag]
removeMetaPos forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   [Tag String] -> [PosTag]
attachPos forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall str. StringLike str => [Tag str] -> [Tag str]
Tag.canonicalizeTags  -- must remain for Match.tagCloseLit
--   toXmlTrees


toXmlTreesAux ::
   (Name.Tag name, Name.Attribute name) =>
   [PosTag] -> [XmlTree name String]
toXmlTreesAux :: forall name.
(Tag name, Attribute name) =>
[PosTag] -> [XmlTree name String]
toXmlTreesAux =
   forall a b. (a, b) -> a
fst forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall t a. Parser t a -> [t] -> (a, [t])
Parser.runParser (forall t a. Parser t a -> Parser t [a]
ParserU.manyLazy forall name i.
(Tag name, Attribute name) =>
Parser i (T i name String)
parseBranch)



type Parser i a = Parser.Parser (i, Tag String) a


parseBranch ::
   (Name.Tag name, Name.Attribute name) =>
   Parser i (XmlTree.T i name String)
parseBranch :: forall name i.
(Tag name, Attribute name) =>
Parser i (T i name String)
parseBranch =
   do (i
ti,Tag String
t) <- forall t. Parser t t
Parser.next
      case Tag String
t of
         TagText String
text -> forall (m :: * -> *) a. Monad m => a -> m a
return (forall i str name. i -> str -> T i name str
XmlTree.literalIndex i
ti String
text)
         TagOpen (Char
'?':String
target) [Attribute String]
attrs ->
            forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
               forall name i str.
(Tag name, Attribute name) =>
i -> String -> T name str -> T i name str
XmlTree.processingIndex i
ti String
target forall a b. (a -> b) -> a -> b
$
               case [Attribute String]
attrs of
                  [(String
"",String
str)] ->
                     forall name string. String -> T name string
PI.Unknown String
str
                  [Attribute String]
_ ->
                     forall name string. [T name string] -> T name string
PI.Known forall a b. (a -> b) -> a -> b
$
                     forall a b. (a -> b) -> [a] -> [b]
map (forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry forall name string.
Attribute name =>
String -> string -> T name string
Attr.new) [Attribute String]
attrs
         TagOpen String
"![CDATA[" [(String
"",String
str)] ->
            forall (m :: * -> *) a. Monad m => a -> m a
return (forall i name str. i -> String -> T i name str
XmlTree.cdataIndex i
ti String
str)
         TagOpen String
name [Attribute String]
attrs ->
            forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM
               (forall name i str.
(Tag name, Attribute name) =>
i -> String -> [(String, str)] -> [T i name str] -> T i name str
XmlTree.tagIndexAttr i
ti String
name [Attribute String]
attrs)
               (forall name i.
(Tag name, Attribute name) =>
T i name String -> Name name -> Parser i [T i name String]
parseSubTrees
                  (forall i name str. i -> String -> T i name str
XmlTree.warningIndex i
ti
                      String
"unexpected end of file") (forall name. C name => String -> name
Name.fromString String
name))
         TagClose String
name ->
            let makeWarning ::
                   (Name.Tag name) =>
                   i -> TagH.Name name -> XmlTree.T i name String
                makeWarning :: forall name i. Tag name => i -> Name name -> T i name String
makeWarning i
i Name name
n =
                   forall i name str. i -> String -> T i name str
XmlTree.warningIndex i
i forall a b. (a -> b) -> a -> b
$
                   if forall name. Tag name => Name name -> Bool
TagH.isSloppy Name name
n
                     then String
"misplaced sloppy closing tag </" forall a. [a] -> [a] -> [a]
++ String
name forall a. [a] -> [a] -> [a]
++ String
">"
                     else String
"isolated closing tag </" forall a. [a] -> [a] -> [a]
++ String
name forall a. [a] -> [a] -> [a]
++ String
">"
            in  forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall name i. Tag name => i -> Name name -> T i name String
makeWarning i
ti forall a b. (a -> b) -> a -> b
$ forall name. C name => String -> name
Name.fromString String
name
         TagComment String
cmt ->
            forall (m :: * -> *) a. Monad m => a -> m a
return (forall i name str. i -> String -> T i name str
XmlTree.commentIndex i
ti String
cmt)
         TagWarning String
msg ->
            forall (m :: * -> *) a. Monad m => a -> m a
return (forall i name str. i -> String -> T i name str
XmlTree.warningIndex i
ti String
msg)
         TagPosition Row
_ Row
_ ->
            forall a. HasCallStack => String -> a
error String
"Unexpected position information. Please filter it out!"
{-
         TagSpecial name _ ->
            return $ XmlTree.warningIndex ti $
               "Special tag " ++ name ++ " not allowed within a HTML document"
-}


parseSubTrees ::
   (Name.Tag name, Name.Attribute name) =>
   XmlTree.T i name String -> TagH.Name name -> Parser i [XmlTree.T i name String]
parseSubTrees :: forall name i.
(Tag name, Attribute name) =>
T i name String -> Name name -> Parser i [T i name String]
parseSubTrees T i name String
warn Name name
name =
   forall t a. Parser t a -> Parser t a
ParserU.force forall a b. (a -> b) -> a -> b
$
   if forall name. Tag name => Name name -> Bool
TagH.isEmpty Name name
name
     then
       (forall t. (t -> Bool) -> Parser t t
Parser.satisfy (forall str. Eq str => str -> Tag str -> Bool
Match.isTagCloseName (forall name. C name => name -> String
Name.toString Name name
name) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> b
snd) forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return [])
         forall t a. Parser t a -> Parser t a -> Parser t a
`Parser.onFail` forall (m :: * -> *) a. Monad m => a -> m a
return []
     else
       forall t a. Parser t [a] -> a -> Parser t a -> Parser t [a]
ParserU.manyFinallyAppend
          (forall name i.
(Tag name, Attribute name) =>
Name name -> Parser i [T i name String]
parseTerminator Name name
name)
          T i name String
warn forall name i.
(Tag name, Attribute name) =>
Parser i (T i name String)
parseBranch


parseTerminator ::
   (Name.Tag name, Name.Attribute name) =>
   TagH.Name name -> Parser i [XmlTree.T i name String]
parseTerminator :: forall name i.
(Tag name, Attribute name) =>
Name name -> Parser i [T i name String]
parseTerminator Name name
name =
   do c :: (i, Tag String)
c@(i
ci,Tag String
ct) <- forall t. Parser t t
Parser.next
      let retry :: b -> Parser (i, Tag String) b
retry b
warns = forall t. [t] -> Parser t ()
Parser.reparse [(i, Tag String)
c] forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return b
warns
      case Tag String
ct of
         TagClose String
closeName_ ->
            let closeName :: Name name
closeName = forall name. C name => String -> name
Name.fromString String
closeName_
            in  forall a b c. (a -> b -> c) -> b -> a -> c
flip forall a. a -> [(Bool, a)] -> a
select
                   ((forall name. Tag name => Name name -> Bool
TagH.isEmpty Name name
closeName,
                        forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$ String
"the tag <" forall a. [a] -> [a] -> [a]
++ String
closeName_ forall a. [a] -> [a] -> [a]
++ String
"> closes nothing.") forall a. a -> [a] -> [a]
:
                    (Name name
name forall name. Tag name => Name name -> Name name -> Bool
`TagH.isInnerOf` Name name
closeName, forall {b}. b -> Parser (i, Tag String) b
retry []) forall a. a -> [a] -> [a]
:
                    (Name name
nameforall a. Eq a => a -> a -> Bool
==Name name
closeName, forall (m :: * -> *) a. Monad m => a -> m a
return []) forall a. a -> [a] -> [a]
:
                    (forall name. Tag name => Name name -> Bool
TagH.isSloppy Name name
name,
                     forall {b}. b -> Parser (i, Tag String) b
retry [forall i name str. i -> String -> T i name str
XmlTree.warningIndex i
ci forall a b. (a -> b) -> a -> b
$
                               String
"sloppy tag <"
                               forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++ String
"> closed by </"
                               forall a. [a] -> [a] -> [a]
++ String
closeName_ forall a. [a] -> [a] -> [a]
++ String
">"]) forall a. a -> [a] -> [a]
:
                    (forall name. Tag name => Name name -> Bool
TagH.isSloppy Name name
closeName,
                     forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$ String
"ignore sloppy closing tag </" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++ String
">") forall a. a -> [a] -> [a]
:
                    [])
                   (forall (m :: * -> *) a. Monad m => a -> m a
return [forall i name str. i -> String -> T i name str
XmlTree.warningIndex i
ci forall a b. (a -> b) -> a -> b
$
                                String
"open tag <" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++
                                String
"> and close tag </" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
closeName forall a. [a] -> [a] -> [a]
++
                                String
"> do not match"])
         TagOpen String
openName [Attribute String]
_ ->
            if forall name. C name => String -> name
Name.fromString String
openName forall name. Tag name => Name name -> Name name -> Bool
`TagH.closes` Name name
name
              then forall {b}. b -> Parser (i, Tag String) b
retry []
              else forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$
                      String
"open tag <" forall a. [a] -> [a] -> [a]
++ String
openName forall a. [a] -> [a] -> [a]
++
                      String
"> does not close tag <" forall a. [a] -> [a] -> [a]
++ forall name. C name => name -> String
Name.toString Name name
name forall a. [a] -> [a] -> [a]
++ String
">"
         Tag String
_ -> forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$ String
"not a termination of a tag"


parseOptions :: Tag.ParseOptions String
parseOptions :: ParseOptions String
parseOptions =
   forall str. StringLike str => ParseOptions str
Tag.parseOptions
      {optTagPosition :: Bool
Tag.optTagPosition = Bool
True,
       optTagWarning :: Bool
Tag.optTagWarning = Bool
True}

example :: [XmlTree NameLC.T String]
example :: [XmlTree T String]
example =
   forall name.
(Tag name, Attribute name) =>
[Tag String] -> [XmlTree name String]
toXmlTreesString forall a b. (a -> b) -> a -> b
$
   forall str. StringLike str => ParseOptions str -> str -> [Tag str]
Tag.parseTagsOptions ParseOptions String
parseOptions forall a b. (a -> b) -> a -> b
$
--   "<html><head>blub<meta/></head><br><body>bla</body></html>"
--   "<html><head>blub<meta/></head><br><body>bla<UL><li>1.<li>2.</OL><TABLE border=1></TABLE></body></html>"
--   "<html><head><meta>too much</meta>blub<meta/></head><br><body>bla<UL><li>1.<li>2.</UL><TABLE border=1></TABLE></bo"++undefined++"dy></html>"
--    "<b><font>test</b></font>"
    String
"<html><b><font>test</b></font></html>"