module Text.XML.WraXML.Tree.TagSoup where
import qualified Text.HTML.TagSoup as Tag
import Text.HTML.TagSoup (Tag(..), )
import qualified Text.XML.WraXML.Tree.Literal as XmlTree
import qualified Text.HTML.TagSoup.Match as Match
import qualified Text.HTML.Basic.Tag as TagH
import qualified Text.XML.Basic.Position as Position
import qualified Text.XML.Basic.Attribute as Attr
import qualified Text.XML.Basic.ProcessingInstruction as PI
import qualified Text.XML.Basic.Name.LowerCase as NameLC
import qualified Text.XML.Basic.Name as Name
import qualified Text.XML.WraXML.Tree.LazyParser as ParserU
import qualified Text.ParserCombinators.Poly.Lazy as Parser
import Data.Bool.HT (select, )
import Control.Monad (liftM, )
import qualified Data.List as List
type PosTag = (Position.T, Tag)
defaultFilename :: String
defaultFilename = "input"
attachPos :: [Tag] -> [PosTag]
attachPos =
snd .
List.mapAccumL
(\pos t ->
let newPos =
case t of
Tag.TagPosition setRow setColumn ->
Position.new defaultFilename setRow setColumn
_ -> pos
in (newPos, (newPos, t)))
(Position.initialize defaultFilename)
removeMetaPos :: [PosTag] -> [PosTag]
removeMetaPos =
filter (\(_pos,tag) -> case tag of
TagPosition _ _ -> False
_ -> True)
type XmlTree name string = XmlTree.T Position.T name string
toXmlTreesString ::
(Name.Tag name, Name.Attribute name) =>
[Tag] -> [XmlTree name String]
toXmlTreesString =
toXmlTreesAux .
removeMetaPos .
attachPos .
Tag.canonicalizeTags
toXmlTreesAux ::
(Name.Tag name, Name.Attribute name) =>
[PosTag] -> [XmlTree name String]
toXmlTreesAux =
fst . Parser.runParser (ParserU.manyLazy parseBranch)
type Parser i a = Parser.Parser (i, Tag) a
parseBranch ::
(Name.Tag name, Name.Attribute name) =>
Parser i (XmlTree.T i name String)
parseBranch =
do (ti,t) <- Parser.next
case t of
TagText text -> return (XmlTree.literalIndex ti text)
TagOpen ('?':target) attrs ->
return $
XmlTree.processingIndex ti target $
case attrs of
[("",str)] ->
PI.Unknown str
_ ->
PI.Known $
map (uncurry Attr.new) attrs
TagOpen "![CDATA[" [("",str)] ->
return (XmlTree.cdataIndex ti str)
TagOpen name attrs ->
liftM
(XmlTree.tagIndexAttr ti name attrs)
(parseSubTrees
(XmlTree.warningIndex ti
"unexpected end of file") (Name.fromString name))
TagClose name ->
let makeWarning ::
(Name.Tag name) =>
i -> TagH.Name name -> XmlTree.T i name String
makeWarning i n =
XmlTree.warningIndex i $
if TagH.isSloppy n
then "misplaced sloppy closing tag </" ++ name ++ ">"
else "isolated closing tag </" ++ name ++ ">"
in return $ makeWarning ti $ Name.fromString name
TagComment cmt ->
return (XmlTree.commentIndex ti cmt)
TagWarning msg ->
return (XmlTree.warningIndex ti msg)
TagPosition _ _ ->
error "Unexpected position information. Please filter it out!"
parseSubTrees ::
(Name.Tag name, Name.Attribute name) =>
XmlTree.T i name String -> TagH.Name name -> Parser i [XmlTree.T i name String]
parseSubTrees warn name =
ParserU.force $
if TagH.isEmpty name
then
(Parser.satisfy (Match.tagCloseLit (Name.toString name) . snd) >> return [])
`Parser.onFail` return []
else
ParserU.manyFinallyAppend
(parseTerminator name)
warn parseBranch
parseTerminator ::
(Name.Tag name, Name.Attribute name) =>
TagH.Name name -> Parser i [XmlTree.T i name String]
parseTerminator name =
do c@(ci,ct) <- Parser.next
let retry warns = Parser.reparse [c] >> return warns
case ct of
TagClose closeName_ ->
let closeName = Name.fromString closeName_
in flip select
((TagH.isEmpty closeName,
fail $ "the tag <" ++ closeName_ ++ "> closes nothing.") :
(name `TagH.isInnerOf` closeName, retry []) :
(name==closeName, return []) :
(TagH.isSloppy name,
retry [XmlTree.warningIndex ci $
"sloppy tag <"
++ Name.toString name ++ "> closed by </"
++ closeName_ ++ ">"]) :
(TagH.isSloppy closeName,
fail $ "ignore sloppy closing tag </" ++ Name.toString name ++ ">") :
[])
(return [XmlTree.warningIndex ci $
"open tag <" ++ Name.toString name ++
"> and close tag </" ++ Name.toString closeName ++
"> do not match"])
TagOpen openName _ ->
if Name.fromString openName `TagH.closes` name
then retry []
else fail $
"open tag <" ++ openName ++
"> does not close tag <" ++ Name.toString name ++ ">"
_ -> fail $ "not a termination of a tag"
parseOptions :: Tag.ParseOptions
parseOptions =
Tag.parseOptions
{Tag.optTagPosition = True,
Tag.optTagWarning = True}
example :: [XmlTree NameLC.T String]
example =
toXmlTreesString $
Tag.parseTagsOptions parseOptions $
"<html><b><font>test</b></font></html>"