module Text.XML.WraXML.Document.TagSoup where
import qualified Text.XML.WraXML.Tree.TagSoup as TreeTagSoup
import qualified Text.XML.WraXML.Document as XmlDoc
import Text.XML.WraXML.Tree.TagSoup (PosTag, )
import qualified Text.HTML.TagSoup as Tag
import Text.HTML.TagSoup (Tag(..), )
import qualified Text.XML.Basic.Position as Position
import qualified Text.XML.Basic.Attribute as Attr
import qualified Text.XML.Basic.Name.MixedCase as NameMC
import qualified Text.XML.Basic.Name.LowerCase as NameLC
import qualified Text.XML.Basic.Name as Name
import Control.Monad.Trans.State (State, state, evalState, modify, gets, )
import Data.Char (isSpace, )
dropSpace ::[PosTag] -> [PosTag]
dropSpace =
dropWhile
(\tag ->
case snd tag of
Tag.TagText text -> all isSpace text
_ -> False)
withoutLeadingSpace ::
([PosTag] -> (a, [PosTag])) ->
State [PosTag] a
withoutLeadingSpace f =
modify dropSpace >> state f
toXmlDocument ::
(Name.Tag name, Name.Attribute name) =>
[Tag] -> XmlDoc.T Position.T name String
toXmlDocument ts =
flip evalState
(TreeTagSoup.removeMetaPos
(TreeTagSoup.attachPos
(Tag.canonicalizeTags ts))) $
do xml <- withoutLeadingSpace $ \ts0 ->
case ts0 of
(_, Tag.TagOpen "?xml" attrs):ts1 ->
(Just (map (uncurry Attr.new) attrs), ts1)
_ -> (Nothing, ts0)
docType <- withoutLeadingSpace $ \ts0 ->
case ts0 of
(_, Tag.TagOpen "!DOCTYPE" dtd):ts1 ->
(Just (Attr.formatListBlankHead
(map (Attr.fromPair :: (String,String) -> Attr.T NameMC.T String) dtd) ""), ts1)
_ -> (Nothing, ts0)
gets (XmlDoc.Cons xml docType . TreeTagSoup.toXmlTreesAux)
toXmlDocumentString ::
(Name.Tag name, Name.Attribute name) =>
[Tag] -> XmlDoc.T Position.T name String
toXmlDocumentString =
toXmlDocument
example :: IO ()
example =
print .
(toXmlDocumentString :: [Tag] -> XmlDoc.T Position.T NameLC.T String) .
Tag.parseTagsOptions TreeTagSoup.parseOptions
=<< readFile "/home/thielema/public_html/index.html"