module Text.XML.WraXML.Tree where import qualified Text.XML.WraXML.Element as Elem import qualified Data.Tree.BranchLeafLabel as Tree import qualified Text.XML.WraXML.String as XmlString import qualified Text.HTML.WraXML.String as HtmlString import qualified Text.XML.Basic.Tag as Tag import qualified Text.XML.Basic.Attribute as Attr import qualified Text.XML.Basic.Name as Name import qualified Text.XML.Basic.ProcessingInstruction as PI import qualified Text.XML.HXT.DOM.Unicode as Unicode import qualified Data.Char as Char import Control.Monad.Trans.Writer (Writer, writer, runWriter, censor, ) import qualified Control.Monad.Exception.Synchronous as Exc import Data.Foldable as Foldable(Foldable(foldMap)) import Data.Traversable as Traversable(Traversable(traverse)) import Control.Applicative (Applicative, ) import qualified Control.Applicative as App import Data.List.HT (unzipEithers, ) import Data.Tuple.HT (mapFst, mapSnd, mapPair, swap, ) import Data.Maybe (mapMaybe, fromMaybe, ) import Data.Monoid (Monoid, mempty, mappend, mconcat, ) import qualified Text.XML.Basic.Format as Format {- * data structures -} newtype T i name str = Cons {unwrap :: Tree.T i (Branch name str) (Leaf name str)} deriving (Show) data Branch name str = Tag {getElement :: Elem.T name str} deriving (Show) {- It is disputable whether comments (and warnings) shall have type String or 'str'. Can a comment contain non-ASCII characters and XML entities? This is important for finding the comment ending "-->" properly. What about scripts enclosed in comment delimiters? Mozilla and Firefox don't do any encoding in the SCRIPT tag at all. This seems to be wrong, since scripts frequently contain strings with tag descriptions, like '
'. -} data Leaf name str = Text Bool {- is whitespace significant -} str | Comment String -- better 'str'? | CData String | PI (Tag.Name name) (PI.T name str) | Warning String -- better 'str'? {-# DEPRECATED AttributePlain "use Attribute type from xml-basic package instead" #-} type AttributePlain = (String, String) instance (Name.Attribute name, Show name, Show str) => Show (Leaf name str) where showsPrec prec x = showParen (prec>=10) $ case x of Text _ str -> showString "Text " . showsPrec 11 str Comment str -> showString "Comment " . showsPrec 11 str CData str -> showString "CData " . showsPrec 11 str PI target p -> -- showString "ProcessingInstruction " . showString "PI " . showsPrec 11 target . showString " " . showsPrec 11 p {- Maybe it is better to attach warnings to malicious tags instead of throwing them into the tag soup. But how to tell the position of the error then? -} Warning str -> showString "Warning " . showsPrec 11 str {- * generators -} wrap :: Tree.T i (Branch name str) (Leaf name str) -> T i name str wrap = Cons wrap2 :: i -> Tree.Elem i (Branch name str) (Leaf name str) -> T i name str wrap2 = curry wrap lift :: (Tree.T i (Branch name str0) (Leaf name str0) -> Tree.T j (Branch name str1) (Leaf name str1)) -> (T i name str0 -> T j name str1) lift f = wrap . f . unwrap liftA :: Applicative m => (Tree.T i (Branch name str0) (Leaf name str0) -> m (Tree.T i (Branch name str1) (Leaf name str1))) -> (T i name str0 -> m (T i name str1)) liftA f = App.liftA wrap . f . unwrap -- | Build some textual content. literal :: str -> T i name str literal = literalIndex (error "literal: no index given") literalIndex :: i -> str -> T i name str literalIndex i = wrap2 i . Tree.Leaf . Text False comment :: String -> T i name str comment = commentIndex (error "comment: no index given") commentIndex :: i -> String -> T i name str commentIndex i = wrap2 i . Tree.Leaf . Comment warning :: String -> T i name str warning = warningIndex (error "warning: no index given") warningIndex :: i -> String -> T i name str warningIndex i = wrap2 i . Tree.Leaf . Warning cdata :: String -> T i name str cdata = cdataIndex (error "cdata: no index given") cdataIndex :: i -> String -> T i name str cdataIndex i = wrap2 i . Tree.Leaf . CData processing :: Tag.Name name -> PI.T name str -> T i name str processing = processingIndex (error "processing: no index given") processingIndex :: i -> Tag.Name name -> PI.T name str -> T i name str processingIndex i target = wrap2 i . Tree.Leaf . PI target tag :: Tag.Name name -> [T i name str] -> T i name str tag name = tagAttr name [] tagAttr :: Tag.Name name -> [Attr.T name str] -> [T i name str] -> T i name str tagAttr = tagIndexAttr (error "tagAttr: no index given") tagIndexAttr :: i -> Tag.Name name -> [Attr.T name str] -> [T i name str] -> T i name str tagIndexAttr index name attrs = wrap2 index . Tree.Branch (Tag (Elem.Cons name attrs)) . map unwrap {- * Conversions -} liftTrans :: (a -> b) -> (a -> [b]) liftTrans f = (:[]) . f liftText :: (String -> String) -> (Leaf name String -> Leaf name String) liftText f leaf = case leaf of Text b s -> Text b (f s) CData s -> CData (f s) _ -> leaf liftTextA :: Applicative m => (String -> m String) -> (Leaf name String -> m (Leaf name String)) liftTextA f leaf = case leaf of Text b s -> App.liftA (Text b) (f s) CData s -> App.liftA CData $ f s _ -> App.pure leaf instance Functor (Leaf name) where fmap f leaf = case leaf of Text b s -> Text b (f s) Comment s -> Comment s Warning s -> Warning s CData s -> CData s PI t p -> PI t $ fmap f p {- this instance is quite useless but required by Traversable -} instance Foldable (Leaf name) where foldMap f leaf = case leaf of Text _b s -> f s PI _t p -> foldMap f p _ -> mempty instance Traversable (Leaf name) where traverse f leaf = case leaf of Text b s -> App.liftA (Text b) (f s) Comment s -> App.pure $ Comment s Warning s -> App.pure $ Warning s CData s -> App.pure $ CData s PI t p -> App.liftA (PI t) $ traverse f p liftElement :: (Elem.T name str0 -> Elem.T name str1) -> (Branch name str0 -> Branch name str1) liftElement f (Tag elm) = Tag (f elm) liftElementA :: Applicative m => (Elem.T name str0 -> m (Elem.T name str1)) -> (Branch name str0 -> m (Branch name str1)) liftElementA f (Tag elm) = App.liftA Tag (f elm) {- * Tests -} {- | If the Tree is a Leaf, then return False. Otherwise return the result of the predicate. -} checkTag :: (Elem.T name str -> Bool) -> (T i name str -> Bool) checkTag p = Tree.switch (flip const) (const . p . getElement) (const False) . unwrap maybeTag :: T i name str -> Maybe (Elem.T name str, [T i name str]) maybeTag (Cons (_,t)) = case t of Tree.Branch (Tag elm) subTrees -> Just (elm, map wrap subTrees) _ -> Nothing maybeText :: T i name str -> Maybe str maybeText (Cons (_,t)) = case t of Tree.Leaf l -> maybeTextLeaf l _ -> Nothing maybeTextLeaf :: Leaf name str -> Maybe str maybeTextLeaf t = case t of Text _ s -> Just s _ -> Nothing maybeCommentLeaf :: Leaf name str -> Maybe String maybeCommentLeaf t = case t of Comment s -> Just s _ -> Nothing maybeCDataLeaf :: Leaf name str -> Maybe String maybeCDataLeaf t = case t of CData s -> Just s _ -> Nothing maybeProcessingLeaf :: Leaf name str -> Maybe (Tag.Name name, PI.T name str) maybeProcessingLeaf t = case t of PI n instr -> Just (n, instr) _ -> Nothing maybeWarningLeaf :: Leaf name str -> Maybe String maybeWarningLeaf t = case t of Warning s -> Just s _ -> Nothing fold :: (i -> a -> b) -> (Elem.T name str -> [b] -> a) -> (Leaf name str -> a) -> (T i name str -> b) fold iF branchF leafF = Tree.fold iF (branchF . getElement) leafF . unwrap switch :: (i -> a -> b) -> (Elem.T name str -> [T i name str] -> a) -> (Leaf name str -> a) -> (T i name str -> b) switch iF branchF leafF = Tree.switch iF (\b subTrees -> branchF (getElement b) (map wrap subTrees)) leafF . unwrap {- * types of processors -} type Filter i name str = T i name str -> T i name str type FilterA m i name str = T i name str -> m (T i name str) {- * tree processors -} instance Functor (T i name) where fmap f = lift $ Tree.map (liftElement $ fmap f) (fmap f) mapText :: (String -> String) -> (T i name String -> T i name String) mapText f = lift $ Tree.map (liftElement $ fmap f) (liftText f) mapIndex :: (i -> j) -> T i name str -> T j name str mapIndex f = lift $ Tree.mapLabel f mapTag :: (Elem.Filter name str) -> (Filter i name str) mapTag f = lift $ Tree.map (liftElement f) id {- | Convert all CData sections to plain text. -} textFromCData :: T i name String -> T i name String textFromCData = lift $ Tree.map id (\leaf -> maybe leaf (Text False) $ maybeCDataLeaf leaf) {- | You can e.g. filter @text1 text2 text3@ to @text1 text2 text3@ by @filterTag (checkTagName ("b"\/=))@. -} filterTag :: (Elem.T name str -> Bool) -> (T i name str -> [T i name str]) filterTag p = map wrap . Tree.filterBranch (p . getElement) . unwrap mapCond :: (Elem.T name str -> Bool) -> (Elem.Filter name str) -> (Leaf name str -> Leaf name str) -> (Filter i name str) mapCond descend elemF txtF = lift $ Tree.mapCond (descend . getElement) (liftElement elemF) txtF {- mapTextCond :: (Elem.T name String -> Bool) -> (Elem.T name String -> Elem.T name String) -> (String -> String) -> (Filter i name String) mapTextCond descend elemF txtF = lift $ Tree.mapCond (descend . getElement) (liftElement elemF) (liftText txtF) -} {- | Find all branches where the predicate applies and return a list of matching sub-trees in depth-first order. Example: @filterTagsFlatten (checkTagName ("meta"==))@ -} filterTagsFlatten :: (Elem.T name str -> Bool) -> T i name str -> [(Elem.T name str, [T i name str])] filterTagsFlatten p = filter (p . fst) . mapMaybe maybeTag . allSubTrees filterElementsFlatten :: (Elem.T name str -> Bool) -> T i name str -> [Elem.T name str] filterElementsFlatten p = Tree.fold (flip const) (\branch xs -> filter p [getElement branch] ++ concat xs) (const []) . unwrap allSubTrees :: T i name str -> [T i name str] allSubTrees = map wrap . Tree.allSubTrees . unwrap {- | merge subsequent string leafs -} mergeStrings :: (Monoid str) => Filter i name str mergeStrings = processAllSubTrees mergeTopStrings mergeTopStrings :: (Monoid str) => [T i name str] -> [T i name str] mergeTopStrings = let prepend (i, Tree.Leaf (Text w0 t0)) rest = mapFst (\ ~(w1,t1) -> (i, Tree.Leaf $ Text (w0||w1) (mappend t0 t1))) $ case rest of (_, Tree.Leaf (Text w1 t1)) : ss -> ((w1,t1), ss) _ -> ((False,mempty), rest) prepend x rest = (x, rest) in map wrap. foldr (\x -> uncurry (:) . prepend x) [] . map unwrap {- | Process all sub-tree lists in bottom-up order. -} processAllSubTrees :: ([T i name str] -> [T i name str]) -> Filter i name str processAllSubTrees f = lift $ Tree.fold (,) (\branch -> Tree.Branch branch . map unwrap . f . map wrap) Tree.Leaf processSubTrees :: (Tag.Name name -> Bool) -> ([T i name str] -> [T i name str]) -> Filter i name str processSubTrees p f = lift $ Tree.mapSubTrees (\(Tag (Elem.Cons name _)) -> p name) (mapSnd (map unwrap . f . map wrap)) processSubTreesAttrs :: (Tag.Name name -> Bool) -> (([Attr.T name str], [T i name str]) -> ([Attr.T name str], [T i name str])) -> Filter i name str processSubTreesAttrs p f = lift $ Tree.mapSubTrees (\(Tag (Elem.Cons name _)) -> p name) (\(Tag (Elem.Cons name attrs), subTrees) -> mapPair (Tag . Elem.Cons name, map unwrap) $ f (attrs, map wrap subTrees)) {- * applicative functor tree processors -} instance Foldable (T i name) where foldMap f = Tree.fold (const id) (const mconcat) (maybe mempty f . maybeTextLeaf) . unwrap instance Traversable (T i name) where traverse f = liftA $ Tree.mapA (liftElementA $ traverse f) (traverse f) mapTextA :: Applicative m => (String -> m String) -> (FilterA m i name String) mapTextA f = liftA $ Tree.mapA (liftElementA $ traverse f) (liftTextA f) mapCondA :: Applicative m => (Elem.T name str -> Bool) -> (Elem.T name str -> m (Elem.T name str)) -> (Leaf name str -> m (Leaf name str)) -> (FilterA m i name str) mapCondA descend elemF txtF = liftA $ Tree.mapCondA (descend . getElement) (liftElementA elemF) txtF {- mapTextCondA :: Applicative m => (Elem.T name String -> Bool) -> (Elem.T name String -> m (Elem.T name String)) -> (String -> m String) -> (FilterA m i name String) mapTextCondA descend elemF txtF = liftA $ Tree.mapCondA (descend . getElement) (liftElementA elemF) (liftTextA txtF) -} {- * Character decoding -} unescape :: T i name XmlString.T -> T i name String unescape = fmap XmlString.toUnicodeString {- | Use ASCII characters, XML entity references and character references for representing strings. That's not human readable, but portable. -} escape :: T i name String -> T i name XmlString.T escape = fmap XmlString.fromUnicodeString {-# DEPRECATED decodeSpecialChars, maybeDecodeSpecialChars, decodeSpecialCharsDecoder, decodeAttrs, decodeAttr, maybeDecodeUTF8Chars "XmlChar.Unicode constructors must contain unicode characters and not encoded ones. Decode characters before parsing!" #-} {- | Decode characters like those from UTF-8 scheme. -} decodeSpecialChars :: (Name.Tag name, Name.Attribute name) => String -> T i name XmlString.T -> [T i name String] decodeSpecialChars enc tree = fromMaybe [unescape tree] (maybeDecodeSpecialChars enc tree) maybeDecodeSpecialChars :: (Name.Tag name, Name.Attribute name) => String -> T i name XmlString.T -> Maybe [T i name String] maybeDecodeSpecialChars enc tree = fmap (flip decodeSpecialCharsDecoder tree) $ Unicode.getDecodingFctEmbedErrors enc {- test: -- decodeSpecialChars "utf-8" $ literalIndex 0 (XmlString.fromEntString "tr&am;e") traverse (putStrLn . showHTML . escape) $ decodeSpecialChars "utf-8" $ mapText XmlString.fromEntString $ tagIndexAttr 0 "br" [("href","urlö"), ("target","_blank")] [literalIndex 0 "\195tr&am;eü"] -} {- | Conversion errors are appended as warnings to the tree. -} decodeSpecialCharsDecoder :: (Name.Tag name, Name.Attribute name) => Unicode.DecodingFctEmbedErrors -> T i name XmlString.T -> [T i name String] decodeSpecialCharsDecoder decode = let xmlDecode = HtmlString.toUnicodeStringDecodingEmbedError decode mergeDecode = XmlString.uStringWithErrorsMergePlainChars . xmlDecode in Tree.foldLabel (\i branch subTrees -> case branch of Tag (Elem.Cons name attrs) -> let (newAttrs,warnings) = runWriter $ decodeAttrs xmlDecode attrs in [tagIndexAttr i name newAttrs (map (warningIndex i) warnings ++ concat subTrees)]) (\i leaf -> map (wrap2 i . Tree.Leaf) $ case leaf of Text b str -> map (Exc.switch Warning (Text b)) (mergeDecode str) Comment cmt -> [Comment cmt] Warning str -> [Warning str] CData str -> [CData str] PI target instr0 -> let (instr1,warnings) = runWriter $ PI.mapAttributesA (decodeAttrs xmlDecode) instr0 in PI target instr1 : map Warning warnings) . unwrap decodeAttrs :: (Name.Tag name, Name.Attribute name) => (XmlString.T -> XmlString.EmbeddedExceptions) -> [Attr.T name XmlString.T] -> Writer [String] [Attr.T name String] decodeAttrs xmlDecode = traverse (\attr -> traverse (decodeAttr xmlDecode (Attr.name_ attr)) attr) decodeAttr :: (Name.Tag name, Name.Attribute name) => (XmlString.T -> XmlString.EmbeddedExceptions) -> Attr.Name name -> XmlString.T -> Writer [String] String decodeAttr decode name = censor (map (showString $ "in attribute \"" ++ Name.toString name ++ "\": ")) . writer . swap . unzipEithers . map Exc.toEither . decode maybeDecodeUTF8Chars :: String -> T i name XmlString.T -> Maybe (T i name String) maybeDecodeUTF8Chars enc tree = case map Char.toLower enc of "utf-8" -> Just (fmap XmlString.utf8ToUnicodeString tree) _ -> Nothing {- * Formatting -} {- show :: (Name.Tag name, Name.Attribute name) => T i name XmlString.T -> String show leaf = shows leaf "" -} formatMany :: (Name.Tag name, Name.Attribute name, Format.C string) => [T i name string] -> ShowS formatMany = Format.many format -- cf. src/Text/XML/HXT/DOM/XmlTreeFunctions.hs format :: (Name.Tag name, Name.Attribute name, Format.C string) => T i name string -> ShowS format = Tree.fold (flip const) formatBranch formatLeaf . unwrap formatBranch :: (Name.Tag name, Name.Attribute name, Format.C string) => Branch name string -> [ShowS] -> ShowS formatBranch branch formatSubTrees = case branch of Tag elm -> Elem.format (\_tagName -> null formatSubTrees) Format.slash elm formatSubTrees formatLeaf :: (Name.Tag name, Name.Attribute name, Format.C string) => Leaf name string -> ShowS formatLeaf leaf = case leaf of Text _ str -> Format.run str Comment c -> showString "" Warning e -> showString "" CData str -> showString "" PI target p -> Format.angle $ Format.quest . Format.name target . Format.run p . Format.quest instance (Name.Tag name, Name.Attribute name, Format.C string) => Format.C (T i name string) where run = format instance (Name.Tag name, Name.Attribute name, Format.C string) => Format.C (Leaf name string) where run = formatLeaf