-- ------------------------------------------------------------ {- | Module : Text.XML.HXT.Arrow.Edit Copyright : Copyright (C) 2006 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : experimental Portability: portable Version : $Id: Edit.hs,v 1.8 2006/11/12 14:52:59 hxml Exp $ common edit arrows -} -- ------------------------------------------------------------ module Text.XML.HXT.Arrow.Edit ( canonicalizeAllNodes , canonicalizeForXPath , canonicalizeContents , collapseAllXText , collapseXText , escapeXmlDoc , escapeHtmlDoc , haskellRepOfXmlDoc , treeRepOfXmlDoc , addHeadlineToXmlDoc , indentDoc , numberLinesInXmlDoc , removeComment , removeAllComment , removeWhiteSpace , removeAllWhiteSpace , removeDocWhiteSpace , transfCdata , transfAllCdata , transfCharRef , transfAllCharRef , hasXmlPi , addXmlPi , addXmlPiEncoding , addDoctypeDecl , addXHtmlDoctypeStrict , addXHtmlDoctypeTransitional , addXHtmlDoctypeFrameset ) where import Control.Arrow import Control.Arrow.ArrowList import Control.Arrow.ArrowIf import Control.Arrow.ArrowTree import Control.Arrow.ListArrow import Text.XML.HXT.Arrow.XmlArrow import Text.XML.HXT.DOM.Interface import qualified Text.XML.HXT.DOM.XmlNode as XN import Text.XML.HXT.DOM.Unicode ( isXmlSpaceChar ) import Text.XML.HXT.DOM.FormatXmlTree ( formatXmlTree ) import Text.XML.HXT.Parser.XmlEntities ( xmlEntities ) import Text.XML.HXT.Parser.XhtmlEntities ( xhtmlEntities ) import Data.Maybe import qualified Data.Map as M -- ------------------------------------------------------------ -- | -- Applies some "Canonical XML" rules to a document tree. -- -- The rule differ slightly for canonical XML and XPath in handling of comments -- -- Note: This is not the whole canonicalization as it is specified by the W3C -- Recommendation. Adding attribute defaults or sorting attributes in lexicographic -- order is done by the @transform@ function of module @Text.XML.HXT.Validator.Validation@. -- Replacing entities or line feed normalization is done by the parser. -- -- -- Not implemented yet: -- -- - Whitespace within start and end tags is normalized -- -- - Special characters in attribute values and character content are replaced by character references -- -- see 'canonicalizeAllNodes' and 'canonicalizeForXPath' canonicalizeTree' :: LA XmlTree XmlTree -> LA XmlTree XmlTree canonicalizeTree' toBeRemoved = processChildren (none `when` isText) >>> processBottomUp canonicalize1Node where canonicalize1Node :: LA XmlTree XmlTree canonicalize1Node = (deep isPi `when` isDTD) -- remove DTD parts, except PIs >>> (none `when` toBeRemoved) -- remove unintersting nodes >>> ( processAttrl ( processChildren transfCharRef >>> collapseXText ) `when` isElem ) >>> transfCdata -- CDATA -> text >>> transfCharRef -- Char refs -> text >>> collapseXText -- combine text -- | -- Applies some "Canonical XML" rules to a document tree. -- -- The rule differ slightly for canonical XML and XPath in handling of comments -- -- Note: This is not the whole canonicalization as it is specified by the W3C -- Recommendation. Adding attribute defaults or sorting attributes in lexicographic -- order is done by the @transform@ function of module @Text.XML.HXT.Validator.Validation@. -- Replacing entities or line feed normalization is done by the parser. -- -- Rules: remove DTD parts, processing instructions, comments and substitute char refs in attribute -- values and text -- -- Not implemented yet: -- -- - Whitespace within start and end tags is normalized -- -- - Special characters in attribute values and character content are replaced by character references canonicalizeAllNodes :: ArrowList a => a XmlTree XmlTree canonicalizeAllNodes = fromLA $ canonicalizeTree' ( isCmt -- remove comment <+> isXmlPi -- remove xml declaration ) -- | -- Canonicalize a tree for XPath -- Like 'canonicalizeAllNodes' but comment nodes are not removed -- -- see 'canonicalizeAllNodes' canonicalizeForXPath :: ArrowList a => a XmlTree XmlTree canonicalizeForXPath = fromLA $ canonicalizeTree' isXmlPi -- | -- Canonicalize the contents of a document -- -- substitutes all char refs in text and attribute values, -- removes CDATA section and combines all sequences of resulting text -- nodes into a single text node -- -- see 'canonicalizeAllNodes' canonicalizeContents :: ArrowList a => a XmlTree XmlTree canonicalizeContents = fromLA $ processBottomUp canonicalize1Node where canonicalize1Node :: LA XmlTree XmlTree canonicalize1Node = ( processAttrl ( processChildren transfCharRef >>> collapseXText ) `when` isElem ) >>> transfCdata -- CDATA -> text >>> transfCharRef -- Char refs -> text >>> collapseXText -- combine text -- ------------------------------------------------------------ collapseXText' :: LA XmlTree XmlTree collapseXText' = replaceChildren ( listA getChildren >>> arrL (foldr mergeText' []) ) where mergeText' :: XmlTree -> XmlTrees -> XmlTrees mergeText' t1 (t2 : ts2) | XN.isText t1 && XN.isText t2 = let s1 = fromJust . XN.getText $ t1 s2 = fromJust . XN.getText $ t2 t = XN.mkText (s1 ++ s2) in t : ts2 mergeText' t1 ts = t1 : ts -- | -- Collects sequences of text nodes in the list of children of a node into one single text node. -- This is useful, e.g. after char and entity reference substitution collapseXText :: ArrowList a => a XmlTree XmlTree collapseXText = fromLA $ collapseXText' -- | -- Applies collapseXText recursively. -- -- -- see also : 'collapseXText' collapseAllXText :: ArrowList a => a XmlTree XmlTree collapseAllXText = fromLA $ processBottomUp collapseXText' -- ------------------------------------------------------------ -- | -- escape XmlText, -- transform all special XML chars into char- or entity- refs type EntityRefTable = M.Map Int String xmlEntityRefTable , xhtmlEntityRefTable :: EntityRefTable xmlEntityRefTable = buildEntityRefTable $ xmlEntities xhtmlEntityRefTable = buildEntityRefTable $ xhtmlEntities buildEntityRefTable :: [(String, Int)] -> EntityRefTable buildEntityRefTable = M.fromList . map (\ (x,y) -> (y,x) ) escapeText'' :: (Char -> XmlTree) -> (Char -> Bool) -> XmlTree -> XmlTrees escapeText'' escChar isEsc t = maybe [t] escape . XN.getText $ t where escape [] = [] escape (c:s1) | isEsc c = escChar c : escape s1 escape s = XN.mkText s1 : escape s2 where (s1, s2) = break isEsc s {- escapeCharRef :: Char -> XmlTree escapeCharRef = XN.mkCharRef . fromEnum -} escapeEntityRef :: EntityRefTable -> Char -> XmlTree escapeEntityRef entityTable c = maybe (XN.mkCharRef c') XN.mkEntityRef . M.lookup c' $ entityTable where c' = fromEnum c escapeXmlEntityRef :: Char -> XmlTree escapeXmlEntityRef = escapeEntityRef xmlEntityRefTable escapeHtmlEntityRef :: Char -> XmlTree escapeHtmlEntityRef = escapeEntityRef xhtmlEntityRefTable -- | -- escape all special XML chars into XML entity references or char references -- -- convert the special XML chars \< and & in text nodes into prefefiened XML entity references, -- in attribute values also \', \", \>, \\n, \\r and \\t are converted into entity or char references, -- in comments nothing is converted (see XML standard 2.4, useful e.g. for JavaScript). escapeXmlDoc :: ArrowList a => a XmlTree XmlTree escapeXmlDoc = fromLA $ escapeDoc escXmlText escXmlAttrValue where escXmlText = arrL $ escapeText'' escapeXmlEntityRef (`elem` "<&") -- no escape for ", ' and > required: XML standard 2.4 escXmlAttrValue = arrL $ escapeText'' escapeXmlEntityRef (`elem` "<>\"\'&\n\r\t") -- | -- escape all special HTML chars into XHTML entity references or char references -- -- convert the special XML chars \< and & and all none ASCII chars in text nodes into prefefiened XML or XHTML entity references, -- in attribute values also \', \", \>, \\n, \\r and \\t are converted into entity or char references, -- in comments nothing is converted escapeHtmlDoc :: ArrowList a => a XmlTree XmlTree escapeHtmlDoc = fromLA $ escapeDoc escHtmlText escHtmlAttrValue where escHtmlText = arrL $ escapeText'' escapeHtmlEntityRef isHtmlTextEsc escHtmlAttrValue = arrL $ escapeText'' escapeHtmlEntityRef isHtmlAttrEsc isHtmlTextEsc c = c >= toEnum(128) || ( c `elem` "<&" ) isHtmlAttrEsc c = c >= toEnum(128) || ( c `elem` "<>\"\'&\n\r\t" ) escapeDoc :: LA XmlTree XmlTree -> LA XmlTree XmlTree -> LA XmlTree XmlTree escapeDoc escText escAttr = escape where escape = choiceA [ isElem :-> ( processChildren escape >>> processAttrl escVal ) , isText :-> escText -- , isCmt :-> escCmt , isDTD :-> processTopDown escDTD , this :-> this ] escVal = processChildren escAttr escDTD = escVal `when` ( isDTDEntity <+> isDTDPEntity ) -- ------------------------------------------------------------ -- | -- convert a document into a Haskell representation (with show). -- -- Useful for debugging and trace output. -- see also : 'treeRepOfXmlDoc', 'numberLinesInXmlDoc' haskellRepOfXmlDoc :: ArrowList a => a XmlTree XmlTree haskellRepOfXmlDoc = fromLA $ root [getAttrl] [show ^>> mkText] -- | -- convert a document into a text and add line numbers to the text representation. -- -- Result is a root node with a single text node as child. -- Useful for debugging and trace output. -- see also : 'haskellRepOfXmlDoc', 'treeRepOfXmlDoc' numberLinesInXmlDoc :: ArrowList a => a XmlTree XmlTree numberLinesInXmlDoc = fromLA $ processChildren (changeText numberLines) where numberLines :: String -> String numberLines str = concat $ zipWith (\ n l -> lineNr n ++ l ++ "\n") [1..] (lines str) where lineNr :: Int -> String lineNr n = (reverse (take 6 (reverse (show n) ++ replicate 6 ' '))) ++ " " -- | -- convert a document into a text representation in tree form. -- -- Useful for debugging and trace output. -- see also : 'haskellRepOfXmlDoc', 'numberLinesInXmlDoc' treeRepOfXmlDoc :: ArrowList a => a XmlTree XmlTree treeRepOfXmlDoc = fromLA $ root [getAttrl] [formatXmlTree ^>> mkText] addHeadlineToXmlDoc :: ArrowXml a => a XmlTree XmlTree addHeadlineToXmlDoc = fromLA $ ( addTitle $< (getAttrValue a_source >>^ formatTitle) ) where addTitle str = replaceChildren ( txt str <+> getChildren <+> txt "\n" ) formatTitle str = "\n" ++ headline ++ "\n" ++ underline ++ "\n\n" where headline = "content of: " ++ str underline = map (const '=') headline -- ------------------------------------------------------------ removeComment' :: LA XmlTree XmlTree removeComment' = none `when` isCmt -- | -- remove Comments: @none `when` isCmt@ removeComment :: ArrowXml a => a XmlTree XmlTree removeComment = fromLA $ removeComment' -- | -- remove all comments recursively removeAllComment :: ArrowXml a => a XmlTree XmlTree removeAllComment = fromLA $ processBottomUp removeComment' -- ---------- removeWhiteSpace' :: LA XmlTree XmlTree removeWhiteSpace' = none `when` isWhiteSpace -- | -- simple filter for removing whitespace. -- -- no check on sigificant whitespace, e.g. in HTML \-elements, is done. -- -- -- see also : 'removeAllWhiteSpace', 'removeDocWhiteSpace' removeWhiteSpace :: ArrowXml a => a XmlTree XmlTree removeWhiteSpace = fromLA $ removeWhiteSpace' -- | -- simple recursive filter for removing all whitespace. -- -- removes all text nodes in a tree that consist only of whitespace. -- -- -- see also : 'removeWhiteSpace', 'removeDocWhiteSpace' removeAllWhiteSpace :: ArrowXml a => a XmlTree XmlTree removeAllWhiteSpace = fromLA $ processBottomUp removeWhiteSpace' -- | -- filter for removing all not significant whitespace. -- -- the tree traversed for removing whitespace between elements, -- that was inserted for indentation and readability. -- whitespace is only removed at places, where it's not significat -- preserving whitespace may be controlled in a document tree -- by a tag attribute @xml:space@ -- -- allowed values for this attribute are @default | preserve@ -- -- input is root node of the document to be cleaned up, -- output the semantically equivalent simplified tree -- -- -- see also : 'indentDoc', 'removeAllWhiteSpace' removeDocWhiteSpace :: ArrowXml a => a XmlTree XmlTree removeDocWhiteSpace = fromLA $ removeRootWhiteSpace removeRootWhiteSpace :: LA XmlTree XmlTree removeRootWhiteSpace = processChildren processRootElement `when` isRoot where processRootElement :: LA XmlTree XmlTree processRootElement = removeWhiteSpace >>> processChild where processChild = choiceA [ isDTD :-> removeAllWhiteSpace -- whitespace in DTD is redundant , this :-> replaceChildren ( getChildren >>. indentTrees insertNothing False 1 ) ] -- ------------------------------------------------------------ -- | -- filter for indenting a document tree for pretty printing. -- -- the tree is traversed for inserting whitespace for tag indentation. -- -- whitespace is only inserted or changed at places, where it isn't significant, -- is's not inserted between tags and text containing non whitespace chars. -- -- whitespace is only inserted or changed at places, where it's not significant. -- preserving whitespace may be controlled in a document tree -- by a tag attribute @xml:space@ -- -- allowed values for this attribute are @default | preserve@. -- -- input is a complete document tree or a document fragment -- result is the semantically equivalent formatted tree. -- -- -- see also : 'removeDocWhiteSpace' indentDoc :: ArrowXml a => a XmlTree XmlTree indentDoc = fromLA $ ( ( isRoot `guards` indentRoot ) `orElse` (root [] [this] >>> indentRoot >>> getChildren) ) -- ------------------------------------------------------------ indentRoot :: LA XmlTree XmlTree indentRoot = processChildren indentRootChildren where indentRootChildren = removeText >>> indentChild >>> insertNL where removeText = none `when` isText insertNL = this <+> txt "\n" indentChild = ( replaceChildren ( getChildren >>. indentTrees (insertIndentation 2) False 1 ) `whenNot` isDTD ) -- ------------------------------------------------------------ -- -- copied from EditFilter and rewritten for arrows -- to remove dependency to the filter module indentTrees :: (Int -> LA XmlTree XmlTree) -> Bool -> Int -> XmlTrees -> XmlTrees indentTrees _ _ _ [] = [] indentTrees indentFilter preserveSpace level ts = runLAs lsf ls ++ indentRest rs where runLAs f l = runLA (constL l >>> f) undefined (ls, rs) = break XN.isElem ts isSignificant :: Bool isSignificant = preserveSpace || (not . null . runLAs isSignificantPart) ls isSignificantPart :: LA XmlTree XmlTree isSignificantPart = catA [ isText `guards` neg isWhiteSpace , isCdata , isCharRef , isEntityRef ] lsf :: LA XmlTree XmlTree lsf | isSignificant = this | otherwise = (none `when` isWhiteSpace) >>> (indentFilter level <+> this) indentRest :: XmlTrees -> XmlTrees indentRest [] | isSignificant = [] | otherwise = runLA (indentFilter (level - 1)) undefined indentRest (t':ts') = runLA ( ( indentElem >>> lsf ) `when` isElem ) t' ++ ( if null ts' then indentRest else indentTrees indentFilter preserveSpace level ) ts' where indentElem = replaceChildren ( getChildren >>. indentChildren ) xmlSpaceAttrValue :: String xmlSpaceAttrValue = concat . runLA (getAttrValue "xml:space") $ t' preserveSpace' :: Bool preserveSpace' = ( fromMaybe preserveSpace . lookup xmlSpaceAttrValue ) [ ("preserve", True) , ("default", False) ] indentChildren :: XmlTrees -> XmlTrees indentChildren cs' | all (maybe False (all isXmlSpaceChar) . XN.getText) cs' = [] | otherwise = indentTrees indentFilter preserveSpace' (level + 1) cs' -- filter for indenting elements insertIndentation :: Int -> Int -> LA a XmlTree insertIndentation indentWidth level = txt ('\n' : replicate (level * indentWidth) ' ') -- filter for removing all whitespace insertNothing :: Int -> LA a XmlTree insertNothing _ = none -- ------------------------------------------------------------ transfCdata' :: LA XmlTree XmlTree transfCdata' = (getCdata >>> mkText) `when` isCdata -- | -- converts a CDATA section node into a normal text node transfCdata :: ArrowXml a => a XmlTree XmlTree transfCdata = fromLA $ transfCdata' -- | -- converts CDATA sections in whole document tree into normal text nodes transfAllCdata :: ArrowXml a => a XmlTree XmlTree transfAllCdata = fromLA $ processBottomUp transfCdata' -- transfCharRef' :: LA XmlTree XmlTree transfCharRef' = ( getCharRef >>> arr (\ i -> [toEnum i]) >>> mkText ) `when` isCharRef -- | -- converts character references to normal text transfCharRef :: ArrowXml a => a XmlTree XmlTree transfCharRef = fromLA $ transfCharRef' -- | -- recursively converts all character references to normal text transfAllCharRef :: ArrowXml a => a XmlTree XmlTree transfAllCharRef = fromLA $ processBottomUp transfCharRef' -- ------------------------------------------------------------ hasXmlPi :: ArrowXml a => a XmlTree XmlTree hasXmlPi = fromLA ( getChildren >>> isPi >>> hasName t_xml ) -- | add an \ processing instruction -- if it's not already there addXmlPi :: ArrowXml a => a XmlTree XmlTree addXmlPi = fromLA ( insertChildrenAt 0 ( ( mkPi (mkSNsName t_xml) none >>> addAttr a_version "1.0" ) <+> txt "\n" ) `whenNot` hasXmlPi ) -- | add an encoding spec to the \ processing instruction addXmlPiEncoding :: ArrowXml a => String -> a XmlTree XmlTree addXmlPiEncoding enc = fromLA $ processChildren ( addAttr a_encoding enc `when` ( isPi >>> hasName t_xml ) ) -- | add an XHTML strict doctype declaration to a document addXHtmlDoctypeStrict , addXHtmlDoctypeTransitional , addXHtmlDoctypeFrameset :: ArrowXml a => a XmlTree XmlTree -- | add an XHTML strict doctype declaration to a document addXHtmlDoctypeStrict = addDoctypeDecl "html" "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" -- | add an XHTML transitional doctype declaration to a document addXHtmlDoctypeTransitional = addDoctypeDecl "html" "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" -- | add an XHTML frameset doctype declaration to a document addXHtmlDoctypeFrameset = addDoctypeDecl "html" "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" -- | add a doctype declaration to a document -- -- The arguments are the root element name, the PUBLIC id and the SYSTEM id addDoctypeDecl :: ArrowXml a => String -> String -> String -> a XmlTree XmlTree addDoctypeDecl rootElem public system = fromLA $ replaceChildren ( mkDTDDoctype [ (a_name, rootElem) , (k_public, public) , (k_system, system) ] none <+> txt "\n" <+> getChildren ) -- ------------------------------------------------------------