{- This module contains code for escaping/unescaping text in attributes and elements in the HaXml Element type, replacing characters by character references or vice-versa. Two uses are envisaged for this: (1) stopping HaXml generating incorrect XML when a character is included which is also the appropriate XML terminating character, for example when an attribute includes a double quote. (2) representing XML which contains non-ASCII characters as ASCII. -} module Text.XML.HaXml.Escape( xmlEscape, -- :: XmlEscaper -> Element i -> Element i xmlUnEscape, -- :: XmlEscaper -> Element i -> Element i xmlEscapeContent, -- :: XmlEscaper -> [Content i] -> [Content i] xmlUnEscapeContent, -- :: XmlEscaper -> [Content i] -> [Content i] XmlEscaper, -- Something describing a particular set of escapes. stdXmlEscaper, -- Standard boilerplate escaper, escaping everything that is -- nonprintable, non-ASCII, or might conceivably cause problems by -- parsing XML, for example quotes, < signs, and ampersands. mkXmlEscaper, -- :: [(Char,String)] -> (Char -> Bool) -> XmlEscaper -- The first argument contains a list of characters, with their -- corresponding character reference names. -- For example [('\60',"lt"),('\62',"gt"),('\38',"amp"), -- ('\39',"apos"),('\34',"quot")] will give you the "standard" -- XML escapes listed in section 4.6 of the XML standard, so that -- """ will automatically get translated into a double -- quotation mark. -- -- It's the caller's responsibility to see that the reference -- names ("lt","gt","amp","apos" and "quot" in the above example) -- are valid XML reference names. A sequence of letters, digits, -- "." or ":" characters should be fine so long as the first one -- isn't a digit. -- -- The second argument is a function applied to each text character. -- If it returns True, that means we should escape this character. -- Policy: on escaping, we expand all characters for which the -- (Char -> Bool) function returns True, either giving the corresponding -- character reference name if one was supplied, or else using a -- hexadecimal CharRef. -- -- on unescaping, we translate all the references we understand -- (hexadecimal,decimal, and the ones in the [(Char,String)] list, -- and leave the others alone. ) where import Char import Numeric import Text.XML.HaXml.Types #if __GLASGOW_HASKELL__ >= 604 || __NHC__ >= 118 || defined(__HUGS__) -- emulate older finite map interface using Data.Map, if it is available import qualified Data.Map as Map type FiniteMap a b = Map.Map a b listToFM :: Ord a => [(a,b)] -> FiniteMap a b listToFM = Map.fromList lookupFM :: Ord a => FiniteMap a b -> a -> Maybe b lookupFM = flip Map.lookup #elif __GLASGOW_HASKELL__ >= 504 || __NHC__ > 114 -- real finite map, if it is available import Data.FiniteMap #else -- otherwise, a very simple and inefficient implementation of a finite map type FiniteMap a b = [(a,b)] listToFM :: Eq a => [(a,b)] -> FiniteMap a b listToFM = id lookupFM :: Eq a => FiniteMap a b -> a -> Maybe b lookupFM fm k = lookup k fm #endif -- ------------------------------------------------------------------------ -- Data types -- ------------------------------------------------------------------------ data XmlEscaper = XmlEscaper { toEscape :: FiniteMap Char String, fromEscape :: FiniteMap String Char, isEscape :: Char -> Bool } -- ------------------------------------------------------------------------ -- Escaping -- ------------------------------------------------------------------------ xmlEscape :: XmlEscaper -> Element i -> Element i xmlEscape xmlEscaper element = compressElement (escapeElement xmlEscaper element) xmlEscapeContent :: XmlEscaper -> [Content i] -> [Content i] xmlEscapeContent xmlEscaper cs = compressContent (escapeContent xmlEscaper cs) escapeElement :: XmlEscaper -> Element i -> Element i escapeElement xmlEscaper (Elem name attributes content) = Elem name (escapeAttributes xmlEscaper attributes) (escapeContent xmlEscaper content) escapeAttributes :: XmlEscaper -> [Attribute] -> [Attribute] escapeAttributes xmlEscaper atts = map (\ (name,av) -> (name,escapeAttValue xmlEscaper av)) atts escapeAttValue :: XmlEscaper -> AttValue -> AttValue escapeAttValue xmlEscaper (AttValue attValList) = AttValue ( concat ( map (\ av -> case av of Right ref -> [av] Left s -> map (\ c -> if isEscape xmlEscaper c then Right (mkEscape xmlEscaper c) else Left [c] ) s ) attValList ) ) escapeContent :: XmlEscaper -> [Content i] -> [Content i] escapeContent xmlEscaper contents = concat (map (\ content -> case content of (CString b str i) -> map (\ c -> if isEscape xmlEscaper c then CRef (mkEscape xmlEscaper c) i else CString b [c] i ) str (CElem elem i) -> [CElem (escapeElement xmlEscaper elem) i] _ -> [content] ) contents ) mkEscape :: XmlEscaper -> Char -> Reference mkEscape (XmlEscaper {toEscape = toEscape}) ch = case lookupFM toEscape ch of Nothing -> RefChar (ord ch) Just str -> RefEntity str where showHex = showIntAtBase 16 intToDigit -- It should be, but in GHC it isn't. -- ------------------------------------------------------------------------ -- Unescaping -- ------------------------------------------------------------------------ xmlUnEscape :: XmlEscaper -> Element i -> Element i xmlUnEscape xmlEscaper element = compressElement (unEscapeElement xmlEscaper element) xmlUnEscapeContent :: XmlEscaper -> [Content i] -> [Content i] xmlUnEscapeContent xmlEscaper cs = compressContent (unEscapeContent xmlEscaper cs) unEscapeElement :: XmlEscaper -> Element i -> Element i unEscapeElement xmlEscaper (Elem name attributes content) = Elem name (unEscapeAttributes xmlEscaper attributes) (unEscapeContent xmlEscaper content) unEscapeAttributes :: XmlEscaper -> [Attribute] -> [Attribute] unEscapeAttributes xmlEscaper atts = map (\ (name,av) -> (name,unEscapeAttValue xmlEscaper av)) atts unEscapeAttValue :: XmlEscaper -> AttValue -> AttValue unEscapeAttValue xmlEscaper (AttValue attValList) = AttValue ( map (\ av -> case av of Left s -> av Right ref -> case unEscapeChar xmlEscaper ref of Just c -> Left [c] Nothing -> av ) attValList ) unEscapeContent :: XmlEscaper -> [Content i] -> [Content i] unEscapeContent xmlEscaper content = map (\ content -> case content of CRef ref i -> case unEscapeChar xmlEscaper ref of Just c -> CString False [c] i Nothing -> content CElem elem i -> CElem (unEscapeElement xmlEscaper elem) i _ -> content ) content unEscapeChar :: XmlEscaper -> Reference -> Maybe Char unEscapeChar xmlEscaper ref = case ref of RefChar i -> Just (chr i) RefEntity name -> lookupFM (fromEscape xmlEscaper) name -- ------------------------------------------------------------------------ -- After escaping and unescaping we rebuild the lists, compressing -- adjacent identical character data. -- ------------------------------------------------------------------------ compressElement :: Element i -> Element i compressElement (Elem name attributes content) = Elem name (compressAttributes attributes) (compressContent content) compressAttributes :: [(Name,AttValue)] -> [(Name,AttValue)] compressAttributes atts = map (\ (name,av) -> (name,compressAttValue av)) atts compressAttValue :: AttValue -> AttValue compressAttValue (AttValue l) = AttValue (compress l) where compress :: [Either String Reference] -> [Either String Reference] compress [] = [] compress (Right ref : es) = Right ref : (compress es) compress ( (ls @ (Left s1)) : es) = case compress es of (Left s2 : es2) -> Left (s1 ++ s2) : es2 es2 -> ls : es2 compressContent :: [Content i] -> [Content i] compressContent [] = [] compressContent ((csb @ (CString b1 s1 i1)) : cs) = case compressContent cs of (CString b2 s2 i2) : cs2 | b1 == b2 -> CString b1 (s1 ++ s2) i1: cs2 cs2 -> csb : cs2 compressContent (CElem element i : cs) = CElem (compressElement element) i : compressContent cs compressContent (c : cs) = c : compressContent cs -- ------------------------------------------------------------------------ -- Making XmlEscaper values. -- ------------------------------------------------------------------------ stdXmlEscaper :: XmlEscaper stdXmlEscaper = mkXmlEscaper [('\60',"lt"),('\62',"gt"),('\38',"amp"),('\39',"apos"),('\34',"quot")] (\ ch -> let i = ord ch in i < 10 || (10= 127 || case ch of '\'' -> True '\"' -> True '&' -> True '<' -> True '>' -> True _ -> False ) mkXmlEscaper :: [(Char,String)] -> (Char -> Bool) -> XmlEscaper mkXmlEscaper escapes isEscape = XmlEscaper { toEscape = listToFM escapes, fromEscape = listToFM (map (\ (c,str) -> (str,c)) escapes), isEscape = isEscape }