{- This module contains code for escaping/unescaping text in attributes
   and elements in the HaXml Element type, replacing characters by character
   references or vice-versa.  Two uses are envisaged for this:

   (1) stopping HaXml generating incorrect XML when a character is included
       which is also the appropriate XML terminating character, for example
       when an attribute includes a double quote.
   (2) representing XML which contains non-ASCII characters as ASCII.
   -}
module Text.XML.HaXml.Escape(
   xmlEscape,
      -- :: XmlEscaper -> Element i -> Element i
   xmlUnEscape,
      -- :: XmlEscaper -> Element i -> Element i
   xmlEscapeContent,
      -- :: XmlEscaper -> [Content i] -> [Content i]
   xmlUnEscapeContent,
      -- :: XmlEscaper -> [Content i] -> [Content i]

   XmlEscaper,
      -- Something describing a particular set of escapes.

   stdXmlEscaper,
      -- Standard boilerplate escaper, escaping everything that is
      -- nonprintable, non-ASCII, or might conceivably cause problems by
      -- parsing XML, for example quotes, < signs, and ampersands.

   mkXmlEscaper,
      -- :: [(Char,String)] -> (Char -> Bool) -> XmlEscaper
      -- The first argument contains a list of characters, with their
      --    corresponding character reference names.
      --    For example [('\60',"lt"),('\62',"gt"),('\38',"amp"),
      --       ('\39',"apos"),('\34',"quot")] will give you the "standard"
      --       XML escapes listed in section 4.6 of the XML standard, so that
      --       "&quot;" will automatically get translated into a double
      --       quotation mark.
      --
      --       It's the caller's responsibility to see that the reference
      --       names ("lt","gt","amp","apos" and "quot" in the above example)
      --       are valid XML reference names.  A sequence of letters, digits,
      --       "." or ":" characters should be fine so long as the first one
      --       isn't a digit.
      --
      -- The second argument is a function applied to each text character.
      --    If it returns True, that means we should escape this character.

      -- Policy: on escaping, we expand all characters for which the
      -- (Char -> Bool) function returns True, either giving the corresponding
      -- character reference name if one was supplied, or else using a
      -- hexadecimal CharRef.
      --
      -- on unescaping, we translate all the references we understand
      --   (hexadecimal,decimal, and the ones in the [(Char,String)] list,
      --   and leave the others alone.

   ) where

import Data.Char
-- import Numeric
import Text.XML.HaXml.Types

#if __GLASGOW_HASKELL__ >= 604 || __NHC__ >= 118 || defined(__HUGS__)
-- emulate older finite map interface using Data.Map, if it is available
import qualified Data.Map as Map
type FiniteMap a b = Map.Map a b
listToFM :: Ord a => [(a,b)] -> FiniteMap a b
listToFM :: forall a b. Ord a => [(a, b)] -> FiniteMap a b
listToFM = forall a b. Ord a => [(a, b)] -> FiniteMap a b
Map.fromList
lookupFM :: Ord a => FiniteMap a b -> a -> Maybe b
lookupFM :: forall a b. Ord a => FiniteMap a b -> a -> Maybe b
lookupFM = forall a b c. (a -> b -> c) -> b -> a -> c
flip forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup
#elif __GLASGOW_HASKELL__ >= 504 || __NHC__ > 114
-- real finite map, if it is available
import Data.FiniteMap
#else
-- otherwise, a very simple and inefficient implementation of a finite map
type FiniteMap a b = [(a,b)]
listToFM :: Eq a => [(a,b)] -> FiniteMap a b
listToFM = id
lookupFM :: Eq a => FiniteMap a b -> a -> Maybe b
lookupFM fm k = lookup k fm
#endif


-- ------------------------------------------------------------------------
-- Data types
-- ------------------------------------------------------------------------

data XmlEscaper = XmlEscaper {
   XmlEscaper -> FiniteMap Char String
toEscape :: FiniteMap Char String,
   XmlEscaper -> FiniteMap String Char
fromEscape :: FiniteMap String Char,
   XmlEscaper -> Char -> Bool
isEscape :: Char -> Bool
   }


-- ------------------------------------------------------------------------
-- Escaping
-- ------------------------------------------------------------------------



xmlEscape :: XmlEscaper -> Element i -> Element i
xmlEscape :: forall i. XmlEscaper -> Element i -> Element i
xmlEscape XmlEscaper
xmlEscaper Element i
element =
   forall i. Element i -> Element i
compressElement (forall i. XmlEscaper -> Element i -> Element i
escapeElement XmlEscaper
xmlEscaper Element i
element)

xmlEscapeContent :: XmlEscaper -> [Content i] -> [Content i]
xmlEscapeContent :: forall i. XmlEscaper -> [Content i] -> [Content i]
xmlEscapeContent XmlEscaper
xmlEscaper [Content i]
cs =
   forall i. [Content i] -> [Content i]
compressContent (forall i. XmlEscaper -> [Content i] -> [Content i]
escapeContent XmlEscaper
xmlEscaper [Content i]
cs)

escapeElement :: XmlEscaper -> Element i -> Element i
escapeElement :: forall i. XmlEscaper -> Element i -> Element i
escapeElement XmlEscaper
xmlEscaper (Elem QName
name [Attribute]
attributes [Content i]
content) =
   forall i. QName -> [Attribute] -> [Content i] -> Element i
Elem QName
name (XmlEscaper -> [Attribute] -> [Attribute]
escapeAttributes XmlEscaper
xmlEscaper [Attribute]
attributes)
      (forall i. XmlEscaper -> [Content i] -> [Content i]
escapeContent XmlEscaper
xmlEscaper [Content i]
content)

escapeAttributes :: XmlEscaper -> [Attribute] -> [Attribute]
escapeAttributes :: XmlEscaper -> [Attribute] -> [Attribute]
escapeAttributes XmlEscaper
xmlEscaper =
   forall a b. (a -> b) -> [a] -> [b]
map (\ (QName
name,AttValue
av) -> (QName
name,XmlEscaper -> AttValue -> AttValue
escapeAttValue XmlEscaper
xmlEscaper AttValue
av))

escapeAttValue :: XmlEscaper -> AttValue -> AttValue
escapeAttValue :: XmlEscaper -> AttValue -> AttValue
escapeAttValue XmlEscaper
xmlEscaper (AttValue [Either String Reference]
attValList) =
   [Either String Reference] -> AttValue
AttValue (
      forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
            (\ Either String Reference
av -> case Either String Reference
av of
               Right Reference
_ -> [Either String Reference
av]
               Left String
s ->
                  forall a b. (a -> b) -> [a] -> [b]
map
                     (\ Char
c -> if XmlEscaper -> Char -> Bool
isEscape XmlEscaper
xmlEscaper Char
c
                        then
                           forall a b. b -> Either a b
Right (XmlEscaper -> Char -> Reference
mkEscape XmlEscaper
xmlEscaper Char
c)
                        else
                           forall a b. a -> Either a b
Left [Char
c]
                        )
                     String
s
               )
            [Either String Reference]
attValList
      )

escapeContent :: XmlEscaper -> [Content i] -> [Content i]
escapeContent :: forall i. XmlEscaper -> [Content i] -> [Content i]
escapeContent XmlEscaper
xmlEscaper =
   forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap
          (\ Content i
content -> case Content i
content of
             (CString Bool
b String
str i
i) ->
                forall a b. (a -> b) -> [a] -> [b]
map
                   (\ Char
c -> if XmlEscaper -> Char -> Bool
isEscape XmlEscaper
xmlEscaper Char
c
                      then
                         forall i. Reference -> i -> Content i
CRef (XmlEscaper -> Char -> Reference
mkEscape XmlEscaper
xmlEscaper Char
c) i
i
                      else
                         forall i. Bool -> String -> i -> Content i
CString Bool
b [Char
c] i
i
                      )
                   String
str
             (CElem Element i
element i
i) -> [forall i. Element i -> i -> Content i
CElem (forall i. XmlEscaper -> Element i -> Element i
escapeElement XmlEscaper
xmlEscaper Element i
element) i
i]
             Content i
_ -> [Content i
content]
             )

mkEscape :: XmlEscaper -> Char -> Reference
mkEscape :: XmlEscaper -> Char -> Reference
mkEscape (XmlEscaper {toEscape :: XmlEscaper -> FiniteMap Char String
toEscape = FiniteMap Char String
toescape}) Char
ch =
   case forall a b. Ord a => FiniteMap a b -> a -> Maybe b
lookupFM FiniteMap Char String
toescape Char
ch of
      Maybe String
Nothing  -> Int -> Reference
RefChar (Char -> Int
ord Char
ch)
      Just String
str -> String -> Reference
RefEntity String
str
--    where
--       _ = showIntAtBase 16 intToDigit
--       -- It should be, but in GHC it isn't.

-- ------------------------------------------------------------------------
-- Unescaping
-- ------------------------------------------------------------------------

xmlUnEscape :: XmlEscaper -> Element i -> Element i
xmlUnEscape :: forall i. XmlEscaper -> Element i -> Element i
xmlUnEscape XmlEscaper
xmlEscaper Element i
element =
   forall i. Element i -> Element i
compressElement (forall i. XmlEscaper -> Element i -> Element i
unEscapeElement XmlEscaper
xmlEscaper Element i
element)

xmlUnEscapeContent :: XmlEscaper -> [Content i] -> [Content i]
xmlUnEscapeContent :: forall i. XmlEscaper -> [Content i] -> [Content i]
xmlUnEscapeContent XmlEscaper
xmlEscaper [Content i]
cs =
   forall i. [Content i] -> [Content i]
compressContent (forall i. XmlEscaper -> [Content i] -> [Content i]
unEscapeContent XmlEscaper
xmlEscaper [Content i]
cs)

unEscapeElement :: XmlEscaper -> Element i -> Element i
unEscapeElement :: forall i. XmlEscaper -> Element i -> Element i
unEscapeElement XmlEscaper
xmlEscaper (Elem QName
name [Attribute]
attributes [Content i]
content) =
   forall i. QName -> [Attribute] -> [Content i] -> Element i
Elem QName
name (XmlEscaper -> [Attribute] -> [Attribute]
unEscapeAttributes XmlEscaper
xmlEscaper [Attribute]
attributes)
      (forall i. XmlEscaper -> [Content i] -> [Content i]
unEscapeContent XmlEscaper
xmlEscaper [Content i]
content)

unEscapeAttributes :: XmlEscaper -> [Attribute] -> [Attribute]
unEscapeAttributes :: XmlEscaper -> [Attribute] -> [Attribute]
unEscapeAttributes XmlEscaper
xmlEscaper =
   forall a b. (a -> b) -> [a] -> [b]
map (\ (QName
name,AttValue
av) -> (QName
name,XmlEscaper -> AttValue -> AttValue
unEscapeAttValue XmlEscaper
xmlEscaper AttValue
av))

unEscapeAttValue :: XmlEscaper -> AttValue -> AttValue
unEscapeAttValue :: XmlEscaper -> AttValue -> AttValue
unEscapeAttValue XmlEscaper
xmlEscaper (AttValue [Either String Reference]
attValList) =
   [Either String Reference] -> AttValue
AttValue (
      forall a b. (a -> b) -> [a] -> [b]
map
         (\ Either String Reference
av -> case Either String Reference
av of
            Left String
_ -> Either String Reference
av
            Right Reference
ref -> case XmlEscaper -> Reference -> Maybe Char
unEscapeChar XmlEscaper
xmlEscaper Reference
ref of
               Just Char
c -> forall a b. a -> Either a b
Left [Char
c]
               Maybe Char
Nothing -> Either String Reference
av
            )
         [Either String Reference]
attValList
      )

unEscapeContent :: XmlEscaper -> [Content i] -> [Content i]
unEscapeContent :: forall i. XmlEscaper -> [Content i] -> [Content i]
unEscapeContent XmlEscaper
xmlEscaper =
   forall a b. (a -> b) -> [a] -> [b]
map
      (\ Content i
cntnt -> case Content i
cntnt of
         CRef Reference
ref i
i -> case XmlEscaper -> Reference -> Maybe Char
unEscapeChar XmlEscaper
xmlEscaper Reference
ref of
            Just Char
c -> forall i. Bool -> String -> i -> Content i
CString Bool
False [Char
c] i
i
            Maybe Char
Nothing -> Content i
cntnt
         CElem Element i
element i
i -> forall i. Element i -> i -> Content i
CElem (forall i. XmlEscaper -> Element i -> Element i
unEscapeElement XmlEscaper
xmlEscaper Element i
element) i
i
         Content i
_ -> Content i
cntnt
         )

unEscapeChar :: XmlEscaper -> Reference -> Maybe Char
unEscapeChar :: XmlEscaper -> Reference -> Maybe Char
unEscapeChar XmlEscaper
xmlEscaper Reference
ref =
   case Reference
ref of
      RefChar Int
i      -> forall a. a -> Maybe a
Just (Int -> Char
chr Int
i)
      RefEntity String
name -> forall a b. Ord a => FiniteMap a b -> a -> Maybe b
lookupFM (XmlEscaper -> FiniteMap String Char
fromEscape XmlEscaper
xmlEscaper) String
name

-- ------------------------------------------------------------------------
-- After escaping and unescaping we rebuild the lists, compressing
-- adjacent identical character data.
-- ------------------------------------------------------------------------

compressElement :: Element i -> Element i
compressElement :: forall i. Element i -> Element i
compressElement (Elem QName
name [Attribute]
attributes [Content i]
content) =
   forall i. QName -> [Attribute] -> [Content i] -> Element i
Elem QName
name ([Attribute] -> [Attribute]
compressAttributes [Attribute]
attributes) (forall i. [Content i] -> [Content i]
compressContent [Content i]
content)

compressAttributes :: [(QName,AttValue)] -> [(QName,AttValue)]
compressAttributes :: [Attribute] -> [Attribute]
compressAttributes =
   forall a b. (a -> b) -> [a] -> [b]
map (\ (QName
name,AttValue
av) -> (QName
name,AttValue -> AttValue
compressAttValue AttValue
av))

compressAttValue :: AttValue -> AttValue
compressAttValue :: AttValue -> AttValue
compressAttValue (AttValue [Either String Reference]
l) = [Either String Reference] -> AttValue
AttValue ([Either String Reference] -> [Either String Reference]
compress [Either String Reference]
l)
   where
      compress :: [Either String Reference] -> [Either String Reference]
      compress :: [Either String Reference] -> [Either String Reference]
compress [] = []
      compress (Right Reference
ref : [Either String Reference]
es) = forall a b. b -> Either a b
Right Reference
ref forall a. a -> [a] -> [a]
: [Either String Reference] -> [Either String Reference]
compress [Either String Reference]
es
      compress ( ls :: Either String Reference
ls@(Left String
s1) : [Either String Reference]
es) =
         case [Either String Reference] -> [Either String Reference]
compress [Either String Reference]
es of
            (Left String
s2 : [Either String Reference]
es2) -> forall a b. a -> Either a b
Left (String
s1 forall a. [a] -> [a] -> [a]
++ String
s2) forall a. a -> [a] -> [a]
: [Either String Reference]
es2
            [Either String Reference]
es2 -> Either String Reference
ls forall a. a -> [a] -> [a]
: [Either String Reference]
es2

compressContent :: [Content i] -> [Content i]
compressContent :: forall i. [Content i] -> [Content i]
compressContent [] = []
compressContent (csb :: Content i
csb@(CString Bool
b1 String
s1 i
i1) : [Content i]
cs) =
   case forall i. [Content i] -> [Content i]
compressContent [Content i]
cs of
      (CString Bool
b2 String
s2 i
_) : [Content i]
cs2
          | Bool
b1 forall a. Eq a => a -> a -> Bool
== Bool
b2
          -> forall i. Bool -> String -> i -> Content i
CString Bool
b1 (String
s1 forall a. [a] -> [a] -> [a]
++ String
s2) i
i1forall a. a -> [a] -> [a]
: [Content i]
cs2
      [Content i]
cs2 -> Content i
csb forall a. a -> [a] -> [a]
: [Content i]
cs2
compressContent (CElem Element i
element i
i : [Content i]
cs) =
   forall i. Element i -> i -> Content i
CElem (forall i. Element i -> Element i
compressElement Element i
element) i
i forall a. a -> [a] -> [a]
: forall i. [Content i] -> [Content i]
compressContent [Content i]
cs
compressContent (Content i
c : [Content i]
cs) = Content i
c forall a. a -> [a] -> [a]
: forall i. [Content i] -> [Content i]
compressContent [Content i]
cs


-- ------------------------------------------------------------------------
-- Making XmlEscaper values.
-- ------------------------------------------------------------------------

stdXmlEscaper :: XmlEscaper
stdXmlEscaper :: XmlEscaper
stdXmlEscaper = [(Char, String)] -> (Char -> Bool) -> XmlEscaper
mkXmlEscaper
   [(Char
'\60',String
"lt"),(Char
'\62',String
"gt"),(Char
'\38',String
"amp"),(Char
'\39',String
"apos"),(Char
'\34',String
"quot")]
   (\ Char
ch ->
      let
         i :: Int
i = Char -> Int
ord Char
ch
      in
         Int
i forall a. Ord a => a -> a -> Bool
< Int
10 Bool -> Bool -> Bool
|| (Int
10forall a. Ord a => a -> a -> Bool
<Int
i Bool -> Bool -> Bool
&& Int
iforall a. Ord a => a -> a -> Bool
<Int
32) Bool -> Bool -> Bool
|| Int
i forall a. Ord a => a -> a -> Bool
>= Int
127 Bool -> Bool -> Bool
||
            case Char
ch of
               Char
'\'' -> Bool
True
               Char
'\"' -> Bool
True
               Char
'&' -> Bool
True
               Char
'<' -> Bool
True
               Char
'>' -> Bool
True
               Char
_ -> Bool
False
      )


mkXmlEscaper :: [(Char,String)] -> (Char -> Bool) -> XmlEscaper
mkXmlEscaper :: [(Char, String)] -> (Char -> Bool) -> XmlEscaper
mkXmlEscaper [(Char, String)]
escapes Char -> Bool
isescape =
   XmlEscaper {
      toEscape :: FiniteMap Char String
toEscape = forall a b. Ord a => [(a, b)] -> FiniteMap a b
listToFM [(Char, String)]
escapes,
      fromEscape :: FiniteMap String Char
fromEscape = forall a b. Ord a => [(a, b)] -> FiniteMap a b
listToFM (forall a b. (a -> b) -> [a] -> [b]
map (\ (Char
c,String
str) -> (String
str,Char
c)) [(Char, String)]
escapes),
      isEscape :: Char -> Bool
isEscape = Char -> Bool
isescape
      }