{-# LANGUAGE OverloadedStrings, PatternGuards #-}

-- | 
-- Module      : Text.XHtmlCombinators.Escape
-- Copyright   : (c) Jasper Van der Jeugt 2009,
--               (c) Alasdair Armstrong 2010
-- License     : BSD-style
-- Maintainer  : alasdair.armstrong@googlemail.com
-- Stability   : experimental
-- Portability : GHC
-- 
-- Escaping is based on the ESAPI project
-- (see 'http://www.owasp.org/index.php/ESAPI')

module Text.XHtmlCombinators.Escape 
    ( escape
    , escapeAttr
    , escapeJavaScript
    , escapeCSS
    ) where

import Data.Char (toUpper)
import qualified Data.IntMap as I
import Numeric (showHex)

import Data.Text (Text)
import qualified Data.Text as T

-- | Escaping text in generic HTML elements
escape :: Text -> Text
escape = escapeEntity htmlImmune
  where htmlImmune = ",.-_ "
  
-- | Escaping text in attribute values
escapeAttr :: Text -> Text
escapeAttr = escapeEntity attributeImmune
  where attributeImmune = ",.-_"

-- | escaping basic text the be used inside normal HTML-elements 
-- based on 'http://code.google.com/p/owasp-esapi-java/source/browse/trunk/src/main/java/org/owasp/esapi/codecs/HTMLEntityCodec.java'
--
-- Should be improved by using an array-based lookup for chars <=0xff
escapeEntity :: String -> Text -> Text
escapeEntity immune = T.concatMap $ escapeChar immune encode
  where    
    encode c 
      | Just ent <- I.lookup (fromEnum c) entityMap 
                  = T.concat ["&", ent, ";"]
      | otherwise = T.pack $ concat ["&#", hex c , ";"]
    
-- | Escaping text intended for places where scripts can be used 
escapeJavaScript :: Text -> Text
escapeJavaScript = T.concatMap $ escapeChar javaScriptImmune encode
  where 
    encode c 
      | c <= toEnum 0xff     = T.pack $ concat ["\\x", pad 2 $ hex c]
      | c <= toEnum 0xffff   = T.pack $ concat ["\\u", pad 4 $ hex c]
      | c <= toEnum 0xffffff = T.pack $ concat ["\\u", pad 6 $ hex c]
      | otherwise            = T.pack $ concat ["\\u", pad 8 $ hex c]
  
    javaScriptImmune = ",._"
    
    pad n hex = reverse . take n $ reverse hex ++ repeat '0'

-- | Escape text for CSS (style) data            
escapeCSS :: Text -> Text
escapeCSS = T.concatMap $ escapeChar "" encode
  where
    encode c = T.pack $ concat ["\\", hex c, " "]

-- | Escape a single character            
escapeChar :: 
    [Char]            -- ^ Lists of chars to be used verbatim
    -> (Char -> Text) -- ^ function handling non standard cases
    -> Char           -- ^ char to encode
    -> Text            
escapeChar immune enc c 
    | c `elem` immune                      = T.singleton c
    | c >= toEnum 0x30 && c <= toEnum 0x39 = T.singleton c
    | c >= toEnum 0x41 && c <= toEnum 0x5A = T.singleton c
    | c >= toEnum 0x61 && c <= toEnum 0x7A = T.singleton c
    | c `elem` "\t\n\r"                    = T.singleton c
    | c <= toEnum 0x1f                     = " "
    | c >= toEnum 0x7f && c <= toEnum 0x9f = " "  
    | otherwise                            = enc c
   
hex :: Char -> String 
hex c = map toUpper $ showHex (fromEnum c) []   
 
entityMap :: I.IntMap Text
entityMap = I.fromList [
    (34,   "quot")     {- quotation mark -},                             
    (38,   "amp")      {- ampersand -},                                  
    (60,   "lt")       {- less-than sign -},                             
    (62,   "gt")       {- greater-than sign -},                          
    (160,  "nbsp")     {- no-break space -},                             
    (161,  "iexcl")    {- inverted exclamation mark -},                  
    (162,  "cent")     {- cent sign -},                                  
    (163,  "pound")    {- pound sign -},                                 
    (164,  "curren")   {- currency sign -},                              
    (165,  "yen")      {- yen sign -},                                   
    (166,  "brvbar")   {- broken bar -},                                 
    (167,  "sect")     {- section sign -},                               
    (168,  "uml")      {- diaeresis -},                                  
    (169,  "copy")     {- copyright sign -},                             
    (170,  "ordf")     {- feminine ordinal indicator -},                 
    (171,  "laquo")    {- left-pointing double angle quotation mark -},  
    (172,  "not")      {- not sign -},                                   
    (173,  "shy")      {- soft hyphen -},                                
    (174,  "reg")      {- registered sign -},                            
    (175,  "macr")     {- macron -},                                     
    (176,  "deg")      {- degree sign -},                                
    (177,  "plusmn")   {- plus-minus sign -},                            
    (178,  "sup2")     {- superscript two -},                            
    (179,  "sup3")     {- superscript three -},                          
    (180,  "acute")    {- acute accent -},                               
    (181,  "micro")    {- micro sign -},                                 
    (182,  "para")     {- pilcrow sign -},                               
    (183,  "middot")   {- middle dot -},                                 
    (184,  "cedil")    {- cedilla -},                                    
    (185,  "sup1")     {- superscript one -},                            
    (186,  "ordm")     {- masculine ordinal indicator -},                
    (187,  "raquo")    {- right-pointing double angle quotation mark -}, 
    (188,  "frac14")   {- vulgar fraction one quarter -},                
    (189,  "frac12")   {- vulgar fraction one half -},                   
    (190,  "frac34")   {- vulgar fraction three quarters -},             
    (191,  "iquest")   {- inverted question mark -},                     
    (192,  "Agrave")   {- Latin capital letter a with grave -},          
    (193,  "Aacute")   {- Latin capital letter a with acute -},          
    (194,  "Acirc")    {- Latin capital letter a with circumflex -},     
    (195,  "Atilde")   {- Latin capital letter a with tilde -},          
    (196,  "Auml")     {- Latin capital letter a with diaeresis -},      
    (197,  "Aring")    {- Latin capital letter a with ring above -},     
    (198,  "AElig")    {- Latin capital letter ae -},                    
    (199,  "Ccedil")   {- Latin capital letter c with cedilla -},        
    (200,  "Egrave")   {- Latin capital letter e with grave -},          
    (201,  "Eacute")   {- Latin capital letter e with acute -},          
    (202,  "Ecirc")    {- Latin capital letter e with circumflex -},     
    (203,  "Euml")     {- Latin capital letter e with diaeresis -},      
    (204,  "Igrave")   {- Latin capital letter i with grave -},          
    (205,  "Iacute")   {- Latin capital letter i with acute -},          
    (206,  "Icirc")    {- Latin capital letter i with circumflex -},     
    (207,  "Iuml")     {- Latin capital letter i with diaeresis -},      
    (208,  "ETH")      {- Latin capital letter eth -},                   
    (209,  "Ntilde")   {- Latin capital letter n with tilde -},          
    (210,  "Ograve")   {- Latin capital letter o with grave -},          
    (211,  "Oacute")   {- Latin capital letter o with acute -},          
    (212,  "Ocirc")    {- Latin capital letter o with circumflex -},     
    (213,  "Otilde")   {- Latin capital letter o with tilde -},          
    (214,  "Ouml")     {- Latin capital letter o with diaeresis -},      
    (215,  "times")    {- multiplication sign -},                        
    (216,  "Oslash")   {- Latin capital letter o with stroke -},         
    (217,  "Ugrave")   {- Latin capital letter u with grave -},          
    (218,  "Uacute")   {- Latin capital letter u with acute -},          
    (219,  "Ucirc")    {- Latin capital letter u with circumflex -},     
    (220,  "Uuml")     {- Latin capital letter u with diaeresis -},      
    (221,  "Yacute")   {- Latin capital letter y with acute -},          
    (222,  "THORN")    {- Latin capital letter thorn -},                 
    (223,  "szlig")    {- Latin small letter sharp s, German Eszett -},  
    (224,  "agrave")   {- Latin small letter a with grave -},            
    (225,  "aacute")   {- Latin small letter a with acute -},            
    (226,  "acirc")    {- Latin small letter a with circumflex -},       
    (227,  "atilde")   {- Latin small letter a with tilde -},            
    (228,  "auml")     {- Latin small letter a with diaeresis -},        
    (229,  "aring")    {- Latin small letter a with ring above -},       
    (230,  "aelig")    {- Latin lowercase ligature ae -},                
    (231,  "ccedil")   {- Latin small letter c with cedilla -},          
    (232,  "egrave")   {- Latin small letter e with grave -},            
    (233,  "eacute")   {- Latin small letter e with acute -},            
    (234,  "ecirc")    {- Latin small letter e with circumflex -},       
    (235,  "euml")     {- Latin small letter e with diaeresis -},        
    (236,  "igrave")   {- Latin small letter i with grave -},            
    (237,  "iacute")   {- Latin small letter i with acute -},            
    (238,  "icirc")    {- Latin small letter i with circumflex -},       
    (239,  "iuml")     {- Latin small letter i with diaeresis -},        
    (240,  "eth")      {- Latin small letter eth -},                     
    (241,  "ntilde")   {- Latin small letter n with tilde -},            
    (242,  "ograve")   {- Latin small letter o with grave -},            
    (243,  "oacute")   {- Latin small letter o with acute -},            
    (244,  "ocirc")    {- Latin small letter o with circumflex -},       
    (245,  "otilde")   {- Latin small letter o with tilde -},            
    (246,  "ouml")     {- Latin small letter o with diaeresis -},        
    (247,  "divide")   {- division sign -},                              
    (248,  "oslash")   {- Latin small letter o with stroke -},           
    (249,  "ugrave")   {- Latin small letter u with grave -},            
    (250,  "uacute")   {- Latin small letter u with acute -},            
    (251,  "ucirc")    {- Latin small letter u with circumflex -},       
    (252,  "uuml")     {- Latin small letter u with diaeresis -},        
    (253,  "yacute")   {- Latin small letter y with acute -},            
    (254,  "thorn")    {- Latin small letter thorn -},                   
    (255,  "yuml")     {- Latin small letter y with diaeresis -},        
    (338,  "OElig")    {- Latin capital ligature oe -},                  
    (339,  "oelig")    {- Latin small ligature oe -},                    
    (352,  "Scaron")   {- Latin capital letter s with caron -},          
    (353,  "scaron")   {- Latin small letter s with caron -},            
    (376,  "Yuml")     {- Latin capital letter y with diaeresis -},      
    (402,  "fnof")     {- Latin small letter f with hook -},             
    (710,  "circ")     {- modifier letter circumflex accent -},          
    (732,  "tilde")    {- small tilde -},                                
    (913,  "Alpha")    {- Greek capital letter alpha -},                 
    (914,  "Beta")     {- Greek capital letter beta -},                  
    (915,  "Gamma")    {- Greek capital letter gamma -},                 
    (916,  "Delta")    {- Greek capital letter delta -},                 
    (917,  "Epsilon")  {- Greek capital letter epsilon -},               
    (918,  "Zeta")     {- Greek capital letter zeta -},                  
    (919,  "Eta")      {- Greek capital letter eta -},                   
    (920,  "Theta")    {- Greek capital letter theta -},                 
    (921,  "Iota")     {- Greek capital letter iota -},                  
    (922,  "Kappa")    {- Greek capital letter kappa -},                 
    (923,  "Lambda")   {- Greek capital letter lambda -},                
    (924,  "Mu")       {- Greek capital letter mu -},                    
    (925,  "Nu")       {- Greek capital letter nu -},                    
    (926,  "Xi")       {- Greek capital letter xi -},                    
    (927,  "Omicron")  {- Greek capital letter omicron -},               
    (928,  "Pi")       {- Greek capital letter pi -},                    
    (929,  "Rho")      {- Greek capital letter rho -},                   
    (931,  "Sigma")    {- Greek capital letter sigma -},                 
    (932,  "Tau")      {- Greek capital letter tau -},                   
    (933,  "Upsilon")  {- Greek capital letter upsilon -},               
    (934,  "Phi")      {- Greek capital letter phi -},                   
    (935,  "Chi")      {- Greek capital letter chi -},                   
    (936,  "Psi")      {- Greek capital letter psi -},                   
    (937,  "Omega")    {- Greek capital letter omega -},                 
    (945,  "alpha")    {- Greek small letter alpha -},                   
    (946,  "beta")     {- Greek small letter beta -},                    
    (947,  "gamma")    {- Greek small letter gamma -},                   
    (948,  "delta")    {- Greek small letter delta -},                   
    (949,  "epsilon")  {- Greek small letter epsilon -},                 
    (950,  "zeta")     {- Greek small letter zeta -},                    
    (951,  "eta")      {- Greek small letter eta -},                     
    (952,  "theta")    {- Greek small letter theta -},                   
    (953,  "iota")     {- Greek small letter iota -},                    
    (954,  "kappa")    {- Greek small letter kappa -},                   
    (955,  "lambda")   {- Greek small letter lambda -},                  
    (956,  "mu")       {- Greek small letter mu -},                      
    (957,  "nu")       {- Greek small letter nu -},                      
    (958,  "xi")       {- Greek small letter xi -},                      
    (959,  "omicron")  {- Greek small letter omicron -},                 
    (960,  "pi")       {- Greek small letter pi -},                      
    (961,  "rho")      {- Greek small letter rho -},                     
    (962,  "sigmaf")   {- Greek small letter final sigma -},             
    (963,  "sigma")    {- Greek small letter sigma -},                   
    (964,  "tau")      {- Greek small letter tau -},                     
    (965,  "upsilon")  {- Greek small letter upsilon -},                 
    (966,  "phi")      {- Greek small letter phi -},                     
    (967,  "chi")      {- Greek small letter chi -},                     
    (968,  "psi")      {- Greek small letter psi -},                     
    (969,  "omega")    {- Greek small letter omega -},                   
    (977,  "thetasym") {- Greek theta symbol -},                         
    (978,  "upsih")    {- Greek upsilon with hook symbol -},             
    (982,  "piv")      {- Greek pi symbol -},                            
    (8194, "ensp")     {- en space -},                                   
    (8195, "emsp")     {- em space -},                                   
    (8201, "thinsp")   {- thin space -},                                 
    (8204, "zwnj")     {- zero width non-joiner -},                      
    (8205, "zwj")      {- zero width joiner -},                          
    (8206, "lrm")      {- left-to-right mark -},                         
    (8207, "rlm")      {- right-to-left mark -},                         
    (8211, "ndash")    {- en dash -},                                    
    (8212, "mdash")    {- em dash -},                                    
    (8216, "lsquo")    {- left single quotation mark -},                 
    (8217, "rsquo")    {- right single quotation mark -},                
    (8218, "sbquo")    {- single low-9 quotation mark -},                
    (8220, "ldquo")    {- left double quotation mark -},                 
    (8221, "rdquo")    {- right double quotation mark -},                
    (8222, "bdquo")    {- double low-9 quotation mark -},                
    (8224, "dagger")   {- dagger -},                                     
    (8225, "Dagger")   {- double dagger -},                              
    (8226, "bull")     {- bullet -},                                     
    (8230, "hellip")   {- horizontal ellipsis -},                        
    (8240, "permil")   {- per mille sign -},                             
    (8242, "prime")    {- prime -},                                      
    (8243, "Prime")    {- double prime -},                               
    (8249, "lsaquo")   {- single left-pointing angle quotation mark -},  
    (8250, "rsaquo")   {- single right-pointing angle quotation mark -}, 
    (8254, "oline")    {- overline -},                                   
    (8260, "frasl")    {- fraction slash -},                             
    (8364, "euro")     {- euro sign -},                                  
    (8465, "image")    {- black-letter capital i -},                     
    (8472, "weierp")   {- script capital p, Weierstrass p -},            
    (8476, "real")     {- black-letter capital r -},                     
    (8482, "trade")    {- trademark sign -},                             
    (8501, "alefsym")  {- alef symbol -},                                
    (8592, "larr")     {- leftwards arrow -},                            
    (8593, "uarr")     {- upwards arrow -},                              
    (8594, "rarr")     {- rightwards arrow -},                           
    (8595, "darr")     {- downwards arrow -},                            
    (8596, "harr")     {- left right arrow -},                           
    (8629, "crarr")    {- downwards arrow with corner leftwards -},      
    (8656, "lArr")     {- leftwards double arrow -},                     
    (8657, "uArr")     {- upwards double arrow -},                       
    (8658, "rArr")     {- rightwards double arrow -},                    
    (8659, "dArr")     {- downwards double arrow -},                     
    (8660, "hArr")     {- left right double arrow -},                    
    (8704, "forall")   {- for all -},                                    
    (8706, "part")     {- partial differential -},                       
    (8707, "exist")    {- there exists -},                               
    (8709, "empty")    {- empty set -},                                  
    (8711, "nabla")    {- nabla -},                                      
    (8712, "isin")     {- element of -},                                 
    (8713, "notin")    {- not an element of -},                          
    (8715, "ni")       {- contains as member -},                         
    (8719, "prod")     {- n-ary product -},                              
    (8721, "sum")      {- n-ary summation -},                            
    (8722, "minus")    {- minus sign -},                                 
    (8727, "lowast")   {- asterisk operator -},                          
    (8730, "radic")    {- square root -},                                
    (8733, "prop")     {- proportional to -},                            
    (8734, "infin")    {- infinity -},                                   
    (8736, "ang")      {- angle -},                                      
    (8743, "and")      {- logical and -},                                
    (8744, "or")       {- logical or -},                                 
    (8745, "cap")      {- intersection -},                               
    (8746, "cup")      {- union -},                                      
    (8747, "int")      {- integral -},                                   
    (8756, "there4")   {- therefore -},                                  
    (8764, "sim")      {- tilde operator -},                             
    (8773, "cong")     {- congruent to -},                               
    (8776, "asymp")    {- almost equal to -},                            
    (8800, "ne")       {- not equal to -},                               
    (8801, "equiv")    {- identical to, equivalent to -},                
    (8804, "le")       {- less-than or equal to -},                      
    (8805, "ge")       {- greater-than or equal to -},                   
    (8834, "sub")      {- subset of -},                                  
    (8835, "sup")      {- superset of -},                                
    (8836, "nsub")     {- not a subset of -},                            
    (8838, "sube")     {- subset of or equal to -},                      
    (8839, "supe")     {- superset of or equal to -},                    
    (8853, "oplus")    {- circled plus -},                               
    (8855, "otimes")   {- circled times -},                              
    (8869, "perp")     {- up tack -},                                    
    (8901, "sdot")     {- dot operator -},                               
    (8968, "lceil")    {- left ceiling -},                               
    (8969, "rceil")    {- right ceiling -},                              
    (8970, "lfloor")   {- left floor -},                                 
    (8971, "rfloor")   {- right floor -},                                
    (9001, "lang")     {- left-pointing angle bracket -},                
    (9002, "rang")     {- right-pointing angle bracket -},               
    (9674, "loz")      {- lozenge -},                                    
    (9824, "spades")   {- black spade suit -},                           
    (9827, "clubs")    {- black club suit -},                            
    (9829, "hearts")   {- black heart suit -},                           
    (9830, "diams")    {- black diamond suit -}]