-- | This module converts between HTML/XML entities (i.e. @&@) and
--   the characters they represent.
module Text.HTML.TagSoup.Entity(
    lookupEntity, lookupNamedEntity, lookupNumericEntity,
    escapeXMLChar,
    xmlEntities, htmlEntities
    ) where

import Data.Char
import Data.Ix
import Numeric


-- | Lookup an entity, using 'lookupNumericEntity' if it starts with
--   @#@ and 'lookupNamedEntity' otherwise
lookupEntity :: String -> Maybe Char
lookupEntity ('#':xs) = lookupNumericEntity xs
lookupEntity xs = lookupNamedEntity xs

-- | Lookup a numeric entity, the leading @\'#\'@ must have already been removed.
--
-- > lookupNumericEntity "65" == Just 'A'
-- > lookupNumericEntity "x41" == Just 'A'
-- > lookupNumericEntity "x4E" === Just 'N'
-- > lookupNumericEntity "x4e" === Just 'N'
-- > lookupNumericEntity "Haskell" == Nothing
-- > lookupNumericEntity "" == Nothing
-- > lookupNumericEntity "89439085908539082" == Nothing
lookupNumericEntity :: String -> Maybe Char
lookupNumericEntity = f
        -- entity = '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
    where
        f ('x':xs) = g [('0','9'),('a','f'),('A','F')] readHex xs
        f xs = g [('0','9')] reads xs

        g :: [(Char,Char)] -> ReadS Integer -> String -> Maybe Char
        g valid reader xs = do
            let test b = if b then Just () else Nothing
            test $ isValid valid xs
            test $ not $ null xs
            case reader xs of
                [(a,"")] -> do
                    test $ inRange (toInteger $ ord minBound, toInteger $ ord maxBound) a
                    return $ chr $ fromInteger a
                _ -> Nothing

        isValid :: [(Char,Char)] -> String -> Bool
        isValid valid xs = all (\x -> any (`inRange` x) valid) xs


-- | Lookup a named entity, using 'htmlEntities'
--
-- > lookupNamedEntity "amp" == Just '&'
-- > lookupNamedEntity "haskell" == Nothing
lookupNamedEntity :: String -> Maybe Char
lookupNamedEntity x = fmap chr $ lookup x htmlEntities


-- | Escape a character before writing it out to XML.
--
-- > escapeXMLChar 'a' == Nothing
-- > escapeXMLChar '&' == Just "amp"
escapeXMLChar :: Char -> Maybe String
escapeXMLChar x = case [a | (a,b) <- xmlEntities, b == ord x] of
                       (y:_) -> Just y
                       _ -> Nothing


-- | A table mapping XML entity names to code points.
--   Does /not/ include @apos@ as Internet Explorer does not know about it.
xmlEntities :: [(String, Int)]
xmlEntities = let a*b = (a,ord b) in
    ["quot" * '"'
    ,"amp"  * '&'
    -- ,"apos" * '\''    -- Internet Explorer does not know that
    ,"lt"   * '<'
    ,"gt"   * '>'
    ]

-- | A table mapping HTML entity names to code points
htmlEntities :: [(String, Int)]
htmlEntities = let (*) = (,) in
    xmlEntities ++
    ["apos"    * ord '\''  -- quirky IE!!!

    ,"nbsp"    * 160
    ,"iexcl"   * 161
    ,"cent"    * 162
    ,"pound"   * 163
    ,"curren"  * 164
    ,"yen"     * 165
    ,"brvbar"  * 166
    ,"sect"    * 167
    ,"uml"     * 168
    ,"copy"    * 169
    ,"ordf"    * 170
    ,"laquo"   * 171
    ,"not"     * 172
    ,"shy"     * 173
    ,"reg"     * 174
    ,"macr"    * 175
    ,"deg"     * 176
    ,"plusmn"  * 177
    ,"sup2"    * 178
    ,"sup3"    * 179
    ,"acute"   * 180
    ,"micro"   * 181
    ,"para"    * 182
    ,"middot"  * 183
    ,"cedil"   * 184
    ,"sup1"    * 185
    ,"ordm"    * 186
    ,"raquo"   * 187
    ,"frac14"  * 188
    ,"frac12"  * 189
    ,"frac34"  * 190
    ,"iquest"  * 191
    ,"Agrave"  * 192
    ,"Aacute"  * 193
    ,"Acirc"   * 194
    ,"Atilde"  * 195
    ,"Auml"    * 196
    ,"Aring"   * 197
    ,"AElig"   * 198
    ,"Ccedil"  * 199
    ,"Egrave"  * 200
    ,"Eacute"  * 201
    ,"Ecirc"   * 202
    ,"Euml"    * 203
    ,"Igrave"  * 204
    ,"Iacute"  * 205
    ,"Icirc"   * 206
    ,"Iuml"    * 207
    ,"ETH"     * 208
    ,"Ntilde"  * 209
    ,"Ograve"  * 210
    ,"Oacute"  * 211
    ,"Ocirc"   * 212
    ,"Otilde"  * 213
    ,"Ouml"    * 214
    ,"times"   * 215
    ,"Oslash"  * 216
    ,"Ugrave"  * 217
    ,"Uacute"  * 218
    ,"Ucirc"   * 219
    ,"Uuml"    * 220
    ,"Yacute"  * 221
    ,"THORN"   * 222
    ,"szlig"   * 223
    ,"agrave"  * 224
    ,"aacute"  * 225
    ,"acirc"   * 226
    ,"atilde"  * 227
    ,"auml"    * 228
    ,"aring"   * 229
    ,"aelig"   * 230
    ,"ccedil"  * 231
    ,"egrave"  * 232
    ,"eacute"  * 233
    ,"ecirc"   * 234
    ,"euml"    * 235
    ,"igrave"  * 236
    ,"iacute"  * 237
    ,"icirc"   * 238
    ,"iuml"    * 239
    ,"eth"     * 240
    ,"ntilde"  * 241
    ,"ograve"  * 242
    ,"oacute"  * 243
    ,"ocirc"   * 244
    ,"otilde"  * 245
    ,"ouml"    * 246
    ,"divide"  * 247
    ,"oslash"  * 248
    ,"ugrave"  * 249
    ,"uacute"  * 250
    ,"ucirc"   * 251
    ,"uuml"    * 252
    ,"yacute"  * 253
    ,"thorn"   * 254
    ,"yuml"    * 255

    ,"OElig"   * 338
    ,"oelig"   * 339
    ,"Scaron"  * 352
    ,"scaron"  * 353
    ,"Yuml"    * 376
    ,"circ"    * 710
    ,"tilde"   * 732

    ,"ensp"    * 8194
    ,"emsp"    * 8195
    ,"thinsp"  * 8201
    ,"zwnj"    * 8204
    ,"zwj"     * 8205
    ,"lrm"     * 8206
    ,"rlm"     * 8207
    ,"ndash"   * 8211
    ,"mdash"   * 8212
    ,"lsquo"   * 8216
    ,"rsquo"   * 8217
    ,"sbquo"   * 8218
    ,"ldquo"   * 8220
    ,"rdquo"   * 8221
    ,"bdquo"   * 8222
    ,"dagger"  * 8224
    ,"Dagger"  * 8225
    ,"permil"  * 8240
    ,"lsaquo"  * 8249
    ,"rsaquo"  * 8250
    ,"euro"    * 8364

    ,"fnof"    * 402
    ,"Alpha"   * 913
    ,"Beta"    * 914
    ,"Gamma"   * 915
    ,"Delta"   * 916
    ,"Epsilon" * 917
    ,"Zeta"    * 918
    ,"Eta"     * 919
    ,"Theta"   * 920
    ,"Iota"    * 921
    ,"Kappa"   * 922
    ,"Lambda"  * 923
    ,"Mu"      * 924
    ,"Nu"      * 925
    ,"Xi"      * 926
    ,"Omicron" * 927
    ,"Pi"      * 928
    ,"Rho"     * 929
    ,"Sigma"   * 931
    ,"Tau"     * 932
    ,"Upsilon" * 933
    ,"Phi"     * 934
    ,"Chi"     * 935
    ,"Psi"     * 936
    ,"Omega"   * 937
    ,"alpha"   * 945
    ,"beta"    * 946
    ,"gamma"   * 947
    ,"delta"   * 948
    ,"epsilon" * 949
    ,"zeta"    * 950
    ,"eta"     * 951
    ,"theta"   * 952
    ,"iota"    * 953
    ,"kappa"   * 954
    ,"lambda"  * 955
    ,"mu"      * 956
    ,"nu"      * 957
    ,"xi"      * 958
    ,"omicron" * 959
    ,"pi"      * 960
    ,"rho"     * 961
    ,"sigmaf"  * 962
    ,"sigma"   * 963
    ,"tau"     * 964
    ,"upsilon" * 965
    ,"phi"     * 966
    ,"chi"     * 967
    ,"psi"     * 968
    ,"omega"   * 969
    ,"thetasym"* 977
    ,"upsih"   * 978
    ,"piv"     * 982
    ,"bull"    * 8226
    ,"hellip"  * 8230
    ,"prime"   * 8242
    ,"Prime"   * 8243
    ,"oline"   * 8254
    ,"frasl"   * 8260
    ,"weierp"  * 8472
    ,"image"   * 8465
    ,"real"    * 8476
    ,"trade"   * 8482
    ,"alefsym" * 8501
    ,"larr"    * 8592
    ,"uarr"    * 8593
    ,"rarr"    * 8594
    ,"darr"    * 8595
    ,"harr"    * 8596
    ,"crarr"   * 8629
    ,"lArr"    * 8656
    ,"uArr"    * 8657
    ,"rArr"    * 8658
    ,"dArr"    * 8659
    ,"hArr"    * 8660
    ,"forall"  * 8704
    ,"part"    * 8706
    ,"exist"   * 8707
    ,"empty"   * 8709
    ,"nabla"   * 8711
    ,"isin"    * 8712
    ,"notin"   * 8713
    ,"ni"      * 8715
    ,"prod"    * 8719
    ,"sum"     * 8721
    ,"minus"   * 8722
    ,"lowast"  * 8727
    ,"radic"   * 8730
    ,"prop"    * 8733
    ,"infin"   * 8734
    ,"ang"     * 8736
    ,"and"     * 8743
    ,"or"      * 8744
    ,"cap"     * 8745
    ,"cup"     * 8746
    ,"int"     * 8747
    ,"there4"  * 8756
    ,"sim"     * 8764
    ,"cong"    * 8773
    ,"asymp"   * 8776
    ,"ne"      * 8800
    ,"equiv"   * 8801
    ,"le"      * 8804
    ,"ge"      * 8805
    ,"sub"     * 8834
    ,"sup"     * 8835
    ,"nsub"    * 8836
    ,"sube"    * 8838
    ,"supe"    * 8839
    ,"oplus"   * 8853
    ,"otimes"  * 8855
    ,"perp"    * 8869
    ,"sdot"    * 8901
    ,"lceil"   * 8968
    ,"rceil"   * 8969
    ,"lfloor"  * 8970
    ,"rfloor"  * 8971
    ,"lang"    * 9001
    ,"rang"    * 9002
    ,"loz"     * 9674
    ,"spades"  * 9824
    ,"clubs"   * 9827
    ,"hearts"  * 9829
    ,"diams"   * 9830
    ]