module Kevin.Util.Entity ( entityEncode, entityDecode ) where import Control.Applicative ((<|>), (<$>), (<*>)) import Control.Monad (guard) import Control.Monad.Fix import Data.Attoparsec.Text import Data.Char import Data.Maybe import Data.Monoid import qualified Data.Text as T import qualified Data.Text.Read as R import Prelude hiding (take) decodeCharacter :: Parser T.Text decodeCharacter = entityNumeric <|> entityNamed <|> take 1 entityNumeric :: Parser T.Text entityNumeric = do string "&#" entity <- (<>) <$> option "" (string "x") <*> takeWhile1 isHexDigit char ';' return . fromMaybe (T.concat ["&#", entity, ";"]) $ (if "x" `T.isPrefixOf` entity then lookupHexEntity else lookupNumericEntity) entity entityNamed :: Parser T.Text entityNamed = do char '&' entity <- T.cons <$> letter <*> takeWhile1 isAlphaNum char ';' return . fromMaybe (T.concat ["&", entity, ";"]) . lookupNamedEntity $ entity decodeParser :: Parser T.Text decodeParser = T.concat <$> many1 decodeCharacter entityDecode :: T.Text -> T.Text entityDecode "" = "" entityDecode str = case parseOnly decodeParser str of Left err -> error $ "entityDecode: " ++ err Right s -> s entityEncode :: T.Text -> T.Text entityEncode = T.pack . concat . entityEncodeS . T.unpack entityEncodeS :: String -> [String] entityEncodeS = fix (\f str -> case str of [] -> [] (x:xs) -> if x < '\127' then [x]:f xs else ("&#" ++ show (ord x) ++ ";"):f xs) lookupNamedEntity :: T.Text -> Maybe T.Text lookupNamedEntity ent = (T.singleton . chr) <$> lookup ent namedEntities lookupHexEntity :: T.Text -> Maybe T.Text lookupHexEntity e = case R.hexadecimal $ T.cons '0' e of Right (n,_) -> do guard $ n < ord maxBound return . T.singleton . chr $ n Left _ -> Nothing lookupNumericEntity :: T.Text -> Maybe T.Text lookupNumericEntity e = case R.decimal e of Right (n,_) -> do guard $ n < ord maxBound return . T.singleton . chr $ n Left _ -> Nothing namedEntities :: [(T.Text, Int)] namedEntities = [ ("quot", 34), ("amp", 38), ("apos", 39), ("lt", 60) , ("gt", 62), ("nbsp", 160), ("iexcl", 161), ("cent", 162) , ("pound", 163), ("curren", 164), ("yen", 165) , ("brvbar", 166), ("sect", 167), ("uml", 168), ("copy", 169) , ("ordf", 170), ("laquo", 171), ("not", 172), ("shy", 173) , ("reg", 174), ("macr", 175), ("deg", 176), ("plusmn", 177) , ("sup2", 178), ("sup3", 179), ("acute", 180), ("micro", 181) , ("para", 182), ("middot", 183), ("cedil", 184), ("sup1", 185) , ("ordm", 186), ("raquo", 187), ("frac14", 188) , ("frac12", 189), ("frac34", 190), ("iquest", 191) , ("Agrave", 192), ("Aacute", 193), ("Acirc", 194) , ("Atilde", 195), ("Auml", 196), ("Aring", 197) , ("AElig", 198), ("Ccedil", 199), ("Egrave", 200) , ("Eacute", 201), ("Ecirc", 202), ("Euml", 203) , ("Igrave", 204), ("Iacute", 205), ("Icirc", 206) , ("Iuml", 207), ("ETH", 208), ("Ntilde", 209), ("Ograve", 210) , ("Oacute", 211), ("Ocirc", 212), ("Otilde", 213) , ("Ouml", 214), ("times", 215), ("Oslash", 216) , ("Ugrave", 217), ("Uacute", 218), ("Ucirc", 219) , ("Uuml", 220), ("Yacute", 221), ("THORN", 222) , ("szlig", 223), ("agrave", 224), ("aacute", 225) , ("acirc", 226), ("atilde", 227), ("auml", 228) , ("aring", 229), ("aelig", 230), ("ccedil", 231) , ("egrave", 232), ("eacute", 233), ("ecirc", 234) , ("euml", 235), ("igrave", 236), ("iacute", 237) , ("icirc", 238), ("iuml", 239), ("eth", 240), ("ntilde", 241) , ("ograve", 242), ("oacute", 243), ("ocirc", 244) , ("otilde", 245), ("ouml", 246), ("divide", 247) , ("oslash", 248), ("ugrave", 249), ("uacute", 250) , ("ucirc", 251), ("uuml", 252), ("yacute", 253) , ("thorn", 254), ("yuml", 255), ("OElig", 338), ("oelig", 339) , ("Scaron", 352), ("scaron", 353), ("Yuml", 376) , ("fnof", 402), ("circ", 710), ("tilde", 732), ("Alpha", 913) , ("Beta", 914), ("Gamma", 915), ("Delta", 916) , ("Epsilon", 917), ("Zeta", 918), ("Eta", 919), ("Theta", 920) , ("Iota", 921), ("Kappa", 922), ("Lambda", 923), ("Mu", 924) , ("Nu", 925), ("Xi", 926), ("Omicron", 927), ("Pi", 928) , ("Rho", 929), ("Sigma", 931), ("Tau", 932), ("Upsilon", 933) , ("Phi", 934), ("Chi", 935), ("Psi", 936), ("Omega", 937) , ("alpha", 945), ("beta", 946), ("gamma", 947), ("delta", 948) , ("epsilon", 949), ("zeta", 950), ("eta", 951), ("theta", 952) , ("iota", 953), ("kappa", 954), ("lambda", 955), ("mu", 956) , ("nu", 957), ("xi", 958), ("omicron", 959), ("pi", 960) , ("rho", 961), ("sigmaf", 962), ("sigma", 963), ("tau", 964) , ("upsilon", 965), ("phi", 966), ("chi", 967), ("psi", 968) , ("omega", 969), ("thetasym", 977), ("upsih", 978) , ("piv", 982), ("ensp", 8194), ("emsp", 8195) , ("thinsp", 8201), ("zwnj", 8204), ("zwj", 8205) , ("lrm", 8206), ("rlm", 8207), ("ndash", 8211) , ("mdash", 8212), ("lsquo", 8216), ("rsquo", 8217) , ("sbquo", 8218), ("ldquo", 8220), ("rdquo", 8221) , ("bdquo", 8222), ("dagger", 8224), ("Dagger", 8225) , ("bull", 8226), ("hellip", 8230), ("permil", 8240) , ("prime", 8242), ("Prime", 8243), ("lsaquo", 8249) , ("rsaquo", 8250), ("oline", 8254), ("frasl", 8260) , ("euro", 8364), ("image", 8465), ("weierp", 8472) , ("real", 8476), ("trade", 8482), ("alefsym", 8501) , ("larr", 8592), ("uarr", 8593), ("rarr", 8594) , ("darr", 8595), ("harr", 8596), ("crarr", 8629) , ("lArr", 8656), ("uArr", 8657), ("rArr", 8658) , ("dArr", 8659), ("hArr", 8660), ("forall", 8704) , ("part", 8706), ("exist", 8707), ("empty", 8709) , ("nabla", 8711), ("isin", 8712), ("notin", 8713) , ("ni", 8715), ("prod", 8719), ("sum", 8721), ("minus", 8722) , ("lowast", 8727), ("radic", 8730), ("prop", 8733) , ("infin", 8734), ("ang", 8736), ("and", 8743), ("or", 8744) , ("cap", 8745), ("cup", 8746), ("int", 8747), ("there4", 8756) , ("sim", 8764), ("cong", 8773), ("asymp", 8776), ("ne", 8800) , ("equiv", 8801), ("le", 8804), ("ge", 8805), ("sub", 8834) , ("sup", 8835), ("nsub", 8836), ("sube", 8838), ("supe", 8839) , ("oplus", 8853), ("otimes", 8855), ("perp", 8869) , ("sdot", 8901), ("lceil", 8968), ("rceil", 8969) , ("lfloor", 8970), ("rfloor", 8971), ("lang", 9001) , ("rang", 9002), ("loz", 9674), ("spades", 9824) , ("clubs", 9827), ("hearts", 9829), ("diams", 9830)]