/* * entities.c * * Copyright (c) Chris Putnam 2003-2009 * * Source code released under the GPL */ #include #include #include #include "entities.h" /* HTML 4.0 entities */ typedef struct entities { char html[20]; unsigned int unicode; } entities; entities html_entities[] = { /* Special Entities */ { """, 34 }, /* quotation mark */ { "&", 38 }, /* ampersand */ { "'", 39 }, /* apostrophe */ { "(", 40 }, /* left parenthesis */ { ")", 41 }, /* right parenthesis */ { "‐", 45 }, /* hyphen */ { "<", 60 }, /* less-than sign */ { ">", 62 }, /* greater-than sign */ { "?", 63 }, /* question mark */ { "Œ", 338 }, /* Latin cap ligature OE */ { "œ", 339 }, /* Latin small ligature OE */ { "Š", 352 }, /* Latin cap S with caron */ { "š", 353 }, /* Latin cap S with caron */ { "Ÿ", 376 }, /* Latin cap y with diaeresis */ { "ˆ", 710 }, /* modifier letter circumflex */ { "˜", 732 }, /* small tilde */ { " ", 8194 }, /* en space */ { " ", 8195 }, /* em space */ { " ", 8201 }, /* thin space */ { "‌", 8204 }, /* zero width non-joiner */ { "‍", 8205 }, /* zero width joiner */ { "‎", 8206 }, /* left-to-right mark */ { "‏", 8207 }, /* right-to-left mark */ { "–", 8211 }, /* en dash */ { "—", 8212 }, /* em dash */ { "‘", 8216 }, /* left single quotation mark */ { "’", 8217 }, /* right single quot. mark */ { "‚", 8218 }, /* single low-9 quot. mark */ { "“", 8220 }, /* left double quot. mark */ { "”", 8221 }, /* right double quot. mark */ { "„", 8222 }, /* double low-9 quot. mark */ { "†", 8224 }, /* dagger */ { "‡", 8225 }, /* double dagger */ { "‰", 8240 }, /* per mille sign */ { "‹", 8249 }, /* sin. left angle quot mark */ { "›", 8250 }, /* sin. right angle quot mark */ { "€", 8364 }, /* euro sign */ /* Symbols and Greek characters */ { "ƒ", 402 }, /* small f with hook = function */ { "Α", 913 }, /* capital alpha */ { "Β", 914 }, /* capital beta */ { "Γ", 915 }, /* capital gamma */ { "Δ", 916 }, /* capital delta */ { "Ε", 917 }, /* capital epsilon */ { "Ζ", 918 }, /* capital zeta */ { "Η", 919 }, /* capital eta */ { "Θ", 920 }, /* capital theta */ { "Ι", 921 }, /* capital iota */ { "Κ", 922 }, /* capital kappa */ { "Λ", 923 }, /* capital lambda */ { "Μ", 924 }, /* capital mu */ { "Ν", 925 }, /* capital nu */ { "Ξ", 926 }, /* capital xi */ { "Ο", 927 }, /* capital omicron */ { "Π", 928 }, /* capital pi */ { "Ρ", 929 }, /* capital rho */ { "Σ", 931 }, /* capital sigma */ { "Τ", 932 }, /* capital tau */ { "Υ", 933 }, /* capital upsilon */ { "Φ", 934 }, /* capital phi */ { "Χ", 935 }, /* capital chi */ { "Ψ", 936 }, /* capital psi */ { "Ω", 937 }, /* capital omega */ { "α", 945 }, /* small alpha */ { "β", 946 }, /* small beta */ { "γ", 947 }, /* small gamma */ { "δ", 948 }, /* small delta */ { "ε", 949 }, /* small epsilon */ { "ζ", 950 }, /* small zeta */ { "η", 951 }, /* small eta */ { "θ", 952 }, /* small theta */ { "ι", 953 }, /* small iota */ { "κ", 954 }, /* small kappa */ { "λ", 955 }, /* small lambda */ { "μ", 956 }, /* small mu */ { "ν", 957 }, /* small nu */ { "ξ", 958 }, /* small xi */ { "ο", 959 }, /* small omicron */ { "π", 960 }, /* small pi */ { "ρ", 961 }, /* small rho */ { "ς", 962 }, /* small final sigma */ { "σ", 963 }, /* small simga */ { "τ", 964 }, /* small tau */ { "υ", 965 }, /* small upsilon */ { "φ", 966 }, /* small phi */ { "χ", 967 }, /* small chi */ { "ψ", 968 }, /* small psi */ { "ω", 969 }, /* small omega */ { "ϑ",977 }, /* small theta symbol */ { "ϒ", 978 }, /* small upsilon with hook */ { "ϖ", 982 }, /* pi symbol */ { "•", 8226 }, /* bullet = small blk circle */ { "…", 8230 }, /* horizontal ellipsis */ { "′", 8242 }, /* prime = minutes = feet */ { "″", 8243 }, /* double prime */ { "‾", 8254 }, /* overline */ { "⁄", 8260 }, /* fraction slash */ { "℘", 8472 }, /* Weierstrass p = power set */ { "ℑ", 8465 }, /* imaginary part-black cap I */ { "ℜ", 8476 }, /* real part-black cap R */ { "™", 8482 }, /* trademark sign */ { "ℵ",8501 }, /* alef symbol */ { "←", 8592 }, /* left arrow */ { "↑", 8593 }, /* up arrow */ { "→", 8594 }, /* right arrow */ { "↓", 8595 }, /* down arrow */ { "↔", 8596 }, /* left/right arrow */ { "↵", 8629 }, /* down arrow with corner left */ { "⇐", 8656 }, /* left double arrow */ { "⇑", 8657 }, /* up double arrow */ { "⇒", 8658 }, /* up double arrow */ { "⇓", 8659 }, /* up double arrow */ { "⇔", 8660 }, /* up double arrow */ { "∀", 8704}, /* for all */ { "∂", 8706}, /* partial differential */ { "∃", 8707}, /* there exists */ { "∅", 8709}, /* empty set */ { "∇", 8711}, /* nabla=backwards difference */ { "∈", 8712}, /* element of */ { "∉", 8713}, /* not an element of */ { "∋", 8715}, /* contains as member */ { "∏", 8719}, /* n-ary product */ { "∑", 8721}, /* n-ary summation */ { "−", 8722}, /* minuss sign */ { "∗", 8727}, /* asterisk operator */ { "√", 8730}, /* square root */ { "∝", 8733}, /* proportional to */ { "∞", 8734}, /* infinity */ { "∠", 8736}, /* angle */ { "∧", 8743}, /* logical and */ { "∨", 8744}, /* logical or */ { "∩", 8745}, /* intersection */ { "∪", 8746}, /* union */ { "∫", 8747}, /* integral */ { "∴", 8756}, /* therefore */ { "∼", 8764}, /* tilde operator */ { "≅", 8773}, /* approximately equal to */ { "≈", 8776}, /* asymptotic to */ { "≠", 8800}, /* not equal to */ { "≡", 8801}, /* identical to */ { "≤", 8804}, /* less-than or equal to */ { "≥", 8805}, /* greater-than or equal to */ { "⊂", 8834}, /* subset of */ { "⊃", 8835}, /* superset of */ { "⊄", 8836}, /* not a subset of */ { "⊆", 8838}, /* subset of or equal to */ { "⊇", 8839}, /* superset of or equal to */ { "⊕", 8853}, /* circled plus = direct sum */ { "⊗", 8855}, /* circled times = vec prod */ { "⊥", 8869}, /* perpendicular */ { "⋅", 8901}, /* dot operator */ { "⌈", 8968}, /* left ceiling */ { "⌉", 8969}, /* right ceiling */ { "⌊", 8970}, /* left floor */ { "⌋", 8971}, /* right floor */ { "⟨", 9001}, /* left angle bracket */ { "⟩", 9002}, /* right angle bracket */ { "◊", 9674}, /* lozenge */ { "♠", 9824}, /* spades */ { "♣", 9827}, /* clubs */ { "♥", 9829}, /* hearts */ { "♦", 9830}, /* diamonds */ /* Latin-1 */ { " ", 32 }, /* non-breaking space */ { "¡", 161 }, /* inverted exclamation mark */ { "¢", 162 }, /* cent sign */ { "£", 163 }, /* pound sign */ { "¤", 164 }, /* currency sign */ { "¥", 165 }, /* yen sign */ { "¦", 166 }, /* broken vertical bar */ { "§", 167 }, /* section sign */ { "¨", 168 }, /* diaeresis - spacing diaeresis */ { "©", 169 }, /* copyright sign */ { "ª", 170 }, /* feminine ordinal indicator */ { "«", 171 }, /* left-pointing guillemet */ { "¬", 172 }, /* not sign */ { "­", 173 }, /* soft (discretionary) hyphen */ { "®", 174 }, /* registered sign */ { "¯", 175 }, /* macron = overline */ { "°", 176 }, /* degree sign */ { "±", 177 }, /* plus-minus sign */ { "²", 178 }, /* superscript two */ { "³", 179 }, /* superscript three */ { "´", 180 }, /* acute accent = spacing acute */ { "µ", 181 }, /* micro sign */ { "¶", 182 }, /* pilcrow (paragraph) sign */ { "·", 183 }, /* middle dot (georgian comma) */ { "¸", 184 }, /* cedilla = spacing cedilla */ { "¹", 185 }, /* superscript one */ { "º", 186 }, /* masculine ordinal indicator */ { "»", 187 }, /* right pointing guillemet */ { "¼", 188 }, /* 1/4 */ { "½", 189 }, /* 1/2 */ { "¾", 190 }, /* 3/4 */ { "¿", 191 }, /* inverted question mark */ { "À", 192 }, /* cap A with grave */ { "Á", 193 }, /* cap A with acute */ { "Â", 194 }, /* cap A with circumflex */ { "Ã", 195 }, /* cap A with tilde */ { "Ä", 196 }, /* cap A with diaeresis */ { "Å", 197 }, /* cap A with ring */ { "Æ", 198 }, /* cap AE ligature */ { "Ç", 199 }, /* cap C with cedilla */ { "È", 200 }, /* cap E with grave */ { "É", 201 }, /* cap E with acute */ { "Ê", 202 }, /* cap E with circumflex */ { "Ë", 203 }, /* cap E with diaeresis */ { "Ì", 204 }, /* cap I with grave */ { "Í", 205 }, /* cap I with acute */ { "Î", 206 }, /* cap I with circumflex */ { "Ï", 207 }, /* cap I with diaeresis */ { "Ð", 208 }, /* cap letter ETH */ { "Ñ", 209 }, /* cap N with tilde */ { "Ò", 210 }, /* cap O with grave */ { "Ó", 211 }, /* cap O with acute */ { "Ô", 212 }, /* cap O with circumflex */ { "Õ", 213 }, /* cap O with tilde */ { "Ö", 214 }, /* cap O with diaeresis */ { "×", 215 }, /* multiplication sign */ { "Ø", 216 }, /* cap O with stroke */ { "Ù", 217 }, /* cap U with grave */ { "Ú", 218 }, /* cap U with acute */ { "Û", 219 }, /* cap U with circumflex */ { "Ü", 220 }, /* cap U with diaeresis */ { "Ý", 221 }, /* cap Y with acute */ { "Þ", 222 }, /* cap letter THORN */ { "ß", 223 }, /* small sharp s = ess-zed */ { "à", 224 }, /* small a with grave */ { "á", 225 }, /* small a with acute */ { "â", 226 }, /* small a with cirucmflex */ { "ã", 227 }, /* small a with tilde */ { "&amul;", 228 }, /* small a with diaeresis */ { "å", 229 }, /* small a with ring */ { "æ", 230 }, /* small ligature ae */ { "ç", 231 }, /* small c with cedilla */ { "è", 232 }, /* small e with grave */ { "é", 233 }, /* small e with acute */ { "ê", 234 }, /* small e with circumflex */ { "&emul;", 235 }, /* small e with diaeresis */ { "ì", 236 }, /* small i with grave */ { "í", 237 }, /* small i with acute */ { "î", 238 }, /* small i with circumflex */ { "ï", 239 }, /* small i with diaeresis */ { "ð", 240 }, /* latin small letter eth */ { "ñ", 241 }, /* small n with tilde */ { "ò", 242 }, /* small o with grave */ { "ó", 243 }, /* small o with acute */ { "ô", 244 }, /* small o with circumflex */ { "õ", 245 }, /* small o with tilde */ { "ö", 246 }, /* small o with diaeresis */ { "÷", 247 }, /* division sign */ { "ø", 248 }, /* small o with slash */ { "ù", 249 }, /* small u with grave */ { "ú", 250 }, /* small u with acute */ { "û", 251 }, /* small u with circumflex */ { "ü", 252 }, /* small u with diaeresis */ { "ý", 253 }, /* small y with acute */ { "þ", 254 }, /* latin small letter thorn */ { "ÿ", 255 }, /* small y with diaeresis */ }; static unsigned int decode_html_entity( char *s, unsigned int *pi, int *err ) { int nhtml_entities = sizeof( html_entities ) / sizeof( entities ); char *e; int i, n=-1, len; for ( i=0; i