/* * unicode.c * * Helper unicode functions/values to determine the * types of unicode characters. */ #include "utf8.h" #include "unicode.h" typedef struct { unsigned int value; unsigned short info; } unicodeinfo_t; static unicodeinfo_t unicodeinfo[] = { { 48, UNICODE_NUMBER }, /* 0 */ { 49, UNICODE_NUMBER }, /* 1 */ { 50, UNICODE_NUMBER }, /* 2 */ { 51, UNICODE_NUMBER }, /* 3 */ { 52, UNICODE_NUMBER }, /* 4 */ { 53, UNICODE_NUMBER }, /* 5 */ { 54, UNICODE_NUMBER }, /* 6 */ { 55, UNICODE_NUMBER }, /* 7 */ { 56, UNICODE_NUMBER }, /* 8 */ { 57, UNICODE_NUMBER }, /* 9 */ { 65, UNICODE_UPPER }, /* Latin Capital A */ { 66, UNICODE_UPPER }, /* Latin Capital B */ { 67, UNICODE_UPPER }, /* Latin Capital C */ { 68, UNICODE_UPPER }, /* Latin Capital D */ { 69, UNICODE_UPPER }, /* Latin Capital E */ { 70, UNICODE_UPPER }, /* Latin Capital F */ { 71, UNICODE_UPPER }, /* Latin Capital G */ { 72, UNICODE_UPPER }, /* Latin Capital H */ { 73, UNICODE_UPPER }, /* Latin Capital I */ { 74, UNICODE_UPPER }, /* Latin Capital J */ { 75, UNICODE_UPPER }, /* Latin Capital K */ { 76, UNICODE_UPPER }, /* Latin Capital L */ { 77, UNICODE_UPPER }, /* Latin Capital M */ { 78, UNICODE_UPPER }, /* Latin Capital N */ { 79, UNICODE_UPPER }, /* Latin Capital O */ { 80, UNICODE_UPPER }, /* Latin Capital P */ { 81, UNICODE_UPPER }, /* Latin Capital Q */ { 82, UNICODE_UPPER }, /* Latin Capital R */ { 83, UNICODE_UPPER }, /* Latin Capital S */ { 84, UNICODE_UPPER }, /* Latin Capital T */ { 85, UNICODE_UPPER }, /* Latin Capital U */ { 86, UNICODE_UPPER }, /* Latin Capital V */ { 87, UNICODE_UPPER }, /* Latin Capital W */ { 88, UNICODE_UPPER }, /* Latin Capital X */ { 89, UNICODE_UPPER }, /* Latin Capital Y */ { 90, UNICODE_UPPER }, /* Latin Capital Z */ { 97, UNICODE_LOWER }, /* Latin Small a */ { 98, UNICODE_LOWER }, /* Latin Small b */ { 99, UNICODE_LOWER }, /* Latin Small c */ { 100, UNICODE_LOWER }, /* Latin Small d */ { 101, UNICODE_LOWER }, /* Latin Small e */ { 102, UNICODE_LOWER }, /* Latin Small f */ { 103, UNICODE_LOWER }, /* Latin Small g */ { 104, UNICODE_LOWER }, /* Latin Small h */ { 105, UNICODE_LOWER }, /* Latin Small i */ { 106, UNICODE_LOWER }, /* Latin Small j */ { 107, UNICODE_LOWER }, /* Latin Small k */ { 108, UNICODE_LOWER }, /* Latin Small l */ { 109, UNICODE_LOWER }, /* Latin Small m */ { 110, UNICODE_LOWER }, /* Latin Small n */ { 111, UNICODE_LOWER }, /* Latin Small o */ { 112, UNICODE_LOWER }, /* Latin Small p */ { 113, UNICODE_LOWER }, /* Latin Small q */ { 114, UNICODE_LOWER }, /* Latin Small r */ { 115, UNICODE_LOWER }, /* Latin Small s */ { 116, UNICODE_LOWER }, /* Latin Small t */ { 117, UNICODE_LOWER }, /* Latin Small u */ { 118, UNICODE_LOWER }, /* Latin Small v */ { 119, UNICODE_LOWER }, /* Latin Small w */ { 120, UNICODE_LOWER }, /* Latin Small x */ { 121, UNICODE_LOWER }, /* Latin Small y */ { 122, UNICODE_LOWER }, /* Latin Small z */ { 192, UNICODE_UPPER }, /* Latin Capital A with grave */ { 193, UNICODE_UPPER }, /* Latin Capital A with acute */ { 194, UNICODE_UPPER }, /* Latin Capital A with circumflex */ { 195, UNICODE_UPPER }, /* Latin Capital A with tilde */ { 196, UNICODE_UPPER }, /* Latin Capital A with diuresis */ { 197, UNICODE_UPPER }, /* Latin Capital A with ring above */ { 198, UNICODE_UPPER }, /* Latin Capital AE */ { 199, UNICODE_UPPER }, /* Latin Capital C with cedilla */ { 200, UNICODE_UPPER }, /* Latin Capital E with grave */ { 201, UNICODE_UPPER }, /* Latin Capital E with acute */ { 202, UNICODE_UPPER }, /* Latin Capital E with circumflex */ { 203, UNICODE_UPPER }, /* Latin Capital E with diuresis */ { 204, UNICODE_UPPER }, /* Latin Capital I with grave */ { 205, UNICODE_UPPER }, /* Latin Capital I with acute */ { 206, UNICODE_UPPER }, /* Latin Capital I with circumflex */ { 207, UNICODE_UPPER }, /* Latin Capital I with diuresis */ { 208, UNICODE_UPPER }, /* Latin Capital ETH */ { 209, UNICODE_UPPER }, /* Latin Capital N with tilde */ { 210, UNICODE_UPPER }, /* Latin Capital O with grave */ { 211, UNICODE_UPPER }, /* Latin Capital O with acute */ { 212, UNICODE_UPPER }, /* Latin Capital O with circumflex */ { 213, UNICODE_UPPER }, /* Latin Capital O with tilde */ { 214, UNICODE_UPPER }, /* Latin Captial O with diaeresis */ { 216, UNICODE_UPPER }, /* Latin Capital O with stroke */ { 217, UNICODE_UPPER }, /* Latin Capital U with grave */ { 218, UNICODE_UPPER }, /* Latin Capital U with acute */ { 219, UNICODE_UPPER }, /* Latin Capital U with circumflex */ { 220, UNICODE_UPPER }, /* Latin Capital U with diaeresis */ { 221, UNICODE_UPPER }, /* Latin Capital Y with acute */ { 222, UNICODE_UPPER }, /* Latin Capital THORN */ { 223, UNICODE_LOWER }, /* German sz ligature */ { 224, UNICODE_LOWER }, /* Latin Small a with grave */ { 225, UNICODE_LOWER }, /* Latin Small a with acute */ { 226, UNICODE_LOWER }, /* Latin Small a with circumflex */ { 227, UNICODE_LOWER }, /* Latin Small a with tilde */ { 228, UNICODE_LOWER }, /* Latin Small a with diuresis */ { 229, UNICODE_LOWER }, /* Latin Small a with ring above */ { 230, UNICODE_LOWER }, /* Latin Small ae */ { 231, UNICODE_LOWER }, /* Latin Small c with cedilla */ { 232, UNICODE_LOWER }, /* Latin Small e with grave */ { 233, UNICODE_LOWER }, /* Latin Small e with acute */ { 234, UNICODE_LOWER }, /* Latin Small e with circumflex */ { 235, UNICODE_LOWER }, /* Latin Small e with diuresis */ { 236, UNICODE_LOWER }, /* Latin Small i with grave */ { 237, UNICODE_LOWER }, /* Latin Small i with acute */ { 238, UNICODE_LOWER }, /* Latin Small i with circumflex */ { 239, UNICODE_LOWER }, /* Latin Small i with diuresis */ { 240, UNICODE_LOWER }, /* Latin Small eth */ { 241, UNICODE_LOWER }, /* Latin Small n with tilde */ { 242, UNICODE_LOWER }, /* Latin Small o with grave */ { 243, UNICODE_LOWER }, /* Latin Small o with acute */ { 244, UNICODE_LOWER }, /* Latin Small o with circumflex */ { 245, UNICODE_LOWER }, /* Latin Small o with tilde */ { 246, UNICODE_LOWER }, /* Latin Small o with diaeresis */ { 248, UNICODE_LOWER }, /* Latin Small o with stroke */ { 249, UNICODE_LOWER }, /* Latin Small u with grave */ { 250, UNICODE_LOWER }, /* Latin Small u with acute */ { 251, UNICODE_LOWER }, /* Latin Small u with circumflex */ { 252, UNICODE_LOWER }, /* Latin Small u with diaeresis */ { 253, UNICODE_LOWER }, /* Latin Small y with acute */ { 254, UNICODE_LOWER }, /* Latin Small thorn */ { 255, UNICODE_LOWER }, /* Latin Small y with diaeresis */ { 256, UNICODE_UPPER }, /* Latin Capital A with macron */ { 257, UNICODE_LOWER }, /* Latin Small a with macron */ { 258, UNICODE_UPPER }, /* Latin Capital A with breve */ { 259, UNICODE_LOWER }, /* Latin Small a with breve */ { 260, UNICODE_UPPER }, /* Latin Capital A with ogonek */ { 261, UNICODE_LOWER }, /* Latin Small a with ogonek */ { 262, UNICODE_UPPER }, /* Latin Capital C with acute */ { 263, UNICODE_LOWER }, /* Latin Small c with acute */ { 264, UNICODE_UPPER }, /* Latin Capital C with circumflex */ { 265, UNICODE_LOWER }, /* Latin Small c with circumflex */ { 266, UNICODE_UPPER }, /* Latin Capital C with dot above */ { 267, UNICODE_LOWER }, /* Latin Small c with dot above */ { 268, UNICODE_UPPER }, /* Latin Capital C with caron (hacek) */ { 269, UNICODE_LOWER }, /* Latin Small c with caron (hacek) */ { 270, UNICODE_UPPER }, /* Latin Capital D with caron (hacek) */ { 271, UNICODE_LOWER }, /* Latin Small d with caron (hacek) */ { 272, UNICODE_UPPER }, /* Latin Capital D with stroke */ { 273, UNICODE_LOWER }, /* Latin Small d with stroke */ { 274, UNICODE_UPPER }, /* Latin Capital E with macron */ { 275, UNICODE_LOWER }, /* Latin Small e with macron */ { 276, UNICODE_UPPER }, /* Latin Capital E with breve */ { 277, UNICODE_LOWER }, /* Latin Small e with breve */ { 278, UNICODE_UPPER }, /* Latin Capital E with dot above */ { 279, UNICODE_LOWER }, /* Latin Small e with dot above */ { 280, UNICODE_UPPER }, /* Latin Capital E with ogonek */ { 281, UNICODE_LOWER }, /* Latin Small e with ogonek */ { 282, UNICODE_UPPER }, /* Latin Capital E with caron (hacek) */ { 283, UNICODE_LOWER }, /* Latin Small e with caron */ { 284, UNICODE_UPPER }, /* Latin Capital G with circumflex */ { 285, UNICODE_LOWER }, /* Latin Small g with circumflex */ { 286, UNICODE_UPPER }, /* Latin Capital G with breve */ { 287, UNICODE_LOWER }, /* Latin Small g with breve */ { 288, UNICODE_UPPER }, /* Latin Capital G with dot above */ { 289, UNICODE_LOWER }, /* Latin Small g with dot above */ { 290, UNICODE_UPPER }, /* Latin Capital G with cedilla */ { 291, UNICODE_LOWER }, /* Latin Small g with cedilla */ { 292, UNICODE_UPPER }, /* Latin Capital H with circumflex */ { 293, UNICODE_LOWER }, /* Latin Small h with circumflex */ { 294, UNICODE_UPPER }, /* Latin Capital H with stroke */ { 295, UNICODE_LOWER }, /* Latin Small h with stroke */ { 296, UNICODE_UPPER }, /* Latin Capital I with tilde */ { 297, UNICODE_LOWER }, /* Latin Small i with tilde */ { 298, UNICODE_UPPER }, /* Latin Capital I with macron */ { 299, UNICODE_LOWER }, /* Latin Small i with macron */ { 300, UNICODE_UPPER }, /* Latin Capital I with breve */ { 301, UNICODE_LOWER }, /* Latin Small i with breve */ { 302, UNICODE_UPPER }, /* Latin Capital I with ogonek */ { 303, UNICODE_LOWER }, /* Latin Small i with ogonek */ { 304, UNICODE_UPPER }, /* Latin Capital I with dot above */ { 305, UNICODE_LOWER }, /* Latin Small i without dot above */ { 306, UNICODE_UPPER }, /* Latin Capital IJ */ { 307, UNICODE_LOWER }, /* Latin Small IJ */ { 308, UNICODE_UPPER }, /* Latin Capital J with circumflex */ { 309, UNICODE_LOWER }, /* Latin Small j with circumflex */ { 310, UNICODE_UPPER }, /* Latin Capital K with cedilla */ { 311, UNICODE_LOWER }, /* Latin Small j with cedilla */ { 312, UNICODE_LOWER }, /* Latin Small kra */ { 313, UNICODE_UPPER }, /* Latin Capital L with acute */ { 314, UNICODE_LOWER }, /* Latin Small l with acute */ { 315, UNICODE_UPPER }, /* Latin Capital L with cedilla */ { 316, UNICODE_LOWER }, /* Latin Small l with cedilla */ { 317, UNICODE_UPPER }, /* Latin Capital L with caron */ { 318, UNICODE_LOWER }, /* Latin Small l with caron */ { 319, UNICODE_UPPER }, /* Latin Capital L with middle dot */ { 320, UNICODE_LOWER }, /* Latin Small l with middle dot */ { 321, UNICODE_UPPER }, /* Latin Capital L with stroke */ { 322, UNICODE_LOWER }, /* Latin Small l with stroke */ { 323, UNICODE_UPPER }, /* Latin Capital N with acute */ { 324, UNICODE_LOWER }, /* Latin Small n with acute */ { 325, UNICODE_UPPER }, /* Latin Capital N with cedilla */ { 326, UNICODE_LOWER }, /* Latin Small n with cedilla */ { 327, UNICODE_UPPER }, /* Latin Capital N with caron */ { 328, UNICODE_LOWER }, /* Latin Small n with caron */ { 329, UNICODE_LOWER }, /* Latin Small n preceeded by apostrophe */ { 330, UNICODE_UPPER }, /* Latin Capital Eng */ { 331, UNICODE_LOWER }, /* Latin Small eng */ { 332, UNICODE_UPPER }, /* Latin Capital O with macron */ { 333, UNICODE_LOWER }, /* Latin Small o with macron */ { 334, UNICODE_UPPER }, /* Latin Capital O with breve */ { 335, UNICODE_LOWER }, /* Latin Small o with breve */ { 336, UNICODE_UPPER }, /* Latin Capital O with double acute */ { 337, UNICODE_LOWER }, /* Latin Small o with double acute */ { 338, UNICODE_UPPER }, /* Latin Capital OE */ { 339, UNICODE_LOWER }, /* Latin Small oe */ { 340, UNICODE_UPPER }, /* Latin Capital R with acute */ { 341, UNICODE_LOWER }, /* Latin Small r with acute */ { 342, UNICODE_UPPER }, /* Latin Capital R with cedilla */ { 343, UNICODE_LOWER }, /* Latin Small r with cedilla */ { 344, UNICODE_UPPER }, /* Latin Capital R with caron */ { 345, UNICODE_LOWER }, /* Latin Small r with caron */ { 346, UNICODE_UPPER }, /* Latin Capital S with acute */ { 347, UNICODE_LOWER }, /* Latin Small s with acute */ { 348, UNICODE_UPPER }, /* Latin Capital S with circumflex */ { 349, UNICODE_LOWER }, /* Latin Small s with circumflex */ { 350, UNICODE_UPPER }, /* Latin Capital S with cedilla */ { 351, UNICODE_LOWER }, /* Latin Small s with cedilla */ { 352, UNICODE_UPPER }, /* Latin Capital S with caron */ { 353, UNICODE_LOWER }, /* Latin Small s with caron */ { 354, UNICODE_UPPER }, /* Latin Capital T with cedilla */ { 355, UNICODE_LOWER }, /* Latin Small t with cedilla */ { 356, UNICODE_UPPER }, /* Latin Capital T with caron */ { 357, UNICODE_LOWER }, /* Latin Small t with caron */ { 358, UNICODE_UPPER }, /* Latin Capital T with stroke */ { 359, UNICODE_LOWER }, /* Latin Small t with stroke */ { 360, UNICODE_UPPER }, /* Latin Capital U with tilde */ { 361, UNICODE_LOWER }, /* Latin Small u with tilde */ { 362, UNICODE_UPPER }, /* Latin Capital U with macron */ { 363, UNICODE_LOWER }, /* Latin Small u with macron */ { 364, UNICODE_UPPER }, /* Latin Capital U with breve */ { 365, UNICODE_LOWER }, /* Latin Small u with breve */ { 366, UNICODE_UPPER }, /* Latin Capital U with ring above */ { 367, UNICODE_LOWER }, /* Latin Small u with ring above */ { 368, UNICODE_UPPER }, /* Latin Capital U with double acute */ { 369, UNICODE_LOWER }, /* Latin Small u with double acute */ { 370, UNICODE_UPPER }, /* Latin Capital U with ogonek */ { 371, UNICODE_LOWER }, /* Latin Small u with ogonek */ { 372, UNICODE_UPPER }, /* Latin Capital W with circumflex */ { 373, UNICODE_LOWER }, /* Latin Small w with circumflex */ { 374, UNICODE_UPPER }, /* Latin Capital Y with circumflex */ { 375, UNICODE_LOWER }, /* Latin Small y with circumflex */ { 376, UNICODE_UPPER }, /* Latin Capital Y with diaeresis */ { 377, UNICODE_UPPER }, /* Latin Capital Z with acute */ { 378, UNICODE_LOWER }, /* Latin Small z with acute */ { 379, UNICODE_UPPER }, /* Latin Capital Z with dot above */ { 380, UNICODE_LOWER }, /* Latin Small z with dot above */ { 381, UNICODE_UPPER }, /* Latin Capital Z with caron */ { 382, UNICODE_LOWER }, /* Latin Small z with caron */ { 383, UNICODE_LOWER }, /* Latin Small long S */ { 461, UNICODE_UPPER }, /* Latin Capital A with caron (hacek) */ { 462, UNICODE_LOWER }, /* Latin Small a with caron (hacek) */ { 463, UNICODE_UPPER }, /* Latin Capital I with caron (hacek) */ { 464, UNICODE_LOWER }, /* Latin Small i with caron (hacek) */ { 465, UNICODE_UPPER }, /* Latin Capital O with caron (hacek) */ { 466, UNICODE_LOWER }, /* Latin Small o with caron (hacek) */ { 467, UNICODE_UPPER }, /* Latin Capital U with caron (hacek) */ { 468, UNICODE_LOWER }, /* Latin Small u with caron (hacek) */ { 486, UNICODE_UPPER }, /* Latin Capital G with caron */ { 487, UNICODE_LOWER }, /* Latin Small g with caron */ { 488, UNICODE_UPPER }, /* Latin Capital J with caron */ { 489, UNICODE_LOWER }, /* Latin Small j with caron */ { 490, UNICODE_UPPER }, /* Latin Capital O with caron */ { 491, UNICODE_LOWER }, /* Latin Small o with caron */ { 500, UNICODE_UPPER }, /* Latin Capital G with acute */ { 501, UNICODE_LOWER }, /* Latin Small g with caron */ }; static int nunicodeinfo = sizeof( unicodeinfo ) / sizeof( unicodeinfo[0] ); static int unicode_find( unsigned int unicode_character ) { int min = 0, max = nunicodeinfo, mid; while ( min < max ) { mid = ( min + max ) / 2; if ( unicodeinfo[mid].value < unicode_character ) min = mid + 1; else max = mid; } if ( ( max==min ) && ( unicodeinfo[min].value == unicode_character ) ) return min; else return -1; } unsigned short unicode_utf8_classify( char *p ) { unsigned int unicode_character, pos = 0; int n; unicode_character = utf8_decode( p, &pos ); n = unicode_find( unicode_character ); if ( n==-1 ) return UNICODE_SYMBOL; else return unicodeinfo[n].info; } unsigned short unicode_utf8_classify_newstr( newstr *s ) { unsigned int unicode_character, pos = 0; unsigned short value = 0; int n; while ( pos < s->len ) { unicode_character = utf8_decode( s->data, &pos ); n = unicode_find( unicode_character ); if ( n==-1 ) value |= UNICODE_SYMBOL; else value |= unicodeinfo[n].info; } return value; }