#include #include "gb18030.h"

/*
 * GB18030-2000 is an encoding of Unicode characters used in China
 *
 * {0x00-0x7f} are one byte characters identical to US-ASCII
 * {0x80} is properly undefined, but many GB18030 encodings make
 *        it the Euro sign (Unicode 0x20AC), so use that
 * {0x81-0xFE}{0x40-0x7E,0x80-0xFE} a full superset of GBK (with fallback
 *        mappings)
 * {0x81-0xFE}{0x30-0x39}{0x81-0xFE}{0x30-0x39} maps linearly to ISO 10646
 *        GB+81308130 = U+0080 up to U+FFFF
 *        GB+90308130 = U+10000 up to U+10FFFF skipping mappings already
 *        defined in 1-byte and 2-byte areas.
 *
 * Truth is it's a bit of a mess algorithmically as it doesn't multiply
 * encode characters, so there are holes in the Unicode mapping that
 * should be avoided.
 */

/* This is a "small" region that needs explicit enumeration */
#include "gb18030_enumeration.c"

/*
 * in_range()
 *
 * Test whether byte n lies in the closed interval [low, high].
 *
 * Returns 1 when low <= n <= high (both bounds inclusive), 0 otherwise.
 * gb18030_decode() uses it to classify the trail bytes of a sequence.
 */
static int in_range( unsigned char n, unsigned char low, unsigned char high ) { if ( n < low || n > high ) return 0; return 1; }

/* Get GB 18030 from Unicode Value in Table */ static int gb18030_unicode_table_lookup( unsigned int unicode, unsigned char out[4] ) { int i, j; if ( unicode >= 0x0080 && unicode <= 0xFFE5 ) { /* list is sorted, so should do binary search here */ for ( i=0; i elements. However, most four-byte GB 18030 mappings can be enumerated efficiently within distinct ranges. Therefore, we use elements for all but the 31000 or so assignments above. 
--> #endif } unsigned int gb18030_to_unicode( unsigned char *s, unsigned char len ) { unsigned int ret; int found; ret = gb18030_table_lookup( s, len, &found ); if ( !found && len==4 ) { ret = gb18030_range_lookup( s, &found ); if ( !found ) ret = '?'; } return ret; } /* * Convert unicode character to gb18030 * * returns number of characters for output */ int gb18030_encode( unsigned int unicode, unsigned char out[4] ) { int len; if ( unicode < 0x80 ) { out[0] = unicode; len = 1; } else { len = gb18030_unicode_table_lookup( unicode, out ); if ( !len ) len = gb18030_unicode_range_lookup( unicode, out ); } return len; } /* * Decode a gb18030 character into unicode */ unsigned int gb18030_decode( char *s, unsigned int *pi ) { unsigned int c; unsigned char uc[4]; int i = *pi; uc[0] = ( unsigned char ) s[i]; if ( ( uc[0] & 128 ) == 0 ) { c = ( unsigned int ) uc[0]; i += 1; } else if ( uc[0] == 0x80 ) { c = 0x20AC; i += 1; } else if ( uc[0] != 0xFF ) { /* multi-byte character */ uc[1] = ( unsigned char ) s[i+1]; uc[2] = ( unsigned char ) s[i+2]; uc[3]= ( unsigned char ) s[i+3]; if ( in_range( uc[1], 0x40, 0x7e ) || in_range( uc[1], 0x80, 0xfe ) ) { /* two-byte character */ c = gb18030_to_unicode( &(uc[0]), 2 ); i += 2; } else if ( in_range( uc[1], 0x30, 0x39 ) && in_range( uc[2], 0x81, 0xfe ) && in_range( uc[3], 0x30, 0x39 ) ) { /* four-byte character */ c = gb18030_to_unicode( &(uc[0]), 4 ); i += 4; } else { /* this is an illegal character */ c = '?'; i += 1; } } else { /* s[i]==0xFF */ /* this is an illegal character */ c = '?'; i += 1; } *pi = i; return c; }