/*
 * utf8.c
 *
 * Copyright (c) Chris Putnam 2004-2013
 *
 * Source code released under the GPL version 2
 *
 */
/* NOTE(review): the first #include below has no header name in this copy of
 * the file (it looks like it was lost in extraction) -- restore it from the
 * upstream source before building.
 */
#include #include "utf8.h"

/* UTF-8 encoding
 *
 * U-00000000 - U-0000007F: 0xxxxxxx
 * U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
 * U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 */

/* utf8_build()
 *
 * Copy the low-order bits of value into the 'x' (data) positions of out[].
 *
 * value   - character code whose bits are distributed
 * out     - output bytes; bits are only OR-ed in and never cleared, so the
 *           caller must already have stored the marker bits of every byte
 *           (utf8_encode() does this before calling)
 * in_pos  - first bit of value to copy, counted from the most significant
 *           bit of a 32-bit word (0 = bit 31, 31 = bit 0); bits in_pos..31
 *           are copied in that order
 * out_pos - bit position inside out[0] that receives the first copied bit,
 *           counted from the most significant bit of the byte (0..7)
 *
 * NOTE(review): the masks are built with a signed "1 << n".  Every call in
 * this file passes in_pos >= 1, so the largest shift performed is 30.
 */
static void utf8_build( unsigned int value, unsigned char out[6], int in_pos, int out_pos )
{
	unsigned int in_mask, out_mask;
	int byte = 0; /* index of the output byte currently being filled */
	while ( in_pos < 32 ) {
		in_mask = 1 << ( 31 - in_pos );
		out_mask = 1 << ( 7 - out_pos );
		if ( value & in_mask ) out[byte] |= out_mask;
		in_pos++;
		out_pos++;
		/* past the last bit of this byte: move on to the next byte and
		 * resume at position 2, i.e. after the two leading "10" marker
		 * bits of a continuation byte */
		if ( out_pos > 7 ) { out_pos=2; byte++; }
	}
}

/* int utf8_encode( value, out[6] );
 *
 * Encode one character code as a UTF-8 byte sequence.
 *
 * value  - character code, encodable range 0x0 -> 0x7FFFFFFF
 * out    - receives the encoded bytes; out[1]..out[5] are always overwritten
 *          with 0x80 first, whatever the length of the final sequence
 *
 * Returns the number of bytes in the sequence (1 to 6), or 0 if value is
 * 0x80000000 or larger; in that case out[0] is not written.
 */
int utf8_encode( unsigned int value, unsigned char out[6] )
{
	int i;
	/* preset every possible continuation byte with its marker bits */
	for ( i=1; i<6; ++i ) out[i] = 0x80; /* 10xxxxxx */
	if ( value < 0x80 ) {
		out[0] = 0x0; /* 0xxxxxxx: 7 data bits */
		utf8_build( value, out, 25, 1 );
		return 1;
	} else if ( value < 0x800 ) {
		out[0] = 0xC0; /* 110xxxxx: 11 data bits over 2 bytes */
		utf8_build( value, out, 21, 3 );
		return 2;
	} else if ( value < 0x10000 ) {
		out[0] = 0xE0; /* 1110xxxx: 16 data bits over 3 bytes */
		utf8_build( value, out, 16, 4 );
		return 3;
	} else if ( value < 0x200000 ) {
		out[0] = 0xF0; /* 11110xxx: 21 data bits over 4 bytes */
		utf8_build( value, out, 11, 5 );
		return 4;
	} else if ( value < 0x4000000 ) {
		out[0] = 0xF8; /* 111110xx: 26 data bits over 5 bytes */
		utf8_build( value, out, 6, 6 );
		return 5;
	} else if ( value < (unsigned int ) 0x80000000 ) {
		out[0] = 0xFC; /* 1111110x: 31 data bits over 6 bytes */
		utf8_build( value, out, 1, 7 );
		return 6;
	} else {
		/* error: values of 2^31 and above cannot be represented by the
		 * six-byte scheme in the table at the top of this file */
		return 0;
	}
}

/* Generate a UTF-8 character as a null-terminated string.
 *
 * value  - character code, passed unchanged to utf8_encode()
 * outstr - output buffer of 7 chars (presumably up to 6 encoded bytes plus
 *          the terminator -- confirm against the full source)
 *
 * NOTE(review): this copy of the file is cut off inside the loop header
 * below; the rest of the function is not visible here.
 */
void utf8_encode_str( unsigned int value, char outstr[7] )
{
	unsigned char encoded[6];
	int i, n;
	n = utf8_encode( value, encoded );
	for ( i=0; i