//////////////////////////////////////////////////////////// // // SFML - Simple and Fast Multimedia Library // Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com) // // This software is provided 'as-is', without any express or implied warranty. // In no event will the authors be held liable for any damages arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it freely, // subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; // you must not claim that you wrote the original software. // If you use this software in a product, an acknowledgment // in the product documentation would be appreciated but is not required. // // 2. Altered source versions must be plainly marked as such, // and must not be misrepresented as being the original software. // // 3. This notice may not be removed or altered from any source distribution. // //////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////// // References : // // http://www.unicode.org/ // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h // http://people.w3.org/rishida/scripts/uniview/conversion // //////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////// template In Utf<8>::Decode(In begin, In end, Uint32& output, Uint32 replacement) { // Some useful precomputed data static const int trailing[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; static const Uint32 offsets[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; // Decode the character int trailingBytes = trailing[static_cast(*begin)]; if (begin + trailingBytes < end) { output = 0; switch (trailingBytes) { case 5 : output += static_cast(*begin++); output <<= 6; case 4 : output += static_cast(*begin++); output <<= 6; case 3 : output += static_cast(*begin++); output <<= 6; case 2 : output += static_cast(*begin++); output <<= 6; case 1 : output += static_cast(*begin++); output <<= 6; case 0 : output += static_cast(*begin++); } output -= offsets[trailingBytes]; } else { // Incomplete character begin = end; output = replacement; } return begin; } //////////////////////////////////////////////////////////// template Out Utf<8>::Encode(Uint32 input, Out output, Uint8 replacement) { // Some useful precomputed data static const Uint8 firstBytes[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; // Encode the character if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF))) { // Invalid character if (replacement) *output++ = replacement; } else { // Valid character // Get the number of bytes to write int bytesToWrite = 1; if (input < 0x80) bytesToWrite = 1; else if (input < 0x800) bytesToWrite = 2; else if (input < 0x10000) bytesToWrite = 3; else if (input <= 0x0010FFFF) bytesToWrite = 4; // Extract the bytes to write Uint8 bytes[4]; switch (bytesToWrite) { case 4 : bytes[3] = static_cast((input | 0x80) & 0xBF); input >>= 6; case 3 : bytes[2] = static_cast((input | 0x80) & 0xBF); input >>= 6; case 2 : bytes[1] = static_cast((input | 0x80) & 0xBF); input >>= 6; case 1 : bytes[0] = static_cast (input | firstBytes[bytesToWrite]); } // Add them to the output const Uint8* currentByte = bytes; switch (bytesToWrite) { case 4 : *output++ = *currentByte++; case 3 : *output++ = *currentByte++; case 2 : *output++ = *currentByte++; case 1 : *output++ = *currentByte++; } } return output; } //////////////////////////////////////////////////////////// template In Utf<8>::Next(In begin, In end) { Uint32 codepoint; return Decode(begin, end, codepoint); } //////////////////////////////////////////////////////////// template std::size_t Utf<8>::Count(In begin, In end) { std::size_t length = 0; while (begin < end) { begin = Next(begin, end); ++length; } return length; } //////////////////////////////////////////////////////////// template Out Utf<8>::FromAnsi(In begin, In end, Out output, const std::locale& locale) { while (begin < end) { Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); output = Encode(codepoint, output); } return output; } //////////////////////////////////////////////////////////// template Out Utf<8>::FromWide(In begin, In end, Out output) { while (begin < end) { Uint32 codepoint = Utf<32>::DecodeWide(*begin++); output = Encode(codepoint, output); } return output; } //////////////////////////////////////////////////////////// template Out Utf<8>::FromLatin1(In begin, In end, Out output) { // Latin-1 is directly compatible with Unicode encodings, // and can thus be treated as (a sub-range of) UTF-32 while (begin < end) output = Encode(*begin++, output); return output; } //////////////////////////////////////////////////////////// template Out Utf<8>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) { while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); } return output; } //////////////////////////////////////////////////////////// template Out Utf<8>::ToWide(In begin, In end, Out output, wchar_t replacement) { while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); output = Utf<32>::EncodeWide(codepoint, output, replacement); } return output; } //////////////////////////////////////////////////////////// template Out Utf<8>::ToLatin1(In begin, In end, Out output, char replacement) { // Latin-1 is directly compatible with Unicode encodings, // and can thus be treated as (a sub-range of) UTF-32 while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); *output++ = codepoint < 256 ? static_cast(codepoint) : replacement; } return output; } //////////////////////////////////////////////////////////// template Out Utf<8>::ToUtf8(In begin, In end, Out output) { while (begin < end) *output++ = *begin++; return output; } //////////////////////////////////////////////////////////// template Out Utf<8>::ToUtf16(In begin, In end, Out output) { while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); output = Utf<16>::Encode(codepoint, output); } return output; } //////////////////////////////////////////////////////////// template Out Utf<8>::ToUtf32(In begin, In end, Out output) { while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); *output++ = codepoint; } return output; } //////////////////////////////////////////////////////////// template In Utf<16>::Decode(In begin, In end, Uint32& output, Uint32 replacement) { Uint16 first = *begin++; // If it's a surrogate pair, first convert to a single UTF-32 character if ((first >= 0xD800) && (first <= 0xDBFF)) { if (begin < end) { Uint32 second = *begin++; if ((second >= 0xDC00) && (second <= 0xDFFF)) { // The second element is valid: convert the two elements to a UTF-32 character output = static_cast(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000); } else { // Invalid character output = replacement; } } else { // Invalid character begin = end; output = replacement; } } else { // We can make a direct copy output = first; } return begin; } //////////////////////////////////////////////////////////// template Out Utf<16>::Encode(Uint32 input, Out output, Uint16 replacement) { if (input < 0xFFFF) { // The character can be copied directly, we just need to check if it's in the valid range if ((input >= 0xD800) && (input <= 0xDFFF)) { // Invalid character (this range is reserved) if (replacement) *output++ = replacement; } else { // Valid character directly convertible to a single UTF-16 character *output++ = static_cast(input); } } else if (input > 0x0010FFFF) { // Invalid character (greater than the maximum unicode value) if (replacement) *output++ = replacement; } else { // The input character will be converted to two UTF-16 elements input -= 0x0010000; *output++ = static_cast((input >> 10) + 0xD800); *output++ = static_cast((input & 0x3FFUL) + 0xDC00); } return output; } //////////////////////////////////////////////////////////// template In Utf<16>::Next(In begin, In end) { Uint32 codepoint; return Decode(begin, end, codepoint); } //////////////////////////////////////////////////////////// template std::size_t Utf<16>::Count(In begin, In end) { std::size_t length = 0; while (begin < end) { begin = Next(begin, end); ++length; } return length; } //////////////////////////////////////////////////////////// template Out Utf<16>::FromAnsi(In begin, In end, Out output, const std::locale& locale) { while (begin < end) { Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); output = Encode(codepoint, output); } return output; } //////////////////////////////////////////////////////////// template Out Utf<16>::FromWide(In begin, In end, Out output) { while (begin < end) { Uint32 codepoint = Utf<32>::DecodeWide(*begin++); output = Encode(codepoint, output); } return output; } //////////////////////////////////////////////////////////// template Out Utf<16>::FromLatin1(In begin, In end, Out output) { // Latin-1 is directly compatible with Unicode encodings, // and can thus be treated as (a sub-range of) UTF-32 while (begin < end) *output++ = *begin++; return output; } //////////////////////////////////////////////////////////// template Out Utf<16>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) { while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); } return output; } //////////////////////////////////////////////////////////// template Out Utf<16>::ToWide(In begin, In end, Out output, wchar_t replacement) { while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); output = Utf<32>::EncodeWide(codepoint, output, replacement); } return output; } //////////////////////////////////////////////////////////// template Out Utf<16>::ToLatin1(In begin, In end, Out output, char replacement) { // Latin-1 is directly compatible with Unicode encodings, // and can thus be treated as (a sub-range of) UTF-32 while (begin < end) { *output++ = *begin < 256 ? static_cast(*begin) : replacement; begin++; } return output; } //////////////////////////////////////////////////////////// template Out Utf<16>::ToUtf8(In begin, In end, Out output) { while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); output = Utf<8>::Encode(codepoint, output); } return output; } //////////////////////////////////////////////////////////// template Out Utf<16>::ToUtf16(In begin, In end, Out output) { while (begin < end) *output++ = *begin++; return output; } //////////////////////////////////////////////////////////// template Out Utf<16>::ToUtf32(In begin, In end, Out output) { while (begin < end) { Uint32 codepoint; begin = Decode(begin, end, codepoint); *output++ = codepoint; } return output; } //////////////////////////////////////////////////////////// template In Utf<32>::Decode(In begin, In end, Uint32& output, Uint32) { output = *begin++; return begin; } //////////////////////////////////////////////////////////// template Out Utf<32>::Encode(Uint32 input, Out output, Uint32 replacement) { *output++ = input; return output; } //////////////////////////////////////////////////////////// template In Utf<32>::Next(In begin, In end) { return ++begin; } //////////////////////////////////////////////////////////// template std::size_t Utf<32>::Count(In begin, In end) { return begin - end; } //////////////////////////////////////////////////////////// template Out Utf<32>::FromAnsi(In begin, In end, Out output, const std::locale& locale) { while (begin < end) *output++ = DecodeAnsi(*begin++, locale); return output; } //////////////////////////////////////////////////////////// template Out Utf<32>::FromWide(In begin, In end, Out output) { while (begin < end) *output++ = DecodeWide(*begin++); return output; } //////////////////////////////////////////////////////////// template Out Utf<32>::FromLatin1(In begin, In end, Out output) { // Latin-1 is directly compatible with Unicode encodings, // and can thus be treated as (a sub-range of) UTF-32 while (begin < end) *output++ = *begin++; return output; } //////////////////////////////////////////////////////////// template Out Utf<32>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) { while (begin < end) output = EncodeAnsi(*begin++, output, replacement, locale); return output; } //////////////////////////////////////////////////////////// template Out Utf<32>::ToWide(In begin, In end, Out output, wchar_t replacement) { while (begin < end) output = EncodeWide(*begin++, output, replacement); return output; } //////////////////////////////////////////////////////////// template Out Utf<32>::ToLatin1(In begin, In end, Out output, char replacement) { // Latin-1 is directly compatible with Unicode encodings, // and can thus be treated as (a sub-range of) UTF-32 while (begin < end) { *output++ = *begin < 256 ? static_cast(*begin) : replacement; begin++; } return output; } //////////////////////////////////////////////////////////// template Out Utf<32>::ToUtf8(In begin, In end, Out output) { while (begin < end) output = Utf<8>::Encode(*begin++, output); return output; } //////////////////////////////////////////////////////////// template Out Utf<32>::ToUtf16(In begin, In end, Out output) { while (begin < end) output = Utf<16>::Encode(*begin++, output); return output; } //////////////////////////////////////////////////////////// template Out Utf<32>::ToUtf32(In begin, In end, Out output) { while (begin < end) *output++ = *begin++; return output; } //////////////////////////////////////////////////////////// template Uint32 Utf<32>::DecodeAnsi(In input, const std::locale& locale) { // On Windows, gcc's standard library (glibc++) has almost // no support for Unicode stuff. As a consequence, in this // context we can only use the default locale and ignore // the one passed as parameter. #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */ wchar_t character = 0; mbtowc(&character, &input, 1); return static_cast(character); #else // Get the facet of the locale which deals with character conversion const std::ctype& facet = std::use_facet< std::ctype >(locale); // Use the facet to convert each character of the input string return static_cast(facet.widen(input)); #endif } //////////////////////////////////////////////////////////// template Uint32 Utf<32>::DecodeWide(In input) { // The encoding of wide characters is not well defined and is left to the system; // however we can safely assume that it is UCS-2 on Windows and // UCS-4 on Unix systems. // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4, // and UCS-4 *is* UTF-32). return input; } //////////////////////////////////////////////////////////// template Out Utf<32>::EncodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale) { // On Windows, gcc's standard library (glibc++) has almost // no support for Unicode stuff. As a consequence, in this // context we can only use the default locale and ignore // the one passed as parameter. #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */ char character = 0; if (wctomb(&character, static_cast(codepoint)) >= 0) *output++ = character; else if (replacement) *output++ = replacement; return output; #else // Get the facet of the locale which deals with character conversion const std::ctype& facet = std::use_facet< std::ctype >(locale); // Use the facet to convert each character of the input string *output++ = facet.narrow(static_cast(codepoint), replacement); return output; #endif } //////////////////////////////////////////////////////////// template Out Utf<32>::EncodeWide(Uint32 codepoint, Out output, wchar_t replacement) { // The encoding of wide characters is not well defined and is left to the system; // however we can safely assume that it is UCS-2 on Windows and // UCS-4 on Unix systems. // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4). // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32). switch (sizeof(wchar_t)) { case 4: { *output++ = static_cast(codepoint); break; } default: { if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF))) { *output++ = static_cast(codepoint); } else if (replacement) { *output++ = replacement; } break; } } return output; }