| ||||
| ||||
| ||||
Description | ||||
Unicode (UCS-2) and UTF-8 Conversion Funtions | ||||
Synopsis | ||||
Unicode Type declarations | ||||
type Unicode = Char | ||||
Unicode is represented as the Char type Precondition for this is the support of Unicode character range in the compiler (e.g. ghc but not hugs) | ||||
type UString = [Unicode] | ||||
the type for Unicode strings | ||||
type UTF8Char = Char | ||||
UTF-8 charachters are represented by the Char type | ||||
type UTF8String = String | ||||
UTF-8 strings are implemented as Haskell strings | ||||
Unicode and UTF-8 predicates | ||||
isLeadingMultiByteChar :: Char -> Bool | ||||
test for leading multibyte UTF-8 character | ||||
isFollowingMultiByteChar :: Char -> Bool | ||||
test for following multibyte UTF-8 character | ||||
isMultiByteChar :: Char -> Bool | ||||
test for following multibyte UTF-8 character | ||||
isNByteChar :: Unicode -> (Int, Int, Int) | ||||
compute the number of following bytes and the mask bits of a leading UTF-8 multibyte char | ||||
is1ByteXmlChar :: Unicode -> Bool | ||||
test for a legal 1 byte XML char | ||||
isMultiByteXmlChar :: Unicode -> Bool | ||||
test for a legal multi byte XML char | ||||
isXmlChar :: Unicode -> Bool | ||||
checking for valid XML characters | ||||
isXmlLatin1Char :: Unicode -> Bool | ||||
test for a legal latin1 XML char | ||||
isXmlSpaceChar :: Unicode -> Bool | ||||
checking for XML space character: \n, \r, \t and " " | ||||
isXml11SpaceChar :: Unicode -> Bool | ||||
checking for XML1.1 space character: additional space 0x85 and 0x2028 see also : isXmlSpaceChar | ||||
isXmlNameChar :: Unicode -> Bool | ||||
checking for XML name character | ||||
isXmlNameStartChar :: Unicode -> Bool | ||||
checking for XML name start character see also : isXmlNameChar | ||||
isXmlNCNameChar :: Unicode -> Bool | ||||
checking for XML NCName character: no ":" allowed see also : isXmlNameChar | ||||
isXmlNCNameStartChar :: Unicode -> Bool | ||||
checking for XML NCName start character: no ":" allowed see also : isXmlNameChar, isXmlNCNameChar | ||||
isXmlPubidChar :: Unicode -> Bool | ||||
checking for XML public id character | ||||
isXmlLetter :: Unicode -> Bool | ||||
checking for XML letter | ||||
isXmlBaseChar :: Unicode -> Bool | ||||
checking for XML base charater | ||||
isXmlIdeographicChar :: Unicode -> Bool | ||||
checking for XML ideographic charater | ||||
isXmlCombiningChar :: Unicode -> Bool | ||||
checking for XML combining charater | ||||
isXmlDigit :: Unicode -> Bool | ||||
checking for XML digit | ||||
isXmlExtender :: Unicode -> Bool | ||||
checking for XML extender | ||||
isXmlControlOrPermanentlyUndefined :: Unicode -> Bool | ||||
checking for XML control or permanently discouraged char see Errata to XML1.0 (http://www.w3.org/XML/xml-V10-2e-errata) No 46 Document authors are encouraged to avoid compatibility characters, as defined in section 6.8 of [Unicode] (see also D21 in section 3.6 of [Unicode3]). The characters defined in the following ranges are also discouraged. They are either control characters or permanently undefined Unicode characters: | ||||
UTF-8 and Unicode conversion functions | ||||
utf8ToUnicodeChar :: UTF8String -> Unicode | ||||
conversion of a UTF-8 encoded single Unicode character into the corresponding Unicode value. precondition: the character is a valid UTF-8 encoded character | ||||
utf8ToUnicode :: UTF8String -> UString | ||||
conversion of a UTF-8 encoded string into a sequence of unicode values. precondition: the string is a valid UTF-8 encoded string | ||||
utf8WithByteMarkToUnicode :: UTF8String -> UString | ||||
UTF-8 to Unicode conversion with deletion of leading byte order mark, as described in XML standard F.1 | ||||
latin1ToUnicode :: String -> UString | ||||
code conversion from latin1 to Unicode | ||||
ucs2ToUnicode :: String -> UString | ||||
UCS-2 to UTF-8 conversion with byte order mark analysis | ||||
ucs2BigEndianToUnicode :: String -> UString | ||||
UCS-2 big endian to Unicode conversion | ||||
ucs2LittleEndianToUnicode :: String -> UString | ||||
UCS-2 little endian to Unicode conversion | ||||
utf16beToUnicode :: String -> UString | ||||
UTF-16 big endian to UTF-8 conversion with removal of byte order mark | ||||
utf16leToUnicode :: String -> UString | ||||
UTF-16 little endian to UTF-8 conversion with removal of byte order mark | ||||
unicodeCharToUtf8 :: Unicode -> UTF8String | ||||
conversion from Unicode (Char) to a UTF8 encoded string. | ||||
unicodeToUtf8 :: UString -> UTF8String | ||||
conversion from Unicode strings (UString) to UTF8 encoded strings. | ||||
unicodeToXmlEntity :: UString -> String | ||||
substitute all Unicode characters, that are not legal 1-byte UTF-8 XML characters by a character reference. This function can be used to translate all text nodes and attribute values into pure ascii. see also : unicodeToLatin1 | ||||
unicodeToLatin1 :: UString -> String | ||||
substitute all Unicode characters, that are not legal latin1 UTF-8 XML characters by a character reference. This function can be used to translate all text nodes and attribute values into ISO latin1. see also : unicodeToXmlEntity | ||||
unicodeRemoveNoneAscii :: UString -> String | ||||
removes all non ascii chars, may be used to transform a document into a pure ascii representation by removing all non ascii chars from tag and attibute names see also : unicodeRemoveNoneLatin1, unicodeToXmlEntity | ||||
unicodeRemoveNoneLatin1 :: UString -> String | ||||
removes all non latin1 chars, may be used to transform a document into a pure ascii representation by removing all non ascii chars from tag and attibute names see also : unicodeRemoveNoneAscii, unicodeToLatin1 | ||||
intToCharRef :: Int -> String | ||||
convert an Unicode into a XML character reference. see also : intToCharRefHex | ||||
intToCharRefHex :: Int -> String | ||||
convert an Unicode into a XML hexadecimal character reference. see also: intToCharRef | ||||
getEncodingFct :: String -> Maybe (UString -> String) | ||||
the lookup function for selecting the encoding function | ||||
getOutputEncodingFct :: String -> Maybe (String -> UString) | ||||
the lookup function for selecting the encoding function | ||||
normalizeNL :: String -> String | ||||
White Space (XML Standard 2.3) and end of line handling (2.11) #x0D and #x0D#x0A are mapped to #x0A | ||||
guessEncoding :: String -> String | ||||
Produced by Haddock version 0.8 |