-- ------------------------------------------------------------ {- | Module : Data.String.Unicode Copyright : Copyright (C) 2010- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : stable Portability: portable Unicode and UTF-8 Conversion Functions -} -- ------------------------------------------------------------ module Data.String.Unicode ( -- * Unicode Type declarations Unicode, UString, UTF8Char, UTF8String, UStringWithErrors, DecodingFct, DecodingFctEmbedErrors, utf8ToUnicode , utf8ToUnicodeEmbedErrors , latin1ToUnicode , ucs2ToUnicode , ucs2BigEndianToUnicode , ucs2LittleEndianToUnicode , utf16beToUnicode , utf16leToUnicode , unicodeCharToUtf8 , unicodeToUtf8 , unicodeToXmlEntity , unicodeToLatin1 , unicodeRemoveNoneAscii , unicodeRemoveNoneLatin1 , intToCharRef , intToCharRefHex , intToHexString , getDecodingFct , getDecodingFctEmbedErrors , getOutputEncodingFct , normalizeNL , guessEncoding , getOutputEncodingFct' , unicodeCharToUtf8' , unicodeCharToXmlEntity' , unicodeCharToLatin1' ) where import Data.Char ( toUpper ) import Data.Char.Properties.XMLCharProps( isXml1ByteChar , isXmlLatin1Char ) import Data.Char.IsoLatinTables import Data.String.UTF8Decoding ( decodeUtf8, decodeUtf8EmbedErrors ) import Data.String.EncodingNames -- ------------------------------------------------------------ -- | Unicode is represented as the Char type -- Precondition for this is the support of Unicode character range -- in the compiler (e.g. ghc but not hugs) type Unicode = Char -- | the type for Unicode strings type UString = [Unicode] -- | UTF-8 charachters are represented by the Char type type UTF8Char = Char -- | UTF-8 strings are implemented as Haskell strings type UTF8String = String -- | Decoding function with a pair containing the result string and a list of decoding errors as result type DecodingFct = String -> (UString, [String]) type UStringWithErrors = [Either String Char] -- | Decoding function where decoding errors are interleaved with decoded characters type DecodingFctEmbedErrors = String -> UStringWithErrors -- ------------------------------------------------------------ -- | -- conversion from Unicode strings (UString) to UTF8 encoded strings. unicodeToUtf8 :: UString -> UTF8String unicodeToUtf8 = concatMap unicodeCharToUtf8 -- | -- conversion from Unicode (Char) to a UTF8 encoded string. unicodeCharToUtf8 :: Unicode -> UTF8String unicodeCharToUtf8 c | i >= 0 && i <= 0x0000007F -- 1 byte UTF8 (7 bits) = [ toEnum i ] | i >= 0x00000080 && i <= 0x000007FF -- 2 byte UTF8 (5 + 6 bits) = [ toEnum (0xC0 + i `div` 0x40) , toEnum (0x80 + i `mod` 0x40) ] | i >= 0x00000800 && i <= 0x0000FFFF -- 3 byte UTF8 (4 + 6 + 6 bits) = [ toEnum (0xE0 + i `div` 0x1000) , toEnum (0x80 + (i `div` 0x40) `mod` 0x40) , toEnum (0x80 + i `mod` 0x40) ] | i >= 0x00010000 && i <= 0x001FFFFF -- 4 byte UTF8 (3 + 6 + 6 + 6 bits) = [ toEnum (0xF0 + i `div` 0x40000) , toEnum (0x80 + (i `div` 0x1000) `mod` 0x40) , toEnum (0x80 + (i `div` 0x40) `mod` 0x40) , toEnum (0x80 + i `mod` 0x40) ] | i >= 0x00200000 && i <= 0x03FFFFFF -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits) = [ toEnum (0xF8 + i `div` 0x1000000) , toEnum (0x80 + (i `div` 0x40000) `mod` 0x40) , toEnum (0x80 + (i `div` 0x1000) `mod` 0x40) , toEnum (0x80 + (i `div` 0x40) `mod` 0x40) , toEnum (0x80 + i `mod` 0x40) ] | i >= 0x04000000 && i <= 0x7FFFFFFF -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits) = [ toEnum (0xFC + i `div` 0x40000000) , toEnum (0x80 + (i `div` 0x1000000) `mod` 0x40) , toEnum (0x80 + (i `div` 0x40000) `mod` 0x40) , toEnum (0x80 + (i `div` 0x1000) `mod` 0x40) , toEnum (0x80 + (i `div` 0x40) `mod` 0x40) , toEnum (0x80 + i `mod` 0x40) ] | otherwise -- other values not supported = error ("unicodeCharToUtf8: illegal integer argument " ++ show i) where i = fromEnum c -- ------------------------------------------------------------ -- | -- code conversion from latin1 to Unicode latin1ToUnicode :: String -> UString latin1ToUnicode = id latinToUnicode :: [(Char, Char)] -> String -> UString latinToUnicode tt = map charToUni where charToUni c = foldr (\(src,dst) r -> case compare c src of EQ -> dst LT -> c {- not found in table -} GT -> r) c tt -- | conversion from ASCII to unicode with check for legal ASCII char set -- -- Structure of decoding function copied from 'Data.Char.UTF8.decode'. decodeAscii :: DecodingFct decodeAscii = swap . partitionEither . decodeAsciiEmbedErrors decodeAsciiEmbedErrors :: String -> UStringWithErrors decodeAsciiEmbedErrors str = map (\(c,pos) -> if isValid c then Right c else Left (toErrStr c pos)) posStr where posStr = zip str [(0::Int)..] toErrStr errChr pos = " at input position " ++ show pos ++ ": none ASCII char " ++ show errChr isValid x = x < '\x80' -- | -- UCS-2 big endian to Unicode conversion ucs2BigEndianToUnicode :: String -> UString ucs2BigEndianToUnicode (b : l : r) = toEnum (fromEnum b * 256 + fromEnum l) : ucs2BigEndianToUnicode r ucs2BigEndianToUnicode [] = [] ucs2BigEndianToUnicode _ = [] -- error "illegal UCS-2 byte input sequence with odd length" -- is ignored (garbage in, garbage out) -- ------------------------------------------------------------ -- | -- UCS-2 little endian to Unicode conversion ucs2LittleEndianToUnicode :: String -> UString ucs2LittleEndianToUnicode (l : b : r) = toEnum (fromEnum b * 256 + fromEnum l) : ucs2LittleEndianToUnicode r ucs2LittleEndianToUnicode [] = [] ucs2LittleEndianToUnicode [_] = [] -- error "illegal UCS-2 byte input sequence with odd length" -- is ignored -- ------------------------------------------------------------ -- | -- UCS-2 to UTF-8 conversion with byte order mark analysis ucs2ToUnicode :: String -> UString ucs2ToUnicode ('\xFE':'\xFF':s) -- 2 byte mark for big endian encoding = ucs2BigEndianToUnicode s ucs2ToUnicode ('\xFF':'\xFE':s) -- 2 byte mark for little endian encoding = ucs2LittleEndianToUnicode s ucs2ToUnicode s = ucs2BigEndianToUnicode s -- default: big endian -- ------------------------------------------------------------ -- | -- UTF-8 to Unicode conversion with deletion of leading byte order mark, as described in XML standard F.1 utf8ToUnicode :: DecodingFct utf8ToUnicode ('\xEF':'\xBB':'\xBF':s) -- remove byte order mark ( XML standard F.1 ) = decodeUtf8 s utf8ToUnicode s = decodeUtf8 s utf8ToUnicodeEmbedErrors :: DecodingFctEmbedErrors utf8ToUnicodeEmbedErrors ('\xEF':'\xBB':'\xBF':s) -- remove byte order mark ( XML standard F.1 ) = decodeUtf8EmbedErrors s utf8ToUnicodeEmbedErrors s = decodeUtf8EmbedErrors s -- ------------------------------------------------------------ -- | -- UTF-16 big endian to UTF-8 conversion with removal of byte order mark utf16beToUnicode :: String -> UString utf16beToUnicode ('\xFE':'\xFF':s) -- remove byte order mark = ucs2BigEndianToUnicode s utf16beToUnicode s = ucs2BigEndianToUnicode s -- ------------------------------------------------------------ -- | -- UTF-16 little endian to UTF-8 conversion with removal of byte order mark utf16leToUnicode :: String -> UString utf16leToUnicode ('\xFF':'\xFE':s) -- remove byte order mark = ucs2LittleEndianToUnicode s utf16leToUnicode s = ucs2LittleEndianToUnicode s -- ------------------------------------------------------------ -- | -- substitute all Unicode characters, that are not legal 1-byte -- UTF-8 XML characters by a character reference. -- -- This function can be used to translate all text nodes and -- attribute values into pure ascii. -- -- see also : 'unicodeToLatin1' unicodeToXmlEntity :: UString -> String unicodeToXmlEntity = escape isXml1ByteChar (intToCharRef . fromEnum) -- | -- substitute all Unicode characters, that are not legal latin1 -- UTF-8 XML characters by a character reference. -- -- This function can be used to translate all text nodes and -- attribute values into ISO latin1. -- -- see also : 'unicodeToXmlEntity' unicodeToLatin1 :: UString -> String unicodeToLatin1 = escape isXmlLatin1Char (intToCharRef . fromEnum) -- | -- substitute selected characters -- The @check@ function returns 'True' whenever a character needs to substitution -- The function @esc@ computes a substitute. escape :: (Unicode -> Bool) -> (Unicode -> String) -> UString -> String escape check esc = concatMap (\uc -> if check uc then [uc] else esc uc) -- | -- removes all non ascii chars, may be used to transform -- a document into a pure ascii representation by removing -- all non ascii chars from tag and attibute names -- -- see also : 'unicodeRemoveNoneLatin1', 'unicodeToXmlEntity' unicodeRemoveNoneAscii :: UString -> String unicodeRemoveNoneAscii = filter isXml1ByteChar -- | -- removes all non latin1 chars, may be used to transform -- a document into a pure ascii representation by removing -- all non ascii chars from tag and attibute names -- -- see also : 'unicodeRemoveNoneAscii', 'unicodeToLatin1' unicodeRemoveNoneLatin1 :: UString -> String unicodeRemoveNoneLatin1 = filter isXmlLatin1Char -- ------------------------------------------------------------ -- | -- convert an Unicode into a XML character reference. -- -- see also : 'intToCharRefHex' intToCharRef :: Int -> String intToCharRef i = "&#" ++ show i ++ ";" -- | -- convert an Unicode into a XML hexadecimal character reference. -- -- see also: 'intToCharRef' intToCharRefHex :: Int -> String intToCharRefHex i = "&#x" ++ h2 ++ ";" where h1 = intToHexString i h2 = if length h1 `mod` 2 == 1 then '0': h1 else h1 -- ------------------------------------------------------------ intToHexString :: Int -> String intToHexString i | i == 0 = "0" | i > 0 = intToStr i | otherwise = error ("intToHexString: negative argument " ++ show i) where intToStr 0 = "" intToStr i' = intToStr (i' `div` 16) ++ [fourBitsToChar (i' `mod` 16)] fourBitsToChar :: Int -> Char fourBitsToChar i = "0123456789ABCDEF" !! i {-# INLINE fourBitsToChar #-} -- ------------------------------------------------------------ -- -- | White Space (XML Standard 2.3) and -- end of line handling (2.11) -- -- \#x0D and \#x0D\#x0A are mapped to \#x0A normalizeNL :: String -> String normalizeNL ('\r' : '\n' : rest) = '\n' : normalizeNL rest normalizeNL ('\r' : rest) = '\n' : normalizeNL rest normalizeNL (c : rest) = c : normalizeNL rest normalizeNL [] = [] -- ------------------------------------------------------------ -- | -- the table of supported character encoding schemes and the associated -- conversion functions into Unicode:q {- This table could be derived from decodingTableEither, but this way it is certainly more efficient. -} decodingTable :: [(String, DecodingFct)] decodingTable = [ (utf8, utf8ToUnicode ) , (isoLatin1, liftDecFct latin1ToUnicode ) , (usAscii, decodeAscii ) , (ucs2, liftDecFct ucs2ToUnicode ) , (utf16, liftDecFct ucs2ToUnicode ) , (utf16be, liftDecFct utf16beToUnicode ) , (utf16le, liftDecFct utf16leToUnicode ) , (iso8859_2, liftDecFct (latinToUnicode iso_8859_2) ) , (iso8859_3, liftDecFct (latinToUnicode iso_8859_3) ) , (iso8859_4, liftDecFct (latinToUnicode iso_8859_4) ) , (iso8859_5, liftDecFct (latinToUnicode iso_8859_5) ) , (iso8859_6, liftDecFct (latinToUnicode iso_8859_6) ) , (iso8859_7, liftDecFct (latinToUnicode iso_8859_7) ) , (iso8859_8, liftDecFct (latinToUnicode iso_8859_8) ) , (iso8859_9, liftDecFct (latinToUnicode iso_8859_9) ) , (iso8859_10, liftDecFct (latinToUnicode iso_8859_10) ) , (iso8859_11, liftDecFct (latinToUnicode iso_8859_11) ) , (iso8859_13, liftDecFct (latinToUnicode iso_8859_13) ) , (iso8859_14, liftDecFct (latinToUnicode iso_8859_14) ) , (iso8859_15, liftDecFct (latinToUnicode iso_8859_15) ) , (iso8859_16, liftDecFct (latinToUnicode iso_8859_16) ) , (unicodeString, liftDecFct id ) , ("", liftDecFct id ) -- default ] where liftDecFct df = \ s -> (df s, []) -- | -- the lookup function for selecting the decoding function getDecodingFct :: String -> Maybe DecodingFct getDecodingFct enc = lookup (map toUpper enc) decodingTable -- | -- Similar to 'decodingTable' but it embeds errors -- in the string of decoded characters. decodingTableEmbedErrors :: [(String, DecodingFctEmbedErrors)] decodingTableEmbedErrors = [ (utf8, utf8ToUnicodeEmbedErrors ) , (isoLatin1, liftDecFct latin1ToUnicode ) , (usAscii, decodeAsciiEmbedErrors ) , (ucs2, liftDecFct ucs2ToUnicode ) , (utf16, liftDecFct ucs2ToUnicode ) , (utf16be, liftDecFct utf16beToUnicode ) , (utf16le, liftDecFct utf16leToUnicode ) , (iso8859_2, liftDecFct (latinToUnicode iso_8859_2) ) , (iso8859_3, liftDecFct (latinToUnicode iso_8859_3) ) , (iso8859_4, liftDecFct (latinToUnicode iso_8859_4) ) , (iso8859_5, liftDecFct (latinToUnicode iso_8859_5) ) , (iso8859_6, liftDecFct (latinToUnicode iso_8859_6) ) , (iso8859_7, liftDecFct (latinToUnicode iso_8859_7) ) , (iso8859_8, liftDecFct (latinToUnicode iso_8859_8) ) , (iso8859_9, liftDecFct (latinToUnicode iso_8859_9) ) , (iso8859_10, liftDecFct (latinToUnicode iso_8859_10) ) , (iso8859_11, liftDecFct (latinToUnicode iso_8859_11) ) , (iso8859_13, liftDecFct (latinToUnicode iso_8859_13) ) , (iso8859_14, liftDecFct (latinToUnicode iso_8859_14) ) , (iso8859_15, liftDecFct (latinToUnicode iso_8859_15) ) , (iso8859_16, liftDecFct (latinToUnicode iso_8859_16) ) , (unicodeString, liftDecFct id ) , ("", liftDecFct id ) -- default ] where liftDecFct df = map Right . df -- | -- the lookup function for selecting the decoding function getDecodingFctEmbedErrors :: String -> Maybe DecodingFctEmbedErrors getDecodingFctEmbedErrors enc = lookup (map toUpper enc) decodingTableEmbedErrors -- | -- the table of supported output encoding schemes and the associated -- conversion functions from Unicode outputEncodingTable :: [(String, (UString -> String))] outputEncodingTable = [ (utf8, unicodeToUtf8 ) , (isoLatin1, unicodeToLatin1 ) , (usAscii, unicodeToXmlEntity ) , (unicodeString, id ) , ("", unicodeToUtf8 ) -- default ] -- | -- the lookup function for selecting the encoding function getOutputEncodingFct :: String -> Maybe (String -> UString) getOutputEncodingFct enc = lookup (map toUpper enc) outputEncodingTable -- ------------------------------------------------------------ -- guessEncoding :: String -> String guessEncoding ('\xFF':'\xFE':'\x00':'\x00':_) = "UCS-4LE" -- with byte order mark guessEncoding ('\xFF':'\xFE':_) = "UTF-16LE" -- with byte order mark guessEncoding ('\xFE':'\xFF':'\x00':'\x00':_) = "UCS-4-3421" -- with byte order mark guessEncoding ('\xFE':'\xFF':_) = "UTF-16BE" -- with byte order mark guessEncoding ('\xEF':'\xBB':'\xBF':_) = utf8 -- with byte order mark guessEncoding ('\x00':'\x00':'\xFE':'\xFF':_) = "UCS-4BE" -- with byte order mark guessEncoding ('\x00':'\x00':'\xFF':'\xFE':_) = "UCS-4-2143" -- with byte order mark guessEncoding ('\x00':'\x00':'\x00':'\x3C':_) = "UCS-4BE" -- "<" of " (b,a) swap (x,y) = (y,x) {-# INLINE swap #-} partitionEither :: [Either a b] -> ([a], [b]) partitionEither = foldr (\x ~(ls,rs) -> either (\l -> (l:ls,rs)) (\r -> (ls,r:rs)) x) ([],[]) {-# INLINE partitionEither #-} -- ------------------------------------------------------------ -- output encoding for bytestrings -- | -- the table of supported output encoding schemes and the associated -- conversion functions from Unicode type StringFct = String -> String outputEncodingTable' :: [(String, (Char -> StringFct))] outputEncodingTable' = [ (utf8, unicodeCharToUtf8' ) , (isoLatin1, unicodeCharToLatin1' ) , (usAscii, unicodeCharToXmlEntity' ) , ("", unicodeCharToUtf8' ) -- default ] -- | -- the lookup function for selecting the encoding function getOutputEncodingFct' :: String -> Maybe (Char -> StringFct) getOutputEncodingFct' enc = lookup (map toUpper enc) outputEncodingTable' -- ------------------------------------------------------------ -- | -- conversion from Unicode (Char) to a UTF8 encoded string. unicodeCharToUtf8' :: Char -> StringFct unicodeCharToUtf8' c | i >= 0 && i <= 0x0000007F -- 1 byte UTF8 (7 bits) = (c :) | i >= 0x00000080 && i <= 0x000007FF -- 2 byte UTF8 (5 + 6 bits) = ((toEnum (0xC0 + i `div` 0x40) ) :) . ((toEnum (0x80 + i `mod` 0x40)) :) | i >= 0x00000800 && i <= 0x0000FFFF -- 3 byte UTF8 (4 + 6 + 6 bits) = ((toEnum (0xE0 + i `div` 0x1000) ) :) . ((toEnum (0x80 + (i `div` 0x40) `mod` 0x40)) :) . ((toEnum (0x80 + i `mod` 0x40)) :) | i >= 0x00010000 && i <= 0x001FFFFF -- 4 byte UTF8 (3 + 6 + 6 + 6 bits) -- extension to encode 21 bit values = ((toEnum (0xF0 + i `div` 0x40000) ) :) . ((toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)) :) . ((toEnum (0x80 + (i `div` 0x40) `mod` 0x40)) :) . ((toEnum (0x80 + i `mod` 0x40)) :) | i >= 0x00200000 && i <= 0x03FFFFFF -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits) -- extension to encode 26 bit values = ((toEnum (0xF8 + i `div` 0x1000000) ) :) . ((toEnum (0x80 + (i `div` 0x40000) `mod` 0x40)) :) . ((toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)) :) . ((toEnum (0x80 + (i `div` 0x40) `mod` 0x40)) :) . ((toEnum (0x80 + i `mod` 0x40)) :) | i >= 0x04000000 && i <= 0x7FFFFFFF -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits) -- extension to encode 31 bit values = ((toEnum (0xFC + i `div` 0x40000000) ) :) . ((toEnum (0x80 + (i `div` 0x1000000) `mod` 0x40)) :) . ((toEnum (0x80 + (i `div` 0x40000) `mod` 0x40)) :) . ((toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)) :) . ((toEnum (0x80 + (i `div` 0x40) `mod` 0x40)) :) . ((toEnum (0x80 + i `mod` 0x40)) :) | otherwise -- other values not supported = error ("unicodeCharToUtf8: illegal integer argument " ++ show i) where i = fromEnum c -- ------------------------------------------------------------ -- | -- substitute all Unicode characters, that are not legal 1-byte -- UTF-8 XML characters by a character reference. unicodeCharToXmlEntity' :: Char -> StringFct unicodeCharToXmlEntity' c | isXml1ByteChar c = (c :) | otherwise = ((intToCharRef . fromEnum $ c) ++) -- ------------------------------------------------------------ -- | -- substitute all Unicode characters, that are not legal latin1 -- UTF-8 XML characters by a character reference. unicodeCharToLatin1' :: Char -> StringFct unicodeCharToLatin1' c | isXmlLatin1Char c = (c :) | otherwise = ((intToCharRef . fromEnum $ c) ++) -- ------------------------------------------------------------