{-# OPTIONS_GHC -cpp #-} {-# LANGUAGE CPP #-} -- | -- Module : Foreign.C.UTF8 -- Copyright : (c) 2004 John Meacham, Alistair Bayley -- License : BSD-style -- Maintainer : alistair@abayley.org -- Stability : experimental -- Portability : portable -- -- Marshall Haskell Strings to and from UTF8-encoded CStrings. -- This module's code is inspired by John Meacham's UTF8 en- & de-coders, -- and also those found in the HXT library (module Text.XML.HXT.DOM.Unicode). -- -- Note that the -Len functions all return the length in bytes, -- not Chars (this is more useful, as you are most likely to want -- to pass the length to an FFI function, which is most likely -- expecting the length in bytes). If you want the length in Chars, -- well, you have the original String, so... -- This module has been reasonablt optimised. You can check GHC's -- simplifier output (for unboxing, mainly) with this: -- ghc -O2 -c UTF8.hs -ddump-simpl > simpl.txt module Foreign.C.UTF8 ( peekUTF8String, peekUTF8StringLen , newUTF8String, withUTF8String, withUTF8StringLen , toUTF8String, fromUTF8String , lengthUTF8, fromUTF8, toUTF8 ) where import Control.Monad (when, liftM) import Data.Bits import Data.Char import Data.Word (Word8) import Foreign.C.Types import Foreign.C.String import Foreign.Ptr import Foreign.Marshal.Array import Foreign.Storable #ifdef __GLASGOW_HASKELL__ import GHC.Base (unsafeChr) #else unsafeChr :: Int -> Char unsafeChr i = chr i #endif nullCChar :: CChar nullCChar = 0 nullByte :: Word8 nullByte = 0 -- | Analogous to peekCString. Converts UTF8 CString to String. peekUTF8String :: CString -> IO String peekUTF8String cs = fromUTF8Ptr0 (castPtr cs) --peekUTF8String cs = peekArray0 nullByte (castPtr cs) >>= return . fromUTF8 -- | Analogous to peekCStringLen. Converts UTF8 CString to String. -- The resulting String will end either when @len@ bytes -- have been converted, or when a NULL is found. peekUTF8StringLen :: CStringLen -> IO String peekUTF8StringLen (cs, len) = fromUTF8Ptr (len-1) (castPtr cs) "" --peekUTF8StringLen (cs, len) = peekArray len (castPtr cs) >>= return . fromUTF8 -- | Analogous to newCString. Creates UTF8 encoded CString. newUTF8String :: String -> IO CString newUTF8String hs = do p <- newArray0 nullByte (toUTF8 hs) return (castPtr p) -- | Analogous to newCStringLen. -- The length returned is in bytes (encoding units), not chars. newUTF8StringLen :: String -> IO CStringLen newUTF8StringLen hs = do let utf8 = toUTF8 hs p <- newArray0 nullByte utf8 return (castPtr p, length utf8) -- | Analogous to withCString. Creates UTF8 encoded CString. withUTF8String :: String -> (CString -> IO a) -> IO a withUTF8String s action = withUTF8StringLen s (\(cstr, _) -> action cstr) -- | Analogous to withCStringLen. -- The length returned is in bytes (encoding units), not chars. withUTF8StringLen :: String -> (CStringLen -> IO a) -> IO a withUTF8StringLen s action = do let utf8 = toUTF8 s withArray0 nullByte utf8 (\arr -> action (castPtr arr, length utf8) ) -- | Convert a String that was marshalled from a CString without -- any decoder applied. This might be useful if the client encoding -- is unknown, and the user code must convert. -- We assume that the UTF8 CString was marshalled as if Latin-1 -- i.e. all chars are in the range 0-255. fromUTF8String :: String -> String fromUTF8String = fromUTF8 . map charToWord8 charToWord8 :: Char -> Word8 charToWord8 = fromIntegral . fromEnum -- | Convert a Haskell String into a UTF8 String, where each UTF8 byte -- is represented by its Char equivalent i.e. only chars 0-255 are used. -- The resulting String can be marshalled to CString directly i.e. with -- a Latin-1 encoding. toUTF8String :: String -> String toUTF8String = map word8ToChar . toUTF8 word8ToChar :: Word8 -> Char word8ToChar = unsafeChr . fromIntegral lengthUTF8 :: String -> Int lengthUTF8 s = length (toUTF8 s) {- The codepoint-to-UTF8 rules: 0x00 - 0x7f: 7 bits: as is 0x80 - 0x07ff: 11 bits byte 1: 0xC0 OR ((x << 6) AND 0x1F) i.e. 0xC0 + bits 7-11 (bits 12-up are 0) byte 2: 0x80 OR (x AND 0x3F) i.e. 0x80 + bits 1-6 0x0800 - 0xFFFF: 16 bits byte 1: 0xE0 OR ((x << 12) AND 0x0F) i.e. 0xE0 + bits 13-16 byte 2: 0x80 OR ((x << 6) AND 0x3F) i.e. 0x80 + bits 7-12 byte 3: 0x80 OR (x AND 0x3F) i.e. 0x80 + bits 1-6 0x00010000 - 0x001FFFFF: 21 bits byte 1: 0xF0 OR ((x << 18) AND 0x07) i.e. 0xF0 + bits 19-21 byte 2: 0x80 OR ((x << 12) AND 0x3F) i.e. 0x80 + bits 13-18 byte 3: 0x80 OR ((x << 6) AND 0x3F) i.e. 0x80 + bits 7-12 byte 4: 0x80 OR (x AND 0x3F) i.e. 0x80 + bits 1-6 0x00200000 - 0x03FFFFFF: 26 bits byte 1: 0xF8 OR ((x << 24) AND 0x03) i.e. 0xF8 + bits 25-26 byte 2: 0x80 OR ((x << 18) AND 0x3F) i.e. 0x80 + bits 19-24 byte 3: 0x80 OR ((x << 12) AND 0x3F) i.e. 0x80 + bits 13-18 byte 4: 0x80 OR ((x << 6) AND 0x3F) i.e. 0x80 + bits 7-12 byte 5: 0x80 OR (x AND 0x3F) i.e. 0x80 + bits 1-6 0x04000000 - 0x7FFFFFFF: 31 bits byte 1: 0xFC OR ((x << 30) AND 0x01) i.e. 0xFC + bit 31 byte 2: 0x80 OR ((x << 24) AND 0x3F) i.e. 0x80 + bits 25-30 byte 3: 0x80 OR ((x << 18) AND 0x3F) i.e. 0x80 + bits 19-24 byte 4: 0x80 OR ((x << 12) AND 0x3F) i.e. 0x80 + bits 13-18 byte 5: 0x80 OR ((x << 6) AND 0x3F) i.e. 0x80 + bits 7-12 byte 6: 0x80 OR (x AND 0x3F) i.e. 0x80 + bits 1-6 -} -- | Convert Unicode characters to UTF-8. toUTF8 :: String -> [Word8] toUTF8 [] = [] toUTF8 (x:xs) = toUTF8' (ord x) where toUTF8' x | x <= 0x0000007F = fromIntegral x : more | x <= 0x000007FF = w8 0xC0 6 : w8 0x80 0 : more | x <= 0x0000FFFF = w8 0xE0 12 : w8 0x80 6 : w8 0x80 0 : more -- If we want to encode chars > 1114111 then this test should be -- x <= 0x001FFFFF -- because that's the upper limit of the 4-byte encoding -- (and the 5- and 6-byte cases below might also be enabled). | x <= 0x0010FFFF = w8 0xF0 18 : w8 0x80 12 : w8 0x80 6 : w8 0x80 0 : more | otherwise = error ("toUTF8: codepoint " ++ show x ++ " is greater than the largest allowed (decimal 1114111, hex 0x10FFFF).") {- -- Potentially useful code, if Haskell ever supports codepoints > 0x0010FFFF. -- There are no tests for this, because we can't create Strings containing -- chars > 0x0010FFFF. | x <= 0x03FFFFFF = w8 0xF8 24 : w8 0x80 18 : w8 0x80 12 : w8 0x80 6 : w8 0x80 0 : more | x <= 0x7FFFFFFF = w8 0xFC 30 : w8 0x80 24 : w8 0x80 18 : w8 0x80 12 : w8 0x80 6 : w8 0x80 0 : more | otherwise = error ("toUTF8: codepoint " ++ show x ++ " is greater " ++ "then the largest that can be represented by UTF8 encoding" ++ "(decimal 2147483647, hex 0x7FFFFFFF).") -} where more = toUTF8 xs w8 :: Word8 -> Int -> Word8 w8 base rshift = base .|. (fromIntegral (shiftR x rshift) .&. mask) where mask | base == 0x80 = 0x3F | base == 0xC0 = 0x1F | base == 0xE0 = 0x0F | base == 0xF0 = 0x07 | base == 0xF8 = 0x03 | base == 0xFC = 0x01 {- And the rules for UTF8-to-codepoint: examine first byte: 0x00-0x7F: 1 byte: as-is 0x80-0xBF: error (surrogate) 0xC0-0xDF: 2 bytes: b1 AND 0x1F + remaining 0xE0-0xEF: 3 bytes: b1 AND 0x0F + remaining 0xF0-0xF7: 4 bytes: b1 AND 0x07 + remaining 0xF8-0xFB: 5 bytes: b1 AND 0x03 + remaining 0xFC-0xFD: 6 bytes: b1 AND 0x01 + remaining 0xFE-0xFF: error (byte-order-mark indicators: UTF8 - EFBBBF, UTF16 - FEFF or FFFE) remaining = lower 6 bits of each byte, concatenated -} -- | Convert UTF-8 to Unicode. fromUTF8 :: [Word8] -> String fromUTF8 [] = "" fromUTF8 (x:xs) | x <= 0x7F = remaining 0 (fromIntegral x) xs | x <= 0xBF = err x | x <= 0xDF = remaining 1 (bAND x 0x1F) xs | x <= 0xEF = remaining 2 (bAND x 0x0F) xs | x <= 0xF7 = remaining 3 (bAND x 0x07) xs | otherwise = err x {- -- Again, only works for chars > 0x0010FFFF, which we can't test. | x <= 0xFB = remaining 4 (bAND x 0x03) xs | x <= 0xFD = remaining 5 (bAND x 0x01) xs | otherwise = err x -} where err x = error ("fromUTF8: illegal UTF-8 character " ++ show x) bAND :: Word8 -> Word8 -> Int bAND x m = fromIntegral (x .&. m) remaining :: Int -> Int -> [Word8] -> String remaining 0 x xs = chr x : fromUTF8 xs remaining n x [] = error "fromUTF8: incomplete UTF8 sequence" remaining n x (b:xs) | b == 0 = err x | otherwise = remaining (n-1) ((shiftL x 6) .|. (bAND b 0x3F)) xs {- This version of fromUTF8Ptr starts at the end of the array and works backwards. This will allow us to create the result String with constant space usage. Contrast this with creating the String by processing the array from start to end: in this case we would probably use an accumulating parameter, and reverse the list when we reach the end of the array. This isn't so bad, if we expect reverse to work in constant space (more or less). -} -- | Convert UTF-8 to Unicode, from a null-terminated C array of bytes. -- This function is useful, in addition to 'fromUTF8' above, -- because it doesn't create an intermediate @[Word8]@ list. fromUTF8Ptr0 :: Ptr Word8 -> IO String fromUTF8Ptr0 p = do len <- lengthArray0 nullByte p --fromUTF8PtrUnboxed (len-1) p "" fromUTF8Ptr (len-1) p "" -- | The bytes parameter should be len-1 -- i.e. if the CString has length 2, then you should pass -- bytes=1. fromUTF8Ptr :: Int -> Ptr Word8 -> String -> IO String fromUTF8Ptr bytes p acc | bytes `seq` p `seq` acc `seq` False = undefined | bytes < 0 = do if null acc then return acc else -- BOM = chr 65279 ( EF BB BF ) if head acc /= chr 65279 then return acc else return (tail acc) | otherwise = do x <- liftM fromIntegral (peekElemOff p bytes) case () of _ | x == 0 -> error ("fromUTF8Ptr: zero byte found in string as position " ++ show bytes) | x <= 0x7F -> fromUTF8Ptr (bytes-1) p (unsafeChr x:acc) | x <= 0xBF && bytes == 0 -> error "fromUTF8Ptr: surrogate at start of string" | x <= 0xBF -> fromUTF8Ptr (bytes-1) p acc | otherwise -> do c <- readUTF8Char x bytes p fromUTF8Ptr (bytes-1) p (c:acc) readUTF8Char :: Int -> Int -> Ptr Word8 -> IO Char readUTF8Char x offset p | x `seq` offset `seq` p `seq` False = undefined | otherwise = case () of _ | x == 0 -> err x | x <= 0x7F -> return (unsafeChr x) | x <= 0xBF -> err x | x <= 0xDF -> do x1 <- liftM fromIntegral (peekElemOff p (offset + 1)) return (unsafeChr ( ((x - 0xC0) * 64) + (x1 - 0x80) )) | x <= 0xEF -> do x1 <- liftM fromIntegral (peekElemOff p (offset + 1)) x2 <- liftM fromIntegral (peekElemOff p (offset + 2)) return (unsafeChr ( ((x - 0xE0) * 4096) + ((x1 - 0x80) * 64) + (x2 - 0x80) )) | x <= 0xF7 -> do x1 <- liftM fromIntegral (peekElemOff p (offset + 1)) x2 <- liftM fromIntegral (peekElemOff p (offset + 2)) x3 <- liftM fromIntegral (peekElemOff p (offset + 3)) return (unsafeChr ( ((x - 0xF0) * 262144) + ((x1 - 0x80) * 4096) + ((x2 - 0x80) * 64) + (x3 - 0x80) )) | otherwise -> err x where err x = error ("readUTF8Char: illegal UTF-8 character " ++ show x)