{-# LANGUAGE DeriveDataTypeable #-} {- | This module implements UTF-32 encoding and decoding. See for more information. -} module Data.Encoding.UTF32 (UTF32(..)) where import Data.Bits import Data.Char (ord,chr) import Data.Encoding.Base import Data.Word import Control.Exception (throwDyn) import Data.Typeable import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as LBS data UTF32 = UTF32 -- ^ Detects big or little endian through the use of the BOM (Byte Order Mask) character. Defaults to big endian if not present. | UTF32BE -- ^ Encodes and decodes using the big endian encoding. | UTF32LE -- ^ Encodes and decodes using the little endian encoding. deriving (Eq,Show,Typeable) bom :: Char bom = '\xFEFF' instance Encoding UTF32 where encode UTF32 str = encodeMultibyte encodeUTF32be (bom:str) encode UTF32LE str = encodeMultibyte encodeUTF32le str encode UTF32BE str = encodeMultibyte encodeUTF32be str encodeLazy UTF32 str = encodeMultibyteLazy encodeUTF32be (bom:str) encodeLazy UTF32LE str = encodeMultibyteLazy encodeUTF32le str encodeLazy UTF32BE str = encodeMultibyteLazy encodeUTF32be str encodable _ c = ord c < 0x0010FFFF decode UTF32 str = let (start,rest) = BS.splitAt 4 str in case BS.unpack start of [0x00,0x00,0xFE,0xFF] -> decode UTF32BE rest [0xFE,0xFF,0x00,0x00] -> decode UTF32LE rest _ -> decode UTF32BE str decode UTF32LE str = decodeMultibyte decodeUTF32le str decode UTF32BE str = decodeMultibyte decodeUTF32be str decodeLazy UTF32 str = let (start,rest) = LBS.splitAt 4 str in case LBS.unpack start of [0x00,0x00,0xFE,0xFF] -> decodeLazy UTF32BE rest [0xFE,0xFF,0x00,0x00] -> decodeLazy UTF32LE rest _ -> decodeLazy UTF32BE str decodeLazy UTF32LE str = decodeMultibyteLazy decodeUTF32le str decodeLazy UTF32BE str = decodeMultibyteLazy decodeUTF32be str encodeUTF32be :: Char -> (Word8,EncodeState) encodeUTF32be ch = let w = ord ch w1 = fromIntegral $ w `shiftR` 24 w2 = fromIntegral $ w `shiftR` 16 w3 = fromIntegral $ w `shiftR` 8 w4 = fromIntegral $ w in (w1,Put3 w2 w3 w4) encodeUTF32le :: Char -> (Word8,EncodeState) encodeUTF32le ch = let w = ord ch w1 = fromIntegral $ w `shiftR` 24 w2 = fromIntegral $ w `shiftR` 16 w3 = fromIntegral $ w `shiftR` 8 w4 = fromIntegral $ w in (w4,Put3 w3 w2 w1) decodeUTF32be :: [Word8] -> (Char,[Word8]) decodeUTF32be (w1:w2:w3:w4:rest) = let v = (fromIntegral w1 `shiftL` 24) .|. (fromIntegral w2 `shiftL` 16) .|. (fromIntegral w3 `shiftL` 8) .|. (fromIntegral w4) in if v < 0x0010FFFF then (chr v,rest) else throwDyn (IllegalRepresentation [w1,w2,w3,w4]) decodeUTF32be _ = throwDyn UnexpectedEnd decodeUTF32le :: [Word8] -> (Char,[Word8]) decodeUTF32le (w1:w2:w3:w4:rest) = let v = (fromIntegral w4 `shiftL` 24) .|. (fromIntegral w3 `shiftL` 16) .|. (fromIntegral w2 `shiftL` 8) .|. (fromIntegral w1) in if v < 0x0010FFFF then (chr v,rest) else throwDyn (IllegalRepresentation [w1,w2,w3,w4]) decodeUTF32le _ = throwDyn UnexpectedEnd