module Data.CompactString.Encodings (
UTF8(..),
BE(..), LE(..), Native,
UTF16(..), UTF16BE, UTF16LE, UTF16Native,
UTF32(..), UTF32BE, UTF32LE, UTF32Native,
ASCII(..),
Latin1(..),
Compact(..)
) where
import Data.Word
import Data.Bits
import Foreign.Ptr (Ptr)
import Control.Monad (when, unless)
import Data.CompactString.Internal
data UTF8 = UTF8
instance Encoding UTF8 where
pokeCharFun _ c = case ord c of
x | x < 0x80 -> (1, \p -> poke p $ fromIntegral x )
| x < 0x800 -> (2, \p -> do poke p $ fromIntegral (x `shiftR` 6) .|. 0xC0
pokeByteOff p 1 $ (fromIntegral x .&. 0x3F) .|. 0x80)
| x < 0x10000 -> (3, \p -> do poke p $ fromIntegral (x `shiftR` 12) .|. 0xE0
pokeByteOff p 1 $ (fromIntegral (x `shiftR` 6) .&. 0x3F) .|. 0x80
pokeByteOff p 2 $ (fromIntegral x .&. 0x3F) .|. 0x80)
| otherwise -> (4, \p -> do poke p $ fromIntegral (x `shiftR` 18) .|. 0xF0
pokeByteOff p 1 $ (fromIntegral (x `shiftR` 12) .&. 0x3F) .|. 0x80
pokeByteOff p 2 $ (fromIntegral (x `shiftR` 6) .&. 0x3F) .|. 0x80
pokeByteOff p 3 $ (fromIntegral x .&. 0x3F) .|. 0x80)
peekChar _ p = do
a <- peek p
case charLenUTF8 a of
1 -> return (1, decodeUTF8_1 a)
2 -> do b <- peekByteOff p 1
return (2, decodeUTF8_2 a b)
3 -> do b <- peekByteOff p 1
c <- peekByteOff p 2
return (3, decodeUTF8_3 a b c)
_ -> do b <- peekByteOff p 1
c <- peekByteOff p 2
d <- peekByteOff p 3
return (4, decodeUTF8_4 a b c d)
peekCharLen _ p = do
a <- peek p
return (charLenUTF8 a)
peekCharRev _ p =
peek p >>= \a -> if not (a `testBit` 7) then return (1, decodeUTF8_1 a) else
peekByteOff p (1) >>= \b -> if (b `testBit` 6) then return (2, decodeUTF8_2 b a) else
peekByteOff p (2) >>= \c -> if (c `testBit` 6) then return (3, decodeUTF8_3 c b a) else
peekByteOff p (3) >>= \d -> return (4, decodeUTF8_4 d c b a)
peekCharLenRev _ p =
peek p >>= \a -> if not (a `testBit` 7) then return 1 else
peekByteOff p (1) >>= \b -> if (b `testBit` 6) then return 2 else
peekByteOff p (2) >>= \c -> if (c `testBit` 6) then return 3 else
return 4
peekCharSafe _ l p = do
a <- peek p
case () of
_ | not (a `testBit` 7) -> returnChr 1 (fromIntegral a)
| not (a `testBit` 6) -> failMessage "decode UTF8" "Invalid UTF8"
| not (a `testBit` 5) -> do require 2
b <- peekMore 1
let x = ((fromIntegral a .&. 0x1F) `shiftL` 6) .|. b
tooShort (x < 0x80)
returnChr 2 x
| not (a `testBit` 4) -> do require 3
b <- peekMore 1
c <- peekMore 2
let x = ((fromIntegral a .&. 0x0F) `shiftL` 12) .|. (b `shiftL` 6) .|. c
tooShort (x < 0x800)
returnChr 3 x
| not (a `testBit` 3) -> do require 4
b <- peekMore 1
c <- peekMore 2
d <- peekMore 3
let x = ((fromIntegral a .&. 0x07) `shiftL` 18) .|. (b `shiftL` 12) .|. (c `shiftL` 6) .|. d
tooShort (x < 0x10000)
returnChr 4 x
| otherwise -> failMessage "decode UTF8" "Invalid UTF8"
where peekMore off = do x <- peekByteOff p off
when (x < 0x80 || x > 0xBF) (failMessage "decode UTF8" "Invalid UTF8")
return $ fromIntegral (x .&. 0x3F)
require len = when (l < len) (failMessage "decode UTF8" "Not enough input")
tooShort b = when b (failMessage "decode UTF8" "Shorter encoding possible")
copyChar enc src dst = do
l <- peekCharLen enc src
memcpy dst src (fromIntegral l)
return l
copyCharRev enc src dst = do
l <- peekCharLenRev enc src
memcpy dst (src `plusPtr` (1 l)) (fromIntegral l)
return l
containsASCII _ = True
validOrdering _ = True
validSubstring _ = True
byteCount _ n = 4 * n
charLenUTF8 :: Word8 -> Int
charLenUTF8 x
| not (x `testBit` 7) = 1
| not (x `testBit` 5) = 2
| not (x `testBit` 4) = 3
| otherwise = 4
decodeUTF8_1 :: Word8 -> Char
decodeUTF8_1 a
= unsafeChr $ (fromIntegral a)
decodeUTF8_2 :: Word8 -> Word8 -> Char
decodeUTF8_2 a b
= unsafeChr $ (fromIntegral a `shiftL` 6)
`xor` (fromIntegral b)
`xor` 0x3080
decodeUTF8_3 :: Word8 -> Word8 -> Word8 -> Char
decodeUTF8_3 a b c
= unsafeChr $ (fromIntegral a `shiftL` 12)
`xor` (fromIntegral b `shiftL` 6)
`xor` (fromIntegral c)
`xor` 0xE2080
decodeUTF8_4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
decodeUTF8_4 a b c d
= unsafeChr $ (fromIntegral a `shiftL` 18)
`xor` (fromIntegral b `shiftL` 12)
`xor` (fromIntegral c `shiftL` 6)
`xor` (fromIntegral d)
`xor` 0x3C82080
data BE = BE
data LE = LE
type Native = LE
class Endian e where
peekWord16 :: Proxy (t e) -> Ptr Word8 -> IO Word16
pokeWord16 :: Proxy (t e) -> Ptr Word8 -> Word16 -> IO ()
peekWord32 :: Proxy (t e) -> Ptr Word8 -> IO Word32
pokeWord32 :: Proxy (t e) -> Ptr Word8 -> Word32 -> IO ()
isBE :: Proxy (t e) -> Bool
instance Endian BE where
peekWord16 _ ptr = do
a <- peek ptr
b <- peekByteOff ptr 1
return ((fromIntegral a `shiftL` 8) .|. fromIntegral b)
pokeWord16 _ ptr x = do
poke ptr $ fromIntegral (x `shiftR` 8)
pokeByteOff ptr 1 $ fromIntegral x
peekWord32 _ ptr = do
a <- peek ptr
b <- peekByteOff ptr 1
c <- peekByteOff ptr 2
d <- peekByteOff ptr 3
return $ (fromIntegral a `shiftL` 24) .|.
(fromIntegral b `shiftL` 16) .|.
(fromIntegral c `shiftL` 8 ) .|.
(fromIntegral d )
pokeWord32 _ ptr x = do
poke ptr $ fromIntegral (x `shiftR` 24)
pokeByteOff ptr 1 $ fromIntegral (x `shiftR` 16)
pokeByteOff ptr 2 $ fromIntegral (x `shiftR` 8)
pokeByteOff ptr 3 $ fromIntegral x
isBE _ = True
instance Endian LE where
peekWord16 _ ptr = do
a <- peek ptr
b <- peekByteOff ptr 1
return ((fromIntegral b `shiftL` 8) .|. fromIntegral a)
pokeWord16 _ ptr x = do
poke ptr $ fromIntegral x
pokeByteOff ptr 1 $ fromIntegral (x `shiftR` 8)
peekWord32 _ ptr = do
a <- peek ptr
b <- peekByteOff ptr 1
c <- peekByteOff ptr 2
d <- peekByteOff ptr 3
return $ (fromIntegral d `shiftL` 24) .|.
(fromIntegral c `shiftL` 16) .|.
(fromIntegral b `shiftL` 8 ) .|.
(fromIntegral a )
pokeWord32 _ ptr x = do
poke ptr $ fromIntegral x
pokeByteOff ptr 1 $ fromIntegral (x `shiftR` 8)
pokeByteOff ptr 2 $ fromIntegral (x `shiftR` 16)
pokeByteOff ptr 3 $ fromIntegral (x `shiftR` 24)
isBE _ = False
data UTF16 endianness = UTF16 endianness
type UTF16BE = UTF16 BE
type UTF16LE = UTF16 LE
type UTF16Native = UTF16 Native
instance Endian e => Encoding (UTF16 e) where
pokeCharFun enc c
| isSurrogate x = (2, \_ -> failMessage "encoding UTF16" "Surrogate character")
| x <= 0xFFFF = (2, \p -> pokeWord16 enc p $ fromIntegral x)
| otherwise = (4, \p -> pokeWord16 enc p h
>> pokeWord16 enc (p `plusPtr` 2) l)
where x = ord c
x' = x 0x10000
h = fromIntegral $ (x' `shiftR` 10) .|. 0xD800
l = fromIntegral $ (x' .&. 0x3FF) .|. 0xDC00
pokeCharLen _ c = if ord c <= 0xFFFF then 2 else 4
peekChar enc p = do a <- peekWord16 enc p
if isSurrogate a
then do
b <- peekWord16 enc (p `plusPtr` 2)
decodeUTF16 a b
else
return (2, unsafeChr $ fromIntegral a)
peekCharRev enc p = do a <- peekWord16 enc (p `plusPtr` (1))
if isSurrogate a
then do
b <- peekWord16 enc (p `plusPtr` (3))
decodeUTF16 b a
else
return (2, unsafeChr $ fromIntegral a)
peekCharLen enc p = do a <- peekWord16 enc p
return (if isSurrogate a then 4 else 2)
peekCharLenRev enc p = do a <- peekWord16 enc (p `plusPtr` (1))
return (if isSurrogate a then 4 else 2)
peekCharSafe enc l p = do a <- peekWord16 enc p
if isSurrogate a
then do
when (l < 4) (failMessage "decode UTF16" "Not enough input")
b <- peekWord16 enc (p `plusPtr` 2)
decodeUTF16Safe a b
else
return (2, unsafeChr $ fromIntegral a)
validateLength _ len
| len `mod` 2 /= 0 = failMessage "decode UTF16" "Not enough input"
| otherwise = return ()
containsASCII _ = False
validEquality _ = False
validOrdering _ = False
validSubstring _ = False
charCount _ n = n `div` 2
byteCount _ n = n * 4
newSize _ n = n * 2
isSurrogate :: (Num a, Ord a) => a -> Bool
isSurrogate c = c >= 0xD800 && c <= 0xDFFF
decodeUTF16 :: Word16 -> Word16 -> IO (Int,Char)
decodeUTF16 a b
| a <= 0xDBFF = return (4, unsafeChr $ (((fromIntegral a .&. 0x3FF) `shiftL` 10) .|. (fromIntegral b .&. 0x3FF)) + 0x10000)
| otherwise = return (4, unsafeChr $ (((fromIntegral b .&. 0x3FF) `shiftL` 10) .|. (fromIntegral a .&. 0x3FF)) + 0x10000)
decodeUTF16Safe :: Word16 -> Word16 -> IO (Int,Char)
decodeUTF16Safe a b
| ahi && blo = returnChr 4 $ (((fromIntegral a .&. 0x3FF) `shiftL` 10) .|. (fromIntegral b .&. 0x3FF)) + 0x10000
| alo && bhi = returnChr 4 $ (((fromIntegral b .&. 0x3FF) `shiftL` 10) .|. (fromIntegral a .&. 0x3FF)) + 0x10000
| otherwise = failMessage "decode UTF16" "Unpaired surrogate"
where ahi = a >= 0xD800 && a <= 0xDBFF
alo = a >= 0xDC00 && a <= 0xDFFF
bhi = b >= 0xD800 && b <= 0xDBFF
blo = b >= 0xDC00 && b <= 0xDFFF
data UTF32 endianness = UTF32 endianness
type UTF32BE = UTF32 BE
type UTF32LE = UTF32 LE
type UTF32Native = UTF32 Native
instance Endian e => Encoding (UTF32 e) where
pokeCharFun enc c = (4, \p -> pokeWord32 enc p (fromIntegral (ord c)))
pokeCharLen _ _ = 4
peekChar enc p = do a <- peekWord32 enc p
return (4, unsafeChr $ fromIntegral a)
peekCharRev enc p = do a <- peekWord32 enc (p `plusPtr` (3))
return (4, unsafeChr $ fromIntegral a)
peekCharLen _ _ = return 4
peekCharLenRev _ _ = return 4
peekCharSafe enc _ p = do a <- peekWord32 enc p
returnChr 4 a
validateLength _ len
| len `mod` 4 /= 0 = failMessage "decode UTF32" "Not enough input"
| otherwise = return ()
containsASCII _ = False
validOrdering enc = isBE enc
validSubstring _ = True
charCount _ n = n `div` 4
byteCount _ n = n * 4
newSize _ n = n
data ASCII = ASCII
instance Encoding ASCII where
pokeCharFun _ c
| c <= '\x7F' = (1, \p -> poke p (fromIntegral (ord c)))
| otherwise = (1, \_ -> failMessage "encode ASCII" "Not an ASCII character")
pokeCharLen _ _ = 1
peekChar _ p = do a <- peek p
return (1, unsafeChr $ fromIntegral a)
peekCharRev _ p = do a <- peek p
return (1, unsafeChr $ fromIntegral a)
peekCharLen _ _ = return 1
peekCharLenRev _ _ = return 1
peekCharSafe _ _ p = do a <- peek p
unless (a <= 0x7F) (failMessage "decode ASCII" "Not an ASCII character")
returnChr 1 (fromIntegral a)
containsASCII _ = True
validOrdering _ = True
validSubstring _ = True
charCount _ n = n
byteCount _ n = n
newSize _ n = n
data Latin1 = Latin1
instance Encoding Latin1 where
pokeCharFun _ c
| c <= '\xFF' = (1, \p -> poke p (fromIntegral (ord c)))
| otherwise = (1, \_ -> failMessage "encode Latin1" "Not a Latin1 character")
pokeCharLen _ _ = 1
peekChar _ p = do a <- peek p
return (1, unsafeChr $ fromIntegral a)
peekCharRev _ p = do a <- peek p
return (1, unsafeChr $ fromIntegral a)
peekCharLen _ _ = return 1
peekCharLenRev _ _ = return 1
peekCharSafe _ _ p = do a <- peek p
unless (a <= 0xFF) (failMessage "decode Latin1" "Not a Latin1 character")
returnChr 1 (fromIntegral a)
containsASCII _ = True
validOrdering _ = True
validSubstring _ = True
charCount _ n = n
byteCount _ n = n
newSize _ n = n
data Compact = Compact
instance Encoding Compact where
pokeCharFun _ c = case ord c of
x | x < 0x80 -> (1, \p -> poke p $ fromIntegral x )
| x < 0x4000 -> (2, \p -> do poke p $ fromIntegral (x `shiftR` 7) .|. 0x80
pokeByteOff p 1 $ fromIntegral x .|. 0x80 )
| otherwise -> (3, \p -> do poke p $ fromIntegral (x `shiftR` 14) .|. 0x80
pokeByteOff p 1 $ fromIntegral (x `shiftR` 7) .&. 0x7F
pokeByteOff p 2 $ fromIntegral x .|. 0x80 )
pokeCharLen _ c = case ord c of
x | x < 0x80 -> 1
| x < 0x4000 -> 2
| otherwise -> 3
peekChar _ p = do
aw <- peek p
let a = fromIntegral aw
if a `testBit` 7
then do
bw <- peekByteOff p 1
let b = fromIntegral bw
if b `testBit` 7
then
return (2, unsafeChr $ (a `shiftL` 7) `xor` b `xor` 0x4080 )
else do
cw <- peekByteOff p 2
let c = fromIntegral cw
return (3, unsafeChr $ (a `shiftL` 14) `xor` (b `shiftL` 7) `xor` c `xor` 0x200080 )
else return (1, unsafeChr a)
peekCharLen _ p = do
a <- peek p
if a `testBit` 7
then do
b <- peekByteOff p 1
return (if b `testBit` 7 then 2 else 3)
else return 1
peekCharRev _ p = do
cw <- peek p
let c = fromIntegral cw
if c `testBit` 7
then do
bw <- peekByteOff p (1)
let b = fromIntegral bw
if b `testBit` 7
then
return (2, unsafeChr $ (b `shiftL` 7) `xor` c `xor` 0x4080 )
else do
aw <- peekByteOff p (2)
let a = fromIntegral aw
return (3, unsafeChr $ (a `shiftL` 14) `xor` (b `shiftL` 7) `xor` c `xor` 0x200080 )
else return (1, unsafeChr c)
peekCharLenRev _ p = do
a <- peek p
if a `testBit` 7
then do
b <- peekByteOff p (1)
return (if b `testBit` 7 then 2 else 3)
else return 1
peekCharSafe _ l p = do
aw <- peek p
let a = fromIntegral aw
if a `testBit` 7
then do
require 2
bw <- peekByteOff p 1
let b = fromIntegral bw
if b `testBit` 7
then do
let x = ((a `shiftL` 7) `xor` b `xor` 0x4080)
tooShort (x < 0x80)
returnChr 2 x
else do
require 3
cw <- peekByteOff p 2
let c = fromIntegral cw
unless (c `testBit` 7) (failMessage "decode Compact" "Invalid byte sequence")
let x = ((a `shiftL` 14) `xor` (b `shiftL` 7) `xor` c `xor` 0x200080)
tooShort (x < 0x4000)
returnChr 3 x
else returnChr 1 a
where require len = when (l < len) (failMessage "decode Compact" "Not enough input")
tooShort b = when b (failMessage "decode UTF8" "Shorter encoding possible")
copyChar _ src dst = do
aw <- peek src
poke dst aw
if aw `testBit` 7
then do
bw <- peekByteOff src 1
pokeByteOff dst 1 bw
if bw `testBit` 7
then do
return 2
else do
peekByteOff src 2 >>= pokeByteOff dst 2
return 3
else do
return 1
copyCharRev _ src dst = do
aw <- peek src
if aw `testBit` 7
then do
bw <- peekByteOff src (1)
if bw `testBit` 7
then do
poke dst bw
pokeByteOff dst 1 aw
return 2
else do
cw <- peekByteOff src (2)
poke dst cw
pokeByteOff dst 1 bw
pokeByteOff dst 2 aw
return 3
else do
poke dst aw
return 1
containsASCII _ = True
validOrdering _ = False
validSubstring _ = False
byteCount _ n = 3 * n