{-# LANGUAGE DeriveDataTypeable #-}
{- | This module implements UTF-8 encoding and decoding as in RFC 3629.
     See <http://en.wikipedia.org/wiki/UTF-8> for more information.
 -}
module Data.Encoding.UTF8 where

import Control.Throws
import Data.Char
import Data.Bits

import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception

import Data.Typeable

data UTF8 = UTF8        -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
          | UTF8Strict  -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding
          deriving (UTF8 -> UTF8 -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: UTF8 -> UTF8 -> Bool
$c/= :: UTF8 -> UTF8 -> Bool
== :: UTF8 -> UTF8 -> Bool
$c== :: UTF8 -> UTF8 -> Bool
Eq,Int -> UTF8 -> ShowS
[UTF8] -> ShowS
UTF8 -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [UTF8] -> ShowS
$cshowList :: [UTF8] -> ShowS
show :: UTF8 -> String
$cshow :: UTF8 -> String
showsPrec :: Int -> UTF8 -> ShowS
$cshowsPrec :: Int -> UTF8 -> ShowS
Show,Typeable)

instance Encoding UTF8 where
    encodeChar :: forall (m :: * -> *). ByteSink m => UTF8 -> Char -> m ()
encodeChar UTF8
_ Char
c
               | Int
n forall a. Ord a => a -> a -> Bool
<= Int
0x0000007F = Int -> m ()
p8 Int
n
               | Int
n forall a. Ord a => a -> a -> Bool
<= Int
0x000007FF = do
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0xC0 forall a. Bits a => a -> a -> a
.|. (Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
6)
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0x80 forall a. Bits a => a -> a -> a
.|. (Int
n forall a. Bits a => a -> a -> a
.&. Int
0x3F)
               | Int
n forall a. Ord a => a -> a -> Bool
<= Int
0x0000FFFF = do
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0xE0 forall a. Bits a => a -> a -> a
.|. (Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
12)
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0x80 forall a. Bits a => a -> a -> a
.|. ((Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
6) forall a. Bits a => a -> a -> a
.&. Int
0x3F)
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0x80 forall a. Bits a => a -> a -> a
.|. (Int
n forall a. Bits a => a -> a -> a
.&. Int
0x3F)
               | Int
n forall a. Ord a => a -> a -> Bool
<= Int
0x0010FFFF = do
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0xF0 forall a. Bits a => a -> a -> a
.|. (Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
18)
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0x80 forall a. Bits a => a -> a -> a
.|. ((Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
12) forall a. Bits a => a -> a -> a
.&. Int
0x3F)
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0x80 forall a. Bits a => a -> a -> a
.|. ((Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
6) forall a. Bits a => a -> a -> a
.&. Int
0x3F)
                         Int -> m ()
p8 forall a b. (a -> b) -> a -> b
$ Int
0x80 forall a. Bits a => a -> a -> a
.|. (Int
n forall a. Bits a => a -> a -> a
.&. Int
0x3F)
               | Bool
otherwise = forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Char -> EncodingException
HasNoRepresentation Char
c)
               where
                 n :: Int
n = Char -> Int
ord Char
c
                 p8 :: Int -> m ()
p8 = forall (m :: * -> *). ByteSink m => Word8 -> m ()
pushWord8forall b c a. (b -> c) -> (a -> b) -> a -> c
.forall a b. (Integral a, Num b) => a -> b
fromIntegral
    encodeable :: UTF8 -> Char -> Bool
encodeable UTF8
_ Char
c = Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x10FFFF'
    decodeChar :: forall (m :: * -> *). ByteSource m => UTF8 -> m Char
decodeChar UTF8
UTF8 = do
      Word8
w1 <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
      case () of 
        ()
_
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0x7F -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr forall a b. (a -> b) -> a -> b
$ forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
w1
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF -> forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0xDF -> do
                         Word8
w2 <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr forall a b. (a -> b) -> a -> b
$
                                    ((forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ Word8
w1 forall a. Bits a => a -> a -> a
.&. Word8
0x1F) forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ Word8
w2 forall a. Bits a => a -> a -> a
.&. Word8
0x3F)

          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0xEF -> do
                         Word8
w2 <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         Word8
w3 <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         let v1 :: Word8
v1 = Word8
w1 forall a. Bits a => a -> a -> a
.&. Word8
0x0F
                             v2 :: Word8
v2 = Word8
w2 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v3 :: Word8
v3 = Word8
w3 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                         forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr forall a b. (a -> b) -> a -> b
$
                                    ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
                                    forall a. Bits a => a -> a -> a
.|. ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3)
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0xF7 -> do
                         Word8
w2 <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         Word8
w3 <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         Word8
w4 <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         let v1 :: Word8
v1 = Word8
w1 forall a. Bits a => a -> a -> a
.&. Word8
0x07
                             v2 :: Word8
v2 = Word8
w2 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v3 :: Word8
v3 = Word8
w3 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v4 :: Word8
v4 = Word8
w4 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v :: Int
v  = ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) forall a. Bits a => a -> Int -> a
`shiftL` Int
18)
                                    forall a. Bits a => a -> a -> a
.|. ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
                                    forall a. Bits a => a -> a -> a
.|. ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3) forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v4)
                         if Int
v forall a. Ord a => a -> a -> Bool
<= Int
0x10FFFF
                           then forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr Int
v
                           else forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2,Word8
w3,Word8
w4])
          | Bool
otherwise -> forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
    decodeChar UTF8
UTF8Strict = do
      Word8
w1 <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
      case () of 
        ()
_
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0x7F -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr forall a b. (a -> b) -> a -> b
$ forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
w1
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF -> forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0xDF -> do
                         Word8
w2 <- m Word8
fetchExtend8
                         let v1 :: Word8
v1 = Word8
w1 forall a. Bits a => a -> a -> a
.&. Word8
0x1F
                         if Word8
v1 forall a. Ord a => a -> a -> Bool
<= Word8
1
                           then forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2])
                           else forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr forall a b. (a -> b) -> a -> b
$ 
                                    ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ Word8
w2 forall a. Bits a => a -> a -> a
.&. Word8
0x3F)
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0xEF -> do
                         Word8
w2 <- m Word8
fetchExtend8
                         Word8
w3 <- m Word8
fetchExtend8
                         let v1 :: Word8
v1 = Word8
w1 forall a. Bits a => a -> a -> a
.&. Word8
0x0F
                             v2 :: Word8
v2 = Word8
w2 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v3 :: Word8
v3 = Word8
w3 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                         if Word8
v1 forall a. Eq a => a -> a -> Bool
== Word8
0 Bool -> Bool -> Bool
&& Word8
v2 forall a. Ord a => a -> a -> Bool
< Word8
0x20
                           then forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2,Word8
w3])
                           else forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr forall a b. (a -> b) -> a -> b
$
                                    ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
                                    forall a. Bits a => a -> a -> a
.|. ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3)
          | Word8
w1 forall a. Ord a => a -> a -> Bool
<= Word8
0xF7 -> do
                         Word8
w2 <- m Word8
fetchExtend8
                         Word8
w3 <- m Word8
fetchExtend8
                         Word8
w4 <- m Word8
fetchExtend8
                         let v1 :: Word8
v1 = Word8
w1 forall a. Bits a => a -> a -> a
.&. Word8
0x07
                             v2 :: Word8
v2 = Word8
w2 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v3 :: Word8
v3 = Word8
w3 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v4 :: Word8
v4 = Word8
w4 forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v :: Int
v = ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) forall a. Bits a => a -> Int -> a
`shiftL` Int
18)
                                    forall a. Bits a => a -> a -> a
.|. ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
                                    forall a. Bits a => a -> a -> a
.|. ((forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3) forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v4)
                         if Word8
v1 forall a. Eq a => a -> a -> Bool
== Word8
0 Bool -> Bool -> Bool
&& Word8
v2 forall a. Ord a => a -> a -> Bool
< Word8
0x10
                           then forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2,Word8
w3,Word8
w4])
                           else (if Int
v forall a. Ord a => a -> a -> Bool
<= Int
0x10FFFF
                                 then forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr Int
v
                                 else forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2,Word8
w3,Word8
w4]))
          | Bool
otherwise -> forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
          where
            invalidExtend :: a -> Bool
invalidExtend a
wrd = a
wrd forall a. Bits a => a -> a -> a
.&. a
0xC0 forall a. Eq a => a -> a -> Bool
/= a
0x80
            fetchExtend8 :: m Word8
fetchExtend8 = do
              Word8
w <- forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
              if forall {a}. (Bits a, Num a) => a -> Bool
invalidExtend Word8
w 
                then forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w)
                else forall (m :: * -> *) a. Monad m => a -> m a
return Word8
w