{-# LANGUAGE DeriveDataTypeable #-}
{- | This module implements UTF-8 encoding and decoding as in RFC 3629.
     See <http://en.wikipedia.org/wiki/UTF-8> for more information.
 -}
module Data.Encoding.UTF8 where

import Control.Throws
import Data.Char
import Data.Bits

import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception

import Data.Typeable

data UTF8 = UTF8        -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
          | UTF8Strict  -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding
          deriving (UTF8 -> UTF8 -> Bool
(UTF8 -> UTF8 -> Bool) -> (UTF8 -> UTF8 -> Bool) -> Eq UTF8
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: UTF8 -> UTF8 -> Bool
$c/= :: UTF8 -> UTF8 -> Bool
== :: UTF8 -> UTF8 -> Bool
$c== :: UTF8 -> UTF8 -> Bool
Eq,Int -> UTF8 -> ShowS
[UTF8] -> ShowS
UTF8 -> String
(Int -> UTF8 -> ShowS)
-> (UTF8 -> String) -> ([UTF8] -> ShowS) -> Show UTF8
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [UTF8] -> ShowS
$cshowList :: [UTF8] -> ShowS
show :: UTF8 -> String
$cshow :: UTF8 -> String
showsPrec :: Int -> UTF8 -> ShowS
$cshowsPrec :: Int -> UTF8 -> ShowS
Show,Typeable)

instance Encoding UTF8 where
    encodeChar :: UTF8 -> Char -> m ()
encodeChar UTF8
_ Char
c
               | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0000007F = Int -> m ()
p8 Int
n
               | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x000007FF = do
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0xC0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
6)
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
               | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0000FFFF = do
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0xE0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
12)
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
               | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0010FFFF = do
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0xF0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
18)
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
12) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
                         Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
               | Bool
otherwise = EncodingException -> m ()
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Char -> EncodingException
HasNoRepresentation Char
c)
               where
                 n :: Int
n = Char -> Int
ord Char
c
                 p8 :: Int -> m ()
p8 = Word8 -> m ()
forall (m :: * -> *). ByteSink m => Word8 -> m ()
pushWord8(Word8 -> m ()) -> (Int -> Word8) -> Int -> m ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
.Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral
    encodeable :: UTF8 -> Char -> Bool
encodeable UTF8
_ Char
c = Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x10FFFF'
    decodeChar :: UTF8 -> m Char
decodeChar UTF8
UTF8 = do
      Word8
w1 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
      case () of 
        ()
_
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x7F -> Char -> m Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
w1
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xBF -> DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xDF -> do
                         Word8
w2 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         Char -> m Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$
                                    ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> Word8 -> Int
forall a b. (a -> b) -> a -> b
$ Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x1F) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> Word8 -> Int
forall a b. (a -> b) -> a -> b
$ Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F)

          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xEF -> do
                         Word8
w2 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         Word8
w3 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         let v1 :: Word8
v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x0F
                             v2 :: Word8
v2 = Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v3 :: Word8
v3 = Word8
w3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                         Char -> m Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$
                                    ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3)
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xF7 -> do
                         Word8
w2 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         Word8
w3 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         Word8
w4 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
                         let v1 :: Word8
v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x07
                             v2 :: Word8
v2 = Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v3 :: Word8
v3 = Word8
w3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v4 :: Word8
v4 = Word8
w4 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v :: Int
v  = ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
18)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v4)
                         if Int
v Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x10FFFF
                           then Char -> m Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr Int
v
                           else DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2,Word8
w3,Word8
w4])
          | Bool
otherwise -> DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
    decodeChar UTF8
UTF8Strict = do
      Word8
w1 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
      case () of 
        ()
_
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x7F -> Char -> m Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
w1
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xBF -> DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xDF -> do
                         Word8
w2 <- m Word8
fetchExtend8
                         let v1 :: Word8
v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x1F
                         if Word8
v1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
1
                           then DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2])
                           else Char -> m Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ 
                                    ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> Word8 -> Int
forall a b. (a -> b) -> a -> b
$ Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F)
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xEF -> do
                         Word8
w2 <- m Word8
fetchExtend8
                         Word8
w3 <- m Word8
fetchExtend8
                         let v1 :: Word8
v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x0F
                             v2 :: Word8
v2 = Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v3 :: Word8
v3 = Word8
w3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                         if Word8
v1 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0 Bool -> Bool -> Bool
&& Word8
v2 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x20
                           then DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2,Word8
w3])
                           else Char -> m Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$
                                    ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3)
          | Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xF7 -> do
                         Word8
w2 <- m Word8
fetchExtend8
                         Word8
w3 <- m Word8
fetchExtend8
                         Word8
w4 <- m Word8
fetchExtend8
                         let v1 :: Word8
v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x07
                             v2 :: Word8
v2 = Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v3 :: Word8
v3 = Word8
w3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v4 :: Word8
v4 = Word8
w4 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
                             v :: Int
v = ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
18)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
                                    Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v4)
                         if Word8
v1 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0 Bool -> Bool -> Bool
&& Word8
v2 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x10
                           then DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2,Word8
w3,Word8
w4])
                           else (if Int
v Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x10FFFF
                                 then Char -> m Char
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr Int
v
                                 else DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException ([Word8] -> DecodingException
IllegalRepresentation [Word8
w1,Word8
w2,Word8
w3,Word8
w4]))
          | Bool
otherwise -> DecodingException -> m Char
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
          where
            invalidExtend :: a -> Bool
invalidExtend a
wrd = a
wrd a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0xC0 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
/= a
0x80
            fetchExtend8 :: m Word8
fetchExtend8 = do
              Word8
w <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
              if Word8 -> Bool
forall a. (Bits a, Num a) => a -> Bool
invalidExtend Word8
w 
                then DecodingException -> m Word8
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w)
                else Word8 -> m Word8
forall (m :: * -> *) a. Monad m => a -> m a
return Word8
w