-- | This module contains functions for converting to and from the UTF8
-- representations for Strings.
module Util.UTF8(
   toUTF8,
      -- :: String -> String
      -- Converts a String (whose characters must all have codes <2^31) into
      -- its UTF8 representation.
   fromUTF8WE,
      -- :: Monad m => String -> m String
      -- Converts a UTF8 representation of a String back into the String,
      -- catching all possible format errors.
      --
      -- Example: With the Haskell module Control.Monad.Error, you can
      -- instance this as
      -- (fromUTF8WE :: String -> Either String String)
      -- to get a conversion function which either succeeds (Right) or
      -- returns an error message (Left).
   ) where

import Data.Char
import Data.List

import Data.Bits
import Data.Word
import Control.Monad.Except () -- needed for instance Monad (Either String)
import Control.Monad.Fail

import Util.Computation

-- --------------------------------------------------------------------------
-- Encoding
-- --------------------------------------------------------------------------

-- | Converts a String into its UTF8 representation.
toUTF8 :: Enum byte => String -> [byte]
toUTF8 :: String -> [byte]
toUTF8 [] = []
toUTF8 (Char
x:String
xs) =
   let
      xs1 :: [byte]
xs1 = String -> [byte]
forall byte. Enum byte => String -> [byte]
toUTF8 String
xs
      ox :: Int
ox = Char -> Int
ord Char
x

      mkUTF8 :: Int -> [a] -> Int -> Int -> [a]
mkUTF8 Int
x0 [a]
xs0 Int
xmask0 Int
xmax0 =
         let
            xbot :: Int
xbot = Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
x0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f)
            x1 :: Int
x1 = Int
x0 Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
6
            xs1 :: [a]
xs1 = Int -> a
forall a. Enum a => Int -> a
toEnum Int
xbot a -> [a] -> [a]
forall a. a -> [a] -> [a]
: [a]
xs0
         in
            if Int
x1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
xmax0
              then
                 Int -> a
forall a. Enum a => Int -> a
toEnum (Int
xmask0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
x1) a -> [a] -> [a]
forall a. a -> [a] -> [a]
: [a]
xs1
              else
                 let
                    xmask1 :: Int
xmask1 = Int
xmask0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
xmax0
                    xmax1 :: Int
xmax1 = Int
xmax0 Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
1
                 in
                    Int -> [a] -> Int -> Int -> [a]
mkUTF8 Int
x1 [a]
xs1 Int
xmask1 Int
xmax1
   in
      if Int
ox Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7f
         then
            Int -> byte
forall a. Enum a => Int -> a
toEnum Int
ox byte -> [byte] -> [byte]
forall a. a -> [a] -> [a]
: [byte]
xs1
         else
           if Int
ox Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
31 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= Int
0
              then
                 String -> [byte]
forall a. HasCallStack => String -> a
error (String
"Huge character with code " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
ox String -> String -> String
forall a. [a] -> [a] -> [a]
++
                    String
" detected in string being converted to UTF8.")
              else
                 Int -> [byte] -> Int -> Int -> [byte]
forall a. Enum a => Int -> [a] -> Int -> Int -> [a]
mkUTF8 Int
ox [byte]
xs1 Int
0xc0 Int
0x20

{-# SPECIALIZE toUTF8 :: String -> [Char] #-}
{-# SPECIALIZE toUTF8 :: String -> [Word8] #-}

-- | Converts a UTF8 representation of a String back into the String,
-- catching all possible format errors.
--
-- Example: With the Haskell module Control.Monad.Error, you can
-- instance this as
-- (fromUTF8WE :: String -> Either String String)
-- to get a conversion function which either succeeds (Right) or
-- returns an error message (Left).
fromUTF8WE :: (Enum byte,MonadFail m) => [byte] -> m String
fromUTF8WE :: [byte] -> m String
fromUTF8WE [] = String -> m String
forall (m :: * -> *) a. Monad m => a -> m a
return []
fromUTF8WE (byte
x0 : [byte]
xs0) =
   let
      ox :: Int
ox = byte -> Int
forall a. Enum a => a -> Int
fromEnum byte
x0
   in
      case Int -> Int
topZero8 Int
ox of
         Int
7 ->
            do
               String
xs1 <- [byte] -> m String
forall byte (m :: * -> *).
(Enum byte, MonadFail m) =>
[byte] -> m String
fromUTF8WE [byte]
xs0
               String -> m String
forall (m :: * -> *) a. Monad m => a -> m a
return (Int -> Char
chr Int
ox Char -> String -> String
forall a. a -> [a] -> [a]
: String
xs1)
         Int
6 ->
            String -> m String
forall (m :: * -> *) a. MonadFail m => String -> m a
Control.Monad.Fail.fail String
"UTF8 escape sequence starts 10xxxxxx"
         Int
0 ->
            String -> m String
forall (m :: * -> *) a. MonadFail m => String -> m a
Control.Monad.Fail.fail String
"UTF8 escape sequence starts 11111110"
         -1 ->
            String -> m String
forall (m :: * -> *) a. MonadFail m => String -> m a
Control.Monad.Fail.fail String
"UTF8 escape sequence starts 11111111"
         Int
n ->
            let
               r :: Int
r = Int
6 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n -- number of 6-bit pieces
               xtop :: Int
xtop = Int
ox Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int -> Int
ones Int
n

               minx :: Int
minx =
                  Int -> Int
forall a. Bits a => Int -> a
bit (
                     if Int
r Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1
                        then
                           Int
7
                        else
                           Int
5Int -> Int -> Int
forall a. Num a => a -> a -> a
*Int
r Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
                     )

               mkx :: [a] -> Int -> t -> m (Int, [a])
mkx [] Int
_ t
_ =
                  String -> m (Int, [a])
forall (m :: * -> *) a. MonadFail m => String -> m a
Control.Monad.Fail.fail String
"UTF8 string ends in middle of escape sequence"
               mkx (a
ch : [a]
xs1) Int
x0 t
count0 =
                  do
                     let
                        och :: Int
och = a -> Int
forall a. Enum a => a -> Int
fromEnum a
ch
                     if Int
och Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x80 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= Int
0x80
                        then
                           String -> m ()
forall (m :: * -> *) a. MonadFail m => String -> m a
Control.Monad.Fail.fail (String
"UTF8 escape sequence contains continuing "
                              String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"character not of form 10xxxxxx")
                        else
                           () -> m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
                     let
                        xbot :: Int
xbot = Int
och Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f
                        x1 :: Int
x1 = (Int
x0 Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
xbot
                        count1 :: t
count1 = t
count0 t -> t -> t
forall a. Num a => a -> a -> a
- t
1
                     if t
count1 t -> t -> Bool
forall a. Eq a => a -> a -> Bool
== t
0
                        then
                           (Int, [a]) -> m (Int, [a])
forall (m :: * -> *) a. Monad m => a -> m a
return (Int
x1,[a]
xs1)
                        else
                           [a] -> Int -> t -> m (Int, [a])
mkx [a]
xs1 Int
x1 t
count1
            in
               do
                  (Int
x,[byte]
xs1) <- [byte] -> Int -> Int -> m (Int, [byte])
forall (m :: * -> *) t a.
(MonadFail m, Eq t, Enum a, Num t) =>
[a] -> Int -> t -> m (Int, [a])
mkx [byte]
xs0 Int
xtop Int
r
                  if Int
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
minx
                     then
                        String -> m String
forall (m :: * -> *) a. MonadFail m => String -> m a
Control.Monad.Fail.fail (String
"UTF8 escape sequence contains character not "
                           String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"optimally encoded")
                     else
                        do
                           String
xs2 <- [byte] -> m String
forall byte (m :: * -> *).
(Enum byte, MonadFail m) =>
[byte] -> m String
fromUTF8WE [byte]
xs1
                           String -> m String
forall (m :: * -> *) a. Monad m => a -> m a
return (Int -> Char
forall a. Enum a => Int -> a
toEnum Int
x Char -> String -> String
forall a. a -> [a] -> [a]
: String
xs2)

{-# SPECIALIZE fromUTF8WE :: String -> WithError String #-}
{-# SPECIALIZE fromUTF8WE :: [Word8] -> WithError String #-}


-- --------------------------------------------------------------------------
-- Binary utilities
-- --------------------------------------------------------------------------

-- | return the number of the top bit which is zero, or -1 if they
-- are all zero, for a number between 0 and 255.
topZero8 :: Int -> Int
topZero8 :: Int -> Int
topZero8 Int
i =
   case
      ((Bool -> Bool) -> [Bool] -> Maybe Int
forall a. (a -> Bool) -> [a] -> Maybe Int
findIndex Bool -> Bool
not
         ((Int -> Bool) -> [Int] -> [Bool]
forall a b. (a -> b) -> [a] -> [b]
map
            (\ Int
bn -> Int -> Int -> Bool
forall a. Bits a => a -> Int -> Bool
testBit Int
i Int
bn)
            [Int
7,Int
6..Int
0]
            ))
      of
         Just Int
n -> Int
7 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n
         Maybe Int
Nothing -> -Int
1

-- | (ones i) is number with binary representation 1 written i times.
ones :: Int -> Int
ones :: Int -> Int
ones Int
i = Int -> Int
forall a. Bits a => Int -> a
bit Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1