{-# LANGUAGE BangPatterns
           , DataKinds
           , KindSignatures
           , UnboxedTuples #-}

{- | Functions for parsing UTF-32, both little-endian and big-endian.
 -}

module Parser.Lathe.Encoding.UTF32
  ( -- * Byte-order mark
    utf32BOM
    
    -- * UTF-32
  , UTF32Point (..)
  , fromUtf32
  , isSurrogate

    -- ** Parsers
  , unitUtf32BE
  , unitUtf32LE
  ) where

import           Parser.Lathe.Binary.Internal
import           Parser.Lathe.Internal

import           Data.Bits
import           Data.ByteString (ByteString)
import qualified Data.ByteString.Unsafe as B
import           Data.Word
import           GHC.Base (unsafeChr)
import           GHC.ByteOrder



{-# INLINE utf32BOM #-}
-- | Consume 4 bytes that represent a UTF-32 byte-order mark and return
--   the corresponding 'ByteOrder'.
utf32BOM
  :: e                  -- ^ Malformed.
  -> e                  -- ^ Reached end.
  -> Parser e ByteOrder
utf32BOM :: forall e. e -> e -> Parser e ByteOrder
utf32BOM e
malformed = Int
-> (ByteString -> (# Res e ByteOrder #)) -> e -> Parser e ByteOrder
forall e a. Int -> (ByteString -> (# Res e a #)) -> e -> Parser e a
unsafeRead Int
4 (e -> ByteString -> (# Res e ByteOrder #)
forall e. e -> ByteString -> (# Res e ByteOrder #)
convUtf32BOM e
malformed)

convUtf32BOM :: e -> ByteString -> (# Res e ByteOrder #)
convUtf32BOM :: forall e. e -> ByteString -> (# Res e ByteOrder #)
convUtf32BOM e
e = \ByteString
b ->
  let w0 :: Word8
w0 = ByteString -> Int -> Word8
B.unsafeIndex ByteString
b Int
0
      w1 :: Word8
w1 = ByteString -> Int -> Word8
B.unsafeIndex ByteString
b Int
1
      w2 :: Word8
w2 = ByteString -> Int -> Word8
B.unsafeIndex ByteString
b Int
2
      w3 :: Word8
w3 = ByteString -> Int -> Word8
B.unsafeIndex ByteString
b Int
3

  in Word8 -> Word8 -> Word8 -> Word8 -> (# Res e ByteOrder #)
forall {a} {a} {a} {a}.
(Eq a, Eq a, Eq a, Eq a, Num a, Num a, Num a, Num a) =>
a -> a -> a -> a -> (# Res e ByteOrder #)
go Word8
w0 Word8
w1 Word8
w2 Word8
w3
  where
    go :: a -> a -> a -> a -> (# Res e ByteOrder #)
go a
w0 a
w1 a
w2 a
w3
      | a
w0 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
0x00, a
w1 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
0x00, a
w2 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
0xFE, a
w3 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
0xFF = (# ByteOrder -> Res e ByteOrder
forall a e. a -> Res e a
Yes ByteOrder
BigEndian #)
      | a
w0 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
0xFF, a
w1 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
0xFE, a
w2 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
0x00, a
w3 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
0x00 = (# ByteOrder -> Res e ByteOrder
forall a e. a -> Res e a
Yes ByteOrder
LittleEndian #)
      | Bool
otherwise                                      = (# e -> Res e ByteOrder
forall e a. e -> Res e a
No e
e #)



-- | Unicode code unit.
newtype UTF32Point = UTF32Point Word32

-- | Convert a code point into a 'Char'.
fromUtf32 :: UTF32Point -> Char
fromUtf32 :: UTF32Point -> Char
fromUtf32 (UTF32Point Word32
u) = Int -> Char
unsafeChr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Word32 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word32
u

-- | Check whether a code point lies in the surrogate range (@U+D800@ to @U+DFFF@).
isSurrogate :: UTF32Point -> Bool
isSurrogate :: UTF32Point -> Bool
isSurrogate (UTF32Point Word32
u) = (Word32
u Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0xFFFFF800) Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
0x0000D800



{-# INLINE unitUtf32BE #-}
-- | Consume 4 bytes that represents a big-endian UTF-32 character.
unitUtf32BE
  :: e                   -- ^ Code unit is greater than @U+10FFFF@.
  -> e                   -- ^ Reached end.
  -> Parser e UTF32Point
unitUtf32BE :: forall e. e -> e -> Parser e UTF32Point
unitUtf32BE e
e = Int
-> (ByteString -> (# Res e UTF32Point #))
-> e
-> Parser e UTF32Point
forall e a. Int -> (ByteString -> (# Res e a #)) -> e -> Parser e a
unsafeRead Int
4 (e -> ByteString -> (# Res e UTF32Point #)
forall e. e -> ByteString -> (# Res e UTF32Point #)
u32BE e
e)

u32BE :: e -> ByteString -> (# Res e UTF32Point #)
u32BE :: forall e. e -> ByteString -> (# Res e UTF32Point #)
u32BE e
e = \ByteString
b ->
  let w :: Word32
w = ByteString -> Word32
w32BE ByteString
b
  in if Word32
w Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word32
0x00110000
       then (# e -> Res e UTF32Point
forall e a. e -> Res e a
No e
e #)
       else (# UTF32Point -> Res e UTF32Point
forall a e. a -> Res e a
Yes (Word32 -> UTF32Point
UTF32Point Word32
w) #)



{-# INLINE unitUtf32LE #-}
-- | Consume 4 bytes that represents a little-endian UTF-32 character.
unitUtf32LE
  :: e                   -- ^ Code unit is greater than @U+10FFFF@.
  -> e                   -- ^ Reached end.
  -> Parser e UTF32Point
unitUtf32LE :: forall e. e -> e -> Parser e UTF32Point
unitUtf32LE e
e = Int
-> (ByteString -> (# Res e UTF32Point #))
-> e
-> Parser e UTF32Point
forall e a. Int -> (ByteString -> (# Res e a #)) -> e -> Parser e a
unsafeRead Int
4 (e -> ByteString -> (# Res e UTF32Point #)
forall e. e -> ByteString -> (# Res e UTF32Point #)
u32LE e
e)

u32LE :: e -> ByteString -> (# Res e UTF32Point #)
u32LE :: forall e. e -> ByteString -> (# Res e UTF32Point #)
u32LE e
e = \ByteString
b ->
  let w :: Word32
w = ByteString -> Word32
w32LE ByteString
b
  in if Word32
w Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word32
0x00110000
       then (# e -> Res e UTF32Point
forall e a. e -> Res e a
No e
e #)
       else (# UTF32Point -> Res e UTF32Point
forall a e. a -> Res e a
Yes (Word32 -> UTF32Point
UTF32Point Word32
w) #)