{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE EmptyCase #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE NoImplicitPrelude #-}

{-# OPTIONS_GHC -O2 #-}

module Data.ByteString.Utf8
  ( isUtf8 
  , isUtf8' 
  ) where

import Data.Bits ( (.&.) )
import Data.ByteString.Ascii (isAscii)
import Data.ByteString.Internal (ByteString(..), accursedUnutterablePerformIO)
import Data.Word (Word8)
import GHC.Base
import GHC.Ptr
import Foreign.ForeignPtr (withForeignPtr)
import Foreign.Storable (peek)

data Utf8 = U8 | U16 | U24 | U32 | UNot
  deriving Eq

which :: Word8 -> Utf8
which c
  | isUtf8b8  c = U8
  | isUtf8b16 c = U16
  | isUtf8b24 c = U24
  | isUtf8b32 c = U32
  | otherwise   = UNot

isUtf8Ptr :: Ptr Word8 -> Ptr Word8 -> IO Bool
isUtf8Ptr !p !q
  | p == q    = pure True
  | otherwise = do
      c <- peek p

      case which c of
        U8  -> isUtf8Ptr (p `plusPtr` 1) q 
        U16 -> if q `minusPtr` p >= 2
                 then 
                   do d <- peek (p `plusPtr` 1)  
                      if isUtf8OtherBytes d then isUtf8Ptr (p `plusPtr` 2) q else pure False
                 else pure False
        U24 -> if q `minusPtr` p >= 3
                 then
                   do d <- peek (p `plusPtr` 1); e <- peek (p `plusPtr` 2)
                      if (isUtf8OtherBytes d && isUtf8OtherBytes e) then isUtf8Ptr (p `plusPtr` 3) q else pure False
                 else pure False
        U32 -> if q `minusPtr` p >= 4
                 then
                   do d <- peek (p `plusPtr` 1); e <- peek (p `plusPtr` 2); f <- peek (p `plusPtr` 3);
                      if (isUtf8OtherBytes d && isUtf8OtherBytes e && isUtf8OtherBytes f) then isUtf8Ptr (p `plusPtr` 4) q else pure False
                 else pure False
        UNot -> pure False

-- Hex:     0x80
-- Binary:  10000000
-- Decimal: 128
isUtf8b8 :: Word8 -> Bool
isUtf8b8 !w = w .&. 0x80 == 0

-- Hex:     0xE0 
-- Binary:  11100000
-- Decimal: 224
isUtf8b16 :: Word8 -> Bool
isUtf8b16 !w = w .&. 0xE0 == 0xC0

-- Hex:     0xF0
-- Binary:  11110000
-- Decimal: 240 
isUtf8b24 :: Word8 -> Bool
isUtf8b24 !w = w .&. 0xF0 == 0xE0


-- Hex:     0xF8
-- Binary:  11111000
-- Decimal: 248
isUtf8b32 :: Word8 -> Bool
isUtf8b32 !w = w .&. 0xF8 == 0xF0

-- Hex:     0xC0
-- Binary:  11000000
-- Decimal: 192
isUtf8OtherBytes :: Word8 -> Bool
isUtf8OtherBytes !w = w .&. 0xC0 == 0x80

-- | 'isUtf8' firsts calls the very fast 'Data.ByteString.Ascii.isAscii' to see if
--    the data is ASCII (and thus UTF-8). Use this if you know most of your
--    data is ASCII-encoded.
--    If you know that most of your data is probably not UTF8-encoded, it is probably
--    best to use 'isUtf8'' to avoid this check.
isUtf8 :: ByteString -> Bool
isUtf8   (PS _ _ 0) = True
isUtf8 b@(PS fp (I# o#) (I# l#)) = if isAscii b then True else
  accursedUnutterablePerformIO
    $ withForeignPtr fp
      $ \(Ptr addr) ->
        do
          let
            start, end :: Ptr Word8
            start = Ptr (plusAddr# addr o#)
            end   = Ptr (plusAddr# addr (o# +# l#))
            
          isUtf8Ptr start end

-- | 'isUtf8'' does not call 'Data.ByteString.Ascii.isAscii. Use this if
--   you know most of your data is not ASCII-encoded.
--   If you know that most of your data is probably ASCII-encoded, it is
--   probably best to use 'isUtf8'.
isUtf8' :: ByteString -> Bool
isUtf8' (PS _ _ 0) = True
isUtf8' (PS fp (I# o#) (I# l#)) =
  accursedUnutterablePerformIO
    $ withForeignPtr fp
      $ \(Ptr addr) ->
        do
          let
            start, end :: Ptr Word8
            start = Ptr (plusAddr# addr o#)
            end   = Ptr (plusAddr# addr (o# +# l#))
            
          isUtf8Ptr start end