{-# language BangPatterns #-}
{-# language BinaryLiterals #-}
{-# language DataKinds #-}
{-# language DeriveFunctor #-}
{-# language DerivingStrategies #-}
{-# language GADTSyntax #-}
{-# language KindSignatures #-}
{-# language LambdaCase #-}
{-# language MagicHash #-}
{-# language MultiWayIf #-}
{-# language PolyKinds #-}
{-# language RankNTypes #-}
{-# language ScopedTypeVariables #-}
{-# language StandaloneDeriving #-}
{-# language TypeApplications #-}
{-# language UnboxedSums #-}
{-# language UnboxedTuples #-}
{-# language CPP #-}

-- | Parse input as UTF-8-encoded text. Parsers in this module will
-- fail if they encounter a byte above @0x7F@.
module Data.Bytes.Parser.Utf8
  ( -- * Get Character
    any#
  , shortText
  ) where

import Prelude hiding (length,any,fail,takeWhile)

import Data.Bits ((.&.),(.|.),unsafeShiftL,xor)
import Data.Bytes.Parser.Internal (Parser(..))
import Data.Text.Short (ShortText)
import GHC.Exts (Int(I#),Char(C#),Int#,Char#,(-#),(+#),(>#),chr#)
import GHC.Word (Word8(W8#))

import qualified Data.ByteString.Short.Internal as BSS
import qualified Data.Bytes.Parser as Parser
import qualified Data.Primitive as PM
import qualified Data.Text.Short as TS
import qualified GHC.Exts as Exts

-- | Interpret the next one to four bytes as a UTF-8-encoded character.
-- Fails if the decoded codepoint is in the range U+D800 through U+DFFF.
any# :: e -> Parser e s Char#
{-# noinline any# #-}
any# e = Parser
  (\(# arr, off, len #) s0 -> case len ># 0# of
    1# ->
      let !w0 = Exts.indexWord8Array# arr off
       in if | oneByteChar (W8# w0) ->
                 (# s0, (# | (# chr# (Exts.word2Int# (
#if MIN_VERSION_base(4,16,0)
                 Exts.word8ToWord#
#endif
                 w0)), off +# 1#, len -# 1# #) #) #)
             | twoByteChar (W8# w0) ->
                 if | I# len > 1
                    , w1 <- Exts.indexWord8Array# arr (off +# 1#)
                    , followingByte (W8# w1)
                    , C# c <- codepointFromTwoBytes (W8# w0) (W8# w1)
                      -> (# s0, (# | (# c, off +# 2#, len -# 2# #) #) #)
                    | otherwise -> (# s0, (# e | #) #)
             | threeByteChar (W8# w0) ->
                 if | I# len > 2
                    , w1 <- Exts.indexWord8Array# arr (off +# 1# )
                    , w2 <- Exts.indexWord8Array# arr (off +# 2# )
                    , followingByte (W8# w1)
                    , !c@(C# c#) <- codepointFromThreeBytes (W8# w0) (W8# w1) (W8# w2)
                    , c < '\xD800' || c > '\xDFFF'
                      -> (# s0, (# | (# c#, off +# 3#, len -# 3# #) #) #)
                    | otherwise -> (# s0, (# e | #) #)
             | fourByteChar (W8# w0) ->
                 if | I# len > 3
                    , w1 <- Exts.indexWord8Array# arr (off +# 1# )
                    , w2 <- Exts.indexWord8Array# arr (off +# 2# )
                    , w3 <- Exts.indexWord8Array# arr (off +# 3# )
                    , followingByte (W8# w1)
                    , !(C# c#) <- codepointFromFourBytes (W8# w0) (W8# w1) (W8# w2) (W8# w3)
                      -> (# s0, (# | (# c#, off +# 4#, len -# 4# #) #) #)
                    | otherwise -> (# s0, (# e | #) #)
             | otherwise -> (# s0, (# e | #) #)
    _ -> (# s0, (# e | #) #)
  )

codepointFromFourBytes :: Word8 -> Word8 -> Word8 -> Word8 -> Char
codepointFromFourBytes w1 w2 w3 w4 = C#
  ( chr#
    ( unI $ fromIntegral
      ( unsafeShiftL (word8ToWord w1 .&. 0b00001111) 18 .|.
        unsafeShiftL (word8ToWord w2 .&. 0b00111111) 12 .|.
        unsafeShiftL (word8ToWord w3 .&. 0b00111111) 6 .|.
        (word8ToWord w4 .&. 0b00111111)
      )
    )
  )

codepointFromThreeBytes :: Word8 -> Word8 -> Word8 -> Char
codepointFromThreeBytes w1 w2 w3 = C#
  ( chr#
    ( unI $ fromIntegral
      ( unsafeShiftL (word8ToWord w1 .&. 0b00001111) 12 .|.
        unsafeShiftL (word8ToWord w2 .&. 0b00111111) 6 .|.
        (word8ToWord w3 .&. 0b00111111)
      )
    )
  )

codepointFromTwoBytes :: Word8 -> Word8 -> Char
codepointFromTwoBytes w1 w2 = C#
  ( chr#
    ( unI $ fromIntegral @Word @Int
      ( unsafeShiftL (word8ToWord w1 .&. 0b00011111) 6 .|.
        (word8ToWord w2 .&. 0b00111111)
      )
    )
  )

oneByteChar :: Word8 -> Bool
oneByteChar !w = w .&. 0b10000000 == 0

twoByteChar :: Word8 -> Bool
twoByteChar !w = w .&. 0b11100000 == 0b11000000

threeByteChar :: Word8 -> Bool
threeByteChar !w = w .&. 0b11110000 == 0b11100000

fourByteChar :: Word8 -> Bool
fourByteChar !w = w .&. 0b11111000 == 0b11110000

followingByte :: Word8 -> Bool
followingByte !w = xor w 0b01000000 .&. 0b11000000 == 0b11000000

word8ToWord :: Word8 -> Word
word8ToWord = fromIntegral

unI :: Int -> Int#
unI (I# w) = w

-- | Consume input that matches the argument. Fails if the
-- input does not match.
shortText :: e -> ShortText -> Parser e s ()
shortText e !t = Parser.byteArray e
  (shortByteStringToByteArray (TS.toShortByteString t))

shortByteStringToByteArray ::
     BSS.ShortByteString
  -> PM.ByteArray
shortByteStringToByteArray (BSS.SBS x) = PM.ByteArray x