{-# LANGUAGE UnboxedTuples #-}
{-# LANGUAGE CPP #-}

module Bytezap.Text where

import Bytezap
import Bytezap.Int

import Data.Text.Internal
#if !MIN_VERSION_text(2,0,0)
-- only needed by the text-1 fallback of textUtf8
import Data.Text qualified as T
#endif

-- unused import warnings due to messy CPP
import Bytezap.Bytes
import Data.Text.Array qualified as A
import GHC.Exts

import Data.Char ( ord )
import Data.Foldable ( foldl' )
import Data.Bits ( shiftR, (.&.) )

-- | Serialize a 'Text' as UTF-8.
--
-- With text-2 and later, the 'Text' constructor is taken apart into its
-- backing byte array, offset and length. The result is a 'Write' of @len@
-- bytes whose poke is 'pokeByteArray#' applied to exactly that array, offset
-- and length, so no re-encoding happens here.
--
-- With older text versions there is no fast path. Rather than bottoming out
-- at runtime, the text is unpacked to a 'String' and encoded one 'Char' at a
-- time through 'stringUtf8'. That is slow, but it produces a usable 'Write'.
textUtf8 :: Text -> Write
{-# INLINE textUtf8 #-}
#if MIN_VERSION_text(2,0,0)
textUtf8 (Text (A.ByteArray arr#) (I# off#) len@(I# len#)) =
    Write len $ pokeByteArray# arr# off# len#
#else
textUtf8 = stringUtf8 . T.unpack
#endif

-- | Encode a single 'Char' as UTF-8, emitting between one and four bytes
-- depending on how large its code point is.
--
-- TODO adapted from utf8-string
--
-- NOTE(review): any code point above 0x7ff and up to 0xffff takes the
-- three-byte branch, surrogates included. Nothing here rejects or replaces
-- them; confirm whether callers rely on that.
charUtf8 :: Char -> Write
{-# INLINE charUtf8 #-}
charUtf8 = encode . ord
  where
    encode :: Int -> Write
    encode cp
      | cp <= 0x7f   = w8 (fromIntegral cp)
      | cp <= 0x7ff  = lead 0xc0 (cp `shiftR` 6) <> cont cp
      | cp <= 0xffff =
             lead 0xe0 (cp `shiftR` 12)
          <> cont (cp `shiftR` 6)
          <> cont cp
      | otherwise    =
             lead 0xf0 (cp `shiftR` 18)
          <> cont (cp `shiftR` 12)
          <> cont (cp `shiftR` 6)
          <> cont cp

    -- First byte of a multi-byte sequence: the length marker plus the topmost
    -- payload bits, which the caller has already shifted down.
    lead :: Int -> Int -> Write
    lead marker hiBits = w8 (fromIntegral (marker + hiBits))

    -- Continuation byte: 0x80 plus the low six bits of the argument.
    cont :: Int -> Write
    cont bits = w8 (fromIntegral (0x80 + (bits .&. 0x3f)))

-- | Serialize a 'String' as UTF-8, one 'Char' at a time. TODO
--
-- Ideally nobody would need this: 'String's are a poor fit for serialization.
-- Unfortunately, as of GHC 9.6, type-level strings only reflect to 'String',
-- so we are stuck handling them. This walks the list left to right with a
-- strict fold, appending the 'charUtf8' encoding of each character to the
-- 'Write' built so far (starting from 'mempty').
--
-- Going through 'Text' instead would be simpler and is probably about as
-- fast, but who doesn't like a little challenge?
stringUtf8 :: String -> Write
{-# INLINE stringUtf8 #-}
stringUtf8 = foldl' snoc mempty
  where
    -- Append the encoding of the next character to everything written so far.
    snoc acc ch = acc <> charUtf8 ch