{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, MultiWayIf #-}
{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}
-- We always optimise this, otherwise performance of a non-optimised
-- compiler is severely affected. This module used to live in the `ghc`
-- package but has been moved to `ghc-boot` because the definition
-- of the package database (needed in both ghc and in ghc-pkg) lives in
-- `ghc-boot` and uses ShortText, which in turn depends on this module.

-- | Simple, non-streaming UTF-8 codecs.
--
-- This is one of several UTF-8 implementations provided by GHC; see Note
-- [GHC's many UTF-8 implementations] in "GHC.Encoding.UTF8" for an
-- overview.
--
module GHC.Utils.Encoding.UTF8
    ( -- * Decoding single characters
      utf8DecodeCharAddr#
    , utf8DecodeCharPtr
    , utf8DecodeCharByteArray#
    , utf8PrevChar
    , utf8CharStart
    , utf8UnconsByteString
      -- * Decoding strings
    , utf8DecodeByteString
    , utf8DecodeShortByteString
    , utf8DecodeForeignPtr
    , utf8DecodeByteArray#
      -- * Counting characters
    , utf8CountCharsShortByteString
    , utf8CountCharsByteArray#
      -- * Comparison
    , utf8CompareByteArray#
    , utf8CompareShortByteString
      -- * Encoding strings
    , utf8EncodeByteArray#
    , utf8EncodePtr
    , utf8EncodeByteString
    , utf8EncodeShortByteString
    , utf8EncodedLength
    ) where


import Prelude

import Foreign
import GHC.IO
#if MIN_VERSION_base(4,18,0)
import GHC.Encoding.UTF8
#else
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import Data.Char
import GHC.Exts
import GHC.ST
#endif

import Data.ByteString (ByteString)
import qualified Data.ByteString.Internal as BS
import Data.ByteString.Short.Internal (ShortByteString(..))

-- | Find the start of the codepoint preceding the codepoint at the given
-- 'Ptr'. This is undefined if there is no previous valid codepoint.
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar Ptr Word8
p = Ptr Word8 -> IO (Ptr Word8)
utf8CharStart (Ptr Word8
p forall a b. Ptr a -> Int -> Ptr b
`plusPtr` (-Int
1))

-- | Find the start of the codepoint at the given 'Ptr'. This is undefined if
-- there is no previous valid codepoint.
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart Ptr Word8
p = forall {b}. (Storable b, Ord b, Num b) => Ptr b -> IO (Ptr b)
go Ptr Word8
p
 where go :: Ptr b -> IO (Ptr b)
go Ptr b
p = do b
w <- forall a. Storable a => Ptr a -> IO a
peek Ptr b
p
                 if b
w forall a. Ord a => a -> a -> Bool
>= b
0x80 Bool -> Bool -> Bool
&& b
w forall a. Ord a => a -> a -> Bool
< b
0xC0
                        then Ptr b -> IO (Ptr b)
go (Ptr b
p forall a b. Ptr a -> Int -> Ptr b
`plusPtr` (-Int
1))
                        else forall (m :: * -> *) a. Monad m => a -> m a
return Ptr b
p

utf8CountCharsShortByteString :: ShortByteString -> Int
utf8CountCharsShortByteString :: ShortByteString -> Int
utf8CountCharsShortByteString (SBS ByteArray#
ba) = ByteArray# -> Int
utf8CountCharsByteArray# ByteArray#
ba

utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString (SBS ByteArray#
ba#) = ByteArray# -> [Char]
utf8DecodeByteArray# ByteArray#
ba#

-- | Decode a 'ByteString' containing a UTF-8 string.
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString (BS.PS ForeignPtr Word8
fptr Int
offset Int
len)
  = ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeForeignPtr ForeignPtr Word8
fptr Int
offset Int
len

utf8EncodeShortByteString :: String -> ShortByteString
utf8EncodeShortByteString :: [Char] -> ShortByteString
utf8EncodeShortByteString [Char]
str = ByteArray# -> ShortByteString
SBS ([Char] -> ByteArray#
utf8EncodeByteArray# [Char]
str)

-- | Encode a 'String' into a 'ByteString'.
utf8EncodeByteString :: String -> ByteString
utf8EncodeByteString :: [Char] -> ByteString
utf8EncodeByteString [Char]
s =
  forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$ do
    let len :: Int
len = [Char] -> Int
utf8EncodedLength [Char]
s
    ForeignPtr Word8
buf <- forall a. Int -> IO (ForeignPtr a)
mallocForeignPtrBytes Int
len
    forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
buf forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr -> do
      Ptr Word8 -> [Char] -> IO ()
utf8EncodePtr Ptr Word8
ptr [Char]
s
      forall (f :: * -> *) a. Applicative f => a -> f a
pure (ForeignPtr Word8 -> Int -> Int -> ByteString
BS.fromForeignPtr ForeignPtr Word8
buf Int
0 Int
len)

utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
utf8UnconsByteString (BS.PS ForeignPtr Word8
_ Int
_ Int
0) = forall a. Maybe a
Nothing
utf8UnconsByteString (BS.PS ForeignPtr Word8
fptr Int
offset Int
len)
  = forall a. IO a -> a
unsafeDupablePerformIO forall a b. (a -> b) -> a -> b
$
      forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
fptr forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr -> do
        let (Char
c,Int
n) = Ptr Word8 -> (Char, Int)
utf8DecodeCharPtr (Ptr Word8
ptr forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
offset)
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a
Just (Char
c, ForeignPtr Word8 -> Int -> Int -> ByteString
BS.PS ForeignPtr Word8
fptr (Int
offset forall a. Num a => a -> a -> a
+ Int
n) (Int
len forall a. Num a => a -> a -> a
- Int
n))

utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
utf8CompareShortByteString (SBS ByteArray#
a1) (SBS ByteArray#
a2) = ByteArray# -> ByteArray# -> Ordering
utf8CompareByteArray# ByteArray#
a1 ByteArray#
a2

---------------------------------------------------------
-- Everything below was moved into base in GHC 9.6
--
-- These can be dropped in GHC 9.6 + 2 major releases.
---------------------------------------------------------

#if !MIN_VERSION_base(4,18,0)

-- We can't write the decoder as efficiently as we'd like without
-- resorting to unboxed extensions, unfortunately.  I tried to write
-- an IO version of this function, but GHC can't eliminate boxed
-- results from an IO-returning function.
--
-- We assume we can ignore overflow when parsing a multibyte character here.
-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
-- before decoding them (see "GHC.Data.StringBuffer").

{-# INLINE utf8DecodeChar# #-}
-- | Decode a single codepoint from a byte buffer indexed by the given indexing
-- function.
utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# Int# -> Word#
indexWord8# =
  let !ch0 :: Int#
ch0 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
0#) in
  case () of
    ()
_ | Int# -> Bool
isTrue# (Int#
ch0 Int# -> Int# -> Int#
<=# Int#
0x7F#) -> (# Int# -> Char#
chr# Int#
ch0, Int#
1# #)

      | Int# -> Bool
isTrue# ((Int#
ch0 Int# -> Int# -> Int#
>=# Int#
0xC0#) Int# -> Int# -> Int#
`andI#` (Int#
ch0 Int# -> Int# -> Int#
<=# Int#
0xDF#)) ->
        let !ch1 :: Int#
ch1 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
1#) in
        if Int# -> Bool
isTrue# ((Int#
ch1 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch1 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
1# else
        (# Int# -> Char#
chr# (((Int#
ch0 Int# -> Int# -> Int#
-# Int#
0xC0#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
6#) Int# -> Int# -> Int#
+#
                  (Int#
ch1 Int# -> Int# -> Int#
-# Int#
0x80#)),
           Int#
2# #)

      | Int# -> Bool
isTrue# ((Int#
ch0 Int# -> Int# -> Int#
>=# Int#
0xE0#) Int# -> Int# -> Int#
`andI#` (Int#
ch0 Int# -> Int# -> Int#
<=# Int#
0xEF#)) ->
        let !ch1 :: Int#
ch1 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
1#) in
        if Int# -> Bool
isTrue# ((Int#
ch1 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch1 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
1# else
        let !ch2 :: Int#
ch2 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
2#) in
        if Int# -> Bool
isTrue# ((Int#
ch2 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch2 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
2# else
        (# Int# -> Char#
chr# (((Int#
ch0 Int# -> Int# -> Int#
-# Int#
0xE0#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
12#) Int# -> Int# -> Int#
+#
                 ((Int#
ch1 Int# -> Int# -> Int#
-# Int#
0x80#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
6#)  Int# -> Int# -> Int#
+#
                  (Int#
ch2 Int# -> Int# -> Int#
-# Int#
0x80#)),
           Int#
3# #)

     | Int# -> Bool
isTrue# ((Int#
ch0 Int# -> Int# -> Int#
>=# Int#
0xF0#) Int# -> Int# -> Int#
`andI#` (Int#
ch0 Int# -> Int# -> Int#
<=# Int#
0xF8#)) ->
        let !ch1 :: Int#
ch1 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
1#) in
        if Int# -> Bool
isTrue# ((Int#
ch1 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch1 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
1# else
        let !ch2 :: Int#
ch2 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
2#) in
        if Int# -> Bool
isTrue# ((Int#
ch2 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch2 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
2# else
        let !ch3 :: Int#
ch3 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
3#) in
        if Int# -> Bool
isTrue# ((Int#
ch3 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch3 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
3# else
        (# Int# -> Char#
chr# (((Int#
ch0 Int# -> Int# -> Int#
-# Int#
0xF0#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
18#) Int# -> Int# -> Int#
+#
                 ((Int#
ch1 Int# -> Int# -> Int#
-# Int#
0x80#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
12#) Int# -> Int# -> Int#
+#
                 ((Int#
ch2 Int# -> Int# -> Int#
-# Int#
0x80#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
6#)  Int# -> Int# -> Int#
+#
                  (Int#
ch3 Int# -> Int# -> Int#
-# Int#
0x80#)),
           Int#
4# #)

      | Bool
otherwise -> Int# -> (# Char#, Int# #)
fail Int#
1#
  where
        -- all invalid sequences end up here:
        fail :: Int# -> (# Char#, Int# #)
        fail :: Int# -> (# Char#, Int# #)
fail Int#
nBytes# = (# Char#
'\0'#, Int#
nBytes# #)
        -- '\xFFFD' would be the usual replacement character, but
        -- that's a valid symbol in Haskell, so will result in a
        -- confusing parse error later on.  Instead we use '\0' which
        -- will signal a lexer error immediately.

-- | Decode a single character at the given 'Addr#'.
utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# Addr#
a# Int#
off# =
#if !MIN_VERSION_base(4,16,0)
    utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
#else
    (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# (\Int#
i# -> Word8# -> Word#
word8ToWord# (Addr# -> Int# -> Word8#
indexWord8OffAddr# Addr#
a# (Int#
i# Int# -> Int# -> Int#
+# Int#
off#)))
#endif

-- | Decode a single codepoint starting at the given 'Ptr'.
utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int)
utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int)
utf8DecodeCharPtr !(Ptr Addr#
a#) =
  case Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# Addr#
a# Int#
0# of
    (# Char#
c#, Int#
nBytes# #) -> ( Char# -> Char
C# Char#
c#, Int# -> Int
I# Int#
nBytes# )

-- | Decode a single codepoint starting at the given byte offset into a
-- 'ByteArray#'.
utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ByteArray#
ba# Int#
off# =
#if !MIN_VERSION_base(4,16,0)
    utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
#else
    (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# (\Int#
i# -> Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
ba# (Int#
i# Int# -> Int# -> Int#
+# Int#
off#)))
#endif

{-# INLINE utf8Decode# #-}
utf8Decode# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# :: IO () -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# IO ()
retain Int# -> (# Char#, Int# #)
decodeChar# Int#
len#
  = Int# -> IO [Char]
unpack Int#
0#
  where
    unpack :: Int# -> IO [Char]
unpack Int#
i#
        | Int# -> Bool
isTrue# (Int#
i# Int# -> Int# -> Int#
>=# Int#
len#) = IO ()
retain forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return []
        | Bool
otherwise =
            case Int# -> (# Char#, Int# #)
decodeChar# Int#
i# of
              (# Char#
c#, Int#
nBytes# #) -> do
                [Char]
rest <- forall a. IO a -> IO a
unsafeDupableInterleaveIO forall a b. (a -> b) -> a -> b
$ Int# -> IO [Char]
unpack (Int#
i# Int# -> Int# -> Int#
+# Int#
nBytes#)
                forall (m :: * -> *) a. Monad m => a -> m a
return (Char# -> Char
C# Char#
c# forall a. a -> [a] -> [a]
: [Char]
rest)

utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeForeignPtr ForeignPtr Word8
fp Int
offset (I# Int#
len#)
  = forall a. IO a -> a
unsafeDupablePerformIO forall a b. (a -> b) -> a -> b
$ do
      let !(Ptr Addr#
a#) = forall a. ForeignPtr a -> Ptr a
unsafeForeignPtrToPtr ForeignPtr Word8
fp forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
offset
      IO () -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# (forall a. ForeignPtr a -> IO ()
touchForeignPtr ForeignPtr Word8
fp) (Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# Addr#
a#) Int#
len#
-- Note that since utf8Decode# returns a thunk the lifetime of the
-- ForeignPtr actually needs to be longer than the lexical lifetime
-- withForeignPtr would provide here. That's why we use touchForeignPtr to
-- keep the fp alive until the last character has actually been decoded.

utf8DecodeByteArray# :: ByteArray# -> [Char]
utf8DecodeByteArray# :: ByteArray# -> [Char]
utf8DecodeByteArray# ByteArray#
ba#
  = forall a. IO a -> a
unsafeDupablePerformIO forall a b. (a -> b) -> a -> b
$
      let len# :: Int#
len# = ByteArray# -> Int#
sizeofByteArray# ByteArray#
ba# in
      IO () -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# (forall (m :: * -> *) a. Monad m => a -> m a
return ()) (ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ByteArray#
ba#) Int#
len#

utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering
utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering
utf8CompareByteArray# ByteArray#
a1 ByteArray#
a2 = Int# -> Int# -> Ordering
go Int#
0# Int#
0#
   -- UTF-8 has the property that sorting by bytes values also sorts by
   -- code-points.
   -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
   -- doesn't hold and we must explicitly check this case here.
   -- Note that decoding every code point would also work but it would be much
   -- more costly.
   where
       !sz1 :: Int#
sz1 = ByteArray# -> Int#
sizeofByteArray# ByteArray#
a1
       !sz2 :: Int#
sz2 = ByteArray# -> Int#
sizeofByteArray# ByteArray#
a2
       go :: Int# -> Int# -> Ordering
go Int#
off1 Int#
off2
         | Int# -> Bool
isTrue# ((Int#
off1 Int# -> Int# -> Int#
>=# Int#
sz1) Int# -> Int# -> Int#
`andI#` (Int#
off2 Int# -> Int# -> Int#
>=# Int#
sz2)) = Ordering
EQ
         | Int# -> Bool
isTrue# (Int#
off1 Int# -> Int# -> Int#
>=# Int#
sz1)                          = Ordering
LT
         | Int# -> Bool
isTrue# (Int#
off2 Int# -> Int# -> Int#
>=# Int#
sz2)                          = Ordering
GT
         | Bool
otherwise =
#if !MIN_VERSION_base(4,16,0)
               let !b1_1 = indexWord8Array# a1 off1
                   !b2_1 = indexWord8Array# a2 off2
#else
               let !b1_1 :: Word#
b1_1 = Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
a1 Int#
off1)
                   !b2_1 :: Word#
b2_1 = Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
a2 Int#
off2)
#endif
               in case Word#
b1_1 of
                  Word#
0xC0## -> case Word#
b2_1 of
                     Word#
0xC0## -> Int# -> Int# -> Ordering
go (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#) (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)
#if !MIN_VERSION_base(4,16,0)
                     _      -> case indexWord8Array# a1 (off1 +# 1#) of
#else
                     Word#
_      -> case Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
a1 (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#)) of
#endif
                        Word#
0x80## -> Ordering
LT
                        Word#
_      -> Int# -> Int# -> Ordering
go (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#) (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)
                  Word#
_      -> case Word#
b2_1 of
#if !MIN_VERSION_base(4,16,0)
                     0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
#else
                     Word#
0xC0## -> case Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
a2 (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)) of
#endif
                        Word#
0x80## -> Ordering
GT
                        Word#
_      -> Int# -> Int# -> Ordering
go (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#) (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)
                     Word#
_   | Int# -> Bool
isTrue# (Word#
b1_1 Word# -> Word# -> Int#
`gtWord#` Word#
b2_1) -> Ordering
GT
                         | Int# -> Bool
isTrue# (Word#
b1_1 Word# -> Word# -> Int#
`ltWord#` Word#
b2_1) -> Ordering
LT
                         | Bool
otherwise                     -> Int# -> Int# -> Ordering
go (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#) (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)

utf8CountCharsByteArray# :: ByteArray# -> Int
utf8CountCharsByteArray# :: ByteArray# -> Int
utf8CountCharsByteArray# ByteArray#
ba = Int# -> Int# -> Int
go Int#
0# Int#
0#
  where
    len# :: Int#
len# = ByteArray# -> Int#
sizeofByteArray# ByteArray#
ba
    go :: Int# -> Int# -> Int
go Int#
i# Int#
n#
      | Int# -> Bool
isTrue# (Int#
i# Int# -> Int# -> Int#
>=# Int#
len#) = Int# -> Int
I# Int#
n#
      | Bool
otherwise =
          case ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ByteArray#
ba Int#
i# of
            (# Char#
_, Int#
nBytes# #) -> Int# -> Int# -> Int
go (Int#
i# Int# -> Int# -> Int#
+# Int#
nBytes#) (Int#
n# Int# -> Int# -> Int#
+# Int#
1#)

{-# INLINE utf8EncodeChar #-}
utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
               -> Char -> ST s Int
utf8EncodeChar :: forall s.
(Int# -> Word8# -> State# s -> State# s) -> Char -> ST s Int
utf8EncodeChar Int# -> Word8# -> State# s -> State# s
write# Char
c =
  let x :: Word
x = forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c) in
  case () of
    ()
_ | Word
x forall a. Ord a => a -> a -> Bool
> Word
0 Bool -> Bool -> Bool
&& Word
x forall a. Ord a => a -> a -> Bool
<= Word
0x007f -> do
          Int -> Word -> ST s ()
write Int
0 Word
x
          forall (m :: * -> *) a. Monad m => a -> m a
return Int
1
        -- NB. '\0' is encoded as '\xC0\x80', not '\0'.  This is so that we
        -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
      | Word
x forall a. Ord a => a -> a -> Bool
<= Word
0x07ff -> do
          Int -> Word -> ST s ()
write Int
0 (Word
0xC0 forall a. Bits a => a -> a -> a
.|. ((Word
x forall a. Bits a => a -> Int -> a
`shiftR` Int
6) forall a. Bits a => a -> a -> a
.&. Word
0x1F))
          Int -> Word -> ST s ()
write Int
1 (Word
0x80 forall a. Bits a => a -> a -> a
.|. (Word
x forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          forall (m :: * -> *) a. Monad m => a -> m a
return Int
2
      | Word
x forall a. Ord a => a -> a -> Bool
<= Word
0xffff -> do
          Int -> Word -> ST s ()
write Int
0 (Word
0xE0 forall a. Bits a => a -> a -> a
.|. (Word
x forall a. Bits a => a -> Int -> a
`shiftR` Int
12) forall a. Bits a => a -> a -> a
.&. Word
0x0F)
          Int -> Word -> ST s ()
write Int
1 (Word
0x80 forall a. Bits a => a -> a -> a
.|. (Word
x forall a. Bits a => a -> Int -> a
`shiftR` Int
6) forall a. Bits a => a -> a -> a
.&. Word
0x3F)
          Int -> Word -> ST s ()
write Int
2 (Word
0x80 forall a. Bits a => a -> a -> a
.|. (Word
x forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          forall (m :: * -> *) a. Monad m => a -> m a
return Int
3
      | Bool
otherwise -> do
          Int -> Word -> ST s ()
write Int
0 (Word
0xF0 forall a. Bits a => a -> a -> a
.|. (Word
x forall a. Bits a => a -> Int -> a
`shiftR` Int
18))
          Int -> Word -> ST s ()
write Int
1 (Word
0x80 forall a. Bits a => a -> a -> a
.|. ((Word
x forall a. Bits a => a -> Int -> a
`shiftR` Int
12) forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          Int -> Word -> ST s ()
write Int
2 (Word
0x80 forall a. Bits a => a -> a -> a
.|. ((Word
x forall a. Bits a => a -> Int -> a
`shiftR` Int
6) forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          Int -> Word -> ST s ()
write Int
3 (Word
0x80 forall a. Bits a => a -> a -> a
.|. (Word
x forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          forall (m :: * -> *) a. Monad m => a -> m a
return Int
4
  where
    {-# INLINE write #-}
    write :: Int -> Word -> ST s ()
write (I# Int#
off#) (W# Word#
c#) = forall s a. STRep s a -> ST s a
ST forall a b. (a -> b) -> a -> b
$ \State# s
s ->
#if !MIN_VERSION_base(4,16,0)
      case write# off# (narrowWord8# c#) s of
#else
      case Int# -> Word8# -> State# s -> State# s
write# Int#
off# (Word# -> Word8#
wordToWord8# Word#
c#) State# s
s of
#endif
        State# s
s -> (# State# s
s, () #)

utf8EncodePtr :: Ptr Word8 -> String -> IO ()
utf8EncodePtr :: Ptr Word8 -> [Char] -> IO ()
utf8EncodePtr (Ptr Addr#
a#) [Char]
str = Addr# -> [Char] -> IO ()
go Addr#
a# [Char]
str
  where go :: Addr# -> [Char] -> IO ()
go !Addr#
_   []   = forall (m :: * -> *) a. Monad m => a -> m a
return ()
        go Addr#
a# (Char
c:[Char]
cs) = do
#if !MIN_VERSION_base(4,16,0)
          -- writeWord8OffAddr# was taking a Word#
          I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
#else
          I# Int#
off# <- forall a. ST RealWorld a -> IO a
stToIO forall a b. (a -> b) -> a -> b
$ forall s.
(Int# -> Word8# -> State# s -> State# s) -> Char -> ST s Int
utf8EncodeChar (forall d. Addr# -> Int# -> Word8# -> State# d -> State# d
writeWord8OffAddr# Addr#
a#) Char
c
#endif
          Addr# -> [Char] -> IO ()
go (Addr#
a# Addr# -> Int# -> Addr#
`plusAddr#` Int#
off#) [Char]
cs

utf8EncodeByteArray# :: String -> ByteArray#
utf8EncodeByteArray# :: [Char] -> ByteArray#
utf8EncodeByteArray# [Char]
str = forall o. (State# RealWorld -> o) -> o
runRW# forall a b. (a -> b) -> a -> b
$ \State# RealWorld
s ->
  case [Char] -> Int
utf8EncodedLength [Char]
str         of { I# Int#
len# ->
  case forall d. Int# -> State# d -> (# State# d, MutableByteArray# d #)
newByteArray# Int#
len# State# RealWorld
s          of { (# State# RealWorld
s, MutableByteArray# RealWorld
mba# #) ->
  case forall {s}. MutableByteArray# s -> Int# -> [Char] -> ST s ()
go MutableByteArray# RealWorld
mba# Int#
0# [Char]
str                of { ST STRep RealWorld ()
f_go ->
  case STRep RealWorld ()
f_go State# RealWorld
s                        of { (# State# RealWorld
s, () #) ->
  case forall d.
MutableByteArray# d -> State# d -> (# State# d, ByteArray# #)
unsafeFreezeByteArray# MutableByteArray# RealWorld
mba# State# RealWorld
s of { (# State# RealWorld
_, ByteArray#
ba# #) ->
  ByteArray#
ba# }}}}}
  where
    go :: MutableByteArray# s -> Int# -> [Char] -> ST s ()
go MutableByteArray# s
_ Int#
_ [] = forall (m :: * -> *) a. Monad m => a -> m a
return ()
    go MutableByteArray# s
mba# Int#
i# (Char
c:[Char]
cs) = do
#if !MIN_VERSION_base(4,16,0)
      -- writeWord8Array# was taking a Word#
      I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
#else
      I# Int#
off# <- forall s.
(Int# -> Word8# -> State# s -> State# s) -> Char -> ST s Int
utf8EncodeChar (\Int#
j# -> forall d.
MutableByteArray# d -> Int# -> Word8# -> State# d -> State# d
writeWord8Array# MutableByteArray# s
mba# (Int#
i# Int# -> Int# -> Int#
+# Int#
j#)) Char
c
#endif
      MutableByteArray# s -> Int# -> [Char] -> ST s ()
go MutableByteArray# s
mba# (Int#
i# Int# -> Int# -> Int#
+# Int#
off#) [Char]
cs

utf8EncodedLength :: String -> Int
utf8EncodedLength :: [Char] -> Int
utf8EncodedLength [Char]
str = forall {t}. Num t => t -> [Char] -> t
go Int
0 [Char]
str
  where go :: t -> [Char] -> t
go !t
n [] = t
n
        go t
n (Char
c:[Char]
cs)
          | Char -> Int
ord Char
c forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&& Char -> Int
ord Char
c forall a. Ord a => a -> a -> Bool
<= Int
0x007f = t -> [Char] -> t
go (t
nforall a. Num a => a -> a -> a
+t
1) [Char]
cs
          | Char -> Int
ord Char
c forall a. Ord a => a -> a -> Bool
<= Int
0x07ff = t -> [Char] -> t
go (t
nforall a. Num a => a -> a -> a
+t
2) [Char]
cs
          | Char -> Int
ord Char
c forall a. Ord a => a -> a -> Bool
<= Int
0xffff = t -> [Char] -> t
go (t
nforall a. Num a => a -> a -> a
+t
3) [Char]
cs
          | Bool
otherwise       = t -> [Char] -> t
go (t
nforall a. Num a => a -> a -> a
+t
4) [Char]
cs

#endif /* MIN_VERSION_base(4,18,0) */