-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2022 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE BinaryLiterals #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE UnboxedTuples #-}

-- | This module provides functions that allow treating 'Text' values as series of UTF-8 code units
-- instead of characters. Any calls to 'Text' in @alfred-margaret@ go through this module.
-- Therefore we re-export some 'Text' functions, e.g. 'Text.concat'.
module Data.Text.Utf8
    ( CodePoint
    , CodeUnit
    , CodeUnitIndex (..)
    , Text (..)
    , fromByteList
    , isCaseInvariant
    , lengthUtf8
    , lowerCodePoint
    , unlowerCodePoint
    , lowerUtf8
    , toLowerAscii
    , unicode2utf8
    , unpackUtf8
      -- * Decoding
      --
      -- $decoding
    , decode2
    , decode3
    , decode4
    , decodeUtf8
      -- * Indexing
      --
      -- $indexing
    , indexCodeUnit
    , unsafeIndexCodePoint
    , unsafeIndexCodeUnit
    , skipCodePointsBackwards
      -- * Slicing Functions
      --
      -- $slicingFunctions
    , unsafeCutUtf8
    , unsafeSliceUtf8
      -- * Functions on Arrays
      --
      -- $functionsOnArrays
    , arrayContents
    , isArrayPinned
    , unsafeIndexCodePoint'
    , unsafeIndexCodeUnit'
    , BackwardsIter (..)
    , unsafeIndexEndOfCodePoint'
    , unsafeIndexAnywhereInCodePoint'

      -- * General Functions
      --
      -- $generalFunctions
    , Text.concat
    , Text.dropWhile
    , Text.isInfixOf
    , Text.null
    , Text.pack
    , Text.replicate
    , Text.unpack
    , TextSearch.indices
    ) where

import Control.DeepSeq (NFData)
import Data.Bits (Bits (shiftL), shiftR, (.&.), (.|.))
import Data.Hashable (Hashable)
import Data.Text.Internal (Text (..))
import Data.Word (Word8)
import GHC.Generics (Generic)
import Data.Primitive (ByteArray (ByteArray), Prim, byteArrayFromList)
#if defined(HAS_AESON)
import Data.Aeson (FromJSON, ToJSON)
#endif
import Data.Text.Utf8.Unlower (unlowerCodePoint)

import qualified Data.Char as Char
import qualified Data.Text as Text
import qualified Data.Text.Array as TextArray
import qualified Data.Text.Internal.Search as TextSearch
import qualified Data.Text.Unsafe as TextUnsafe
import qualified GHC.Exts as Exts

-- | A UTF-8 code unit is a byte. A Unicode code point can be encoded as up to four code units.
type CodeUnit = Word8

-- | A Unicode code point.
type CodePoint = Char

-- | An index into the raw UTF-8 data of a `Text`. This is not the code point
-- index as conventionally accepted by `Text`, so we wrap it to avoid confusing
-- the two. Incorrect index manipulation can lead to surrogate pairs being
-- sliced, so manipulate indices with care. This type is also used for lengths.
newtype CodeUnitIndex = CodeUnitIndex
    { CodeUnitIndex -> Int
codeUnitIndex :: Int
    }
    deriving stock (CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c/= :: CodeUnitIndex -> CodeUnitIndex -> Bool
== :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c== :: CodeUnitIndex -> CodeUnitIndex -> Bool
Eq, Eq CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex -> Bool
CodeUnitIndex -> CodeUnitIndex -> Ordering
CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$cmin :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
max :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$cmax :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
>= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c>= :: CodeUnitIndex -> CodeUnitIndex -> Bool
> :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c> :: CodeUnitIndex -> CodeUnitIndex -> Bool
<= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c<= :: CodeUnitIndex -> CodeUnitIndex -> Bool
< :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c< :: CodeUnitIndex -> CodeUnitIndex -> Bool
compare :: CodeUnitIndex -> CodeUnitIndex -> Ordering
$ccompare :: CodeUnitIndex -> CodeUnitIndex -> Ordering
Ord, forall x. Rep CodeUnitIndex x -> CodeUnitIndex
forall x. CodeUnitIndex -> Rep CodeUnitIndex x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep CodeUnitIndex x -> CodeUnitIndex
$cfrom :: forall x. CodeUnitIndex -> Rep CodeUnitIndex x
Generic, CodeUnitIndex
forall a. a -> a -> Bounded a
maxBound :: CodeUnitIndex
$cmaxBound :: CodeUnitIndex
minBound :: CodeUnitIndex
$cminBound :: CodeUnitIndex
Bounded)
#if defined(HAS_AESON)
    deriving newtype (Int -> CodeUnitIndex -> ShowS
[CodeUnitIndex] -> ShowS
CodeUnitIndex -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [CodeUnitIndex] -> ShowS
$cshowList :: [CodeUnitIndex] -> ShowS
show :: CodeUnitIndex -> [Char]
$cshow :: CodeUnitIndex -> [Char]
showsPrec :: Int -> CodeUnitIndex -> ShowS
$cshowsPrec :: Int -> CodeUnitIndex -> ShowS
Show, Addr# -> Int# -> CodeUnitIndex
ByteArray# -> Int# -> CodeUnitIndex
CodeUnitIndex -> Int#
forall s.
Addr# -> Int# -> Int# -> CodeUnitIndex -> State# s -> State# s
forall s.
Addr# -> Int# -> State# s -> (# State# s, CodeUnitIndex #)
forall s. Addr# -> Int# -> CodeUnitIndex -> State# s -> State# s
forall s.
MutableByteArray# s
-> Int# -> Int# -> CodeUnitIndex -> State# s -> State# s
forall s.
MutableByteArray# s
-> Int# -> State# s -> (# State# s, CodeUnitIndex #)
forall s.
MutableByteArray# s
-> Int# -> CodeUnitIndex -> State# s -> State# s
forall a.
(a -> Int#)
-> (a -> Int#)
-> (ByteArray# -> Int# -> a)
-> (forall s.
    MutableByteArray# s -> Int# -> State# s -> (# State# s, a #))
-> (forall s.
    MutableByteArray# s -> Int# -> a -> State# s -> State# s)
-> (forall s.
    MutableByteArray# s -> Int# -> Int# -> a -> State# s -> State# s)
-> (Addr# -> Int# -> a)
-> (forall s. Addr# -> Int# -> State# s -> (# State# s, a #))
-> (forall s. Addr# -> Int# -> a -> State# s -> State# s)
-> (forall s. Addr# -> Int# -> Int# -> a -> State# s -> State# s)
-> Prim a
setOffAddr# :: forall s.
Addr# -> Int# -> Int# -> CodeUnitIndex -> State# s -> State# s
$csetOffAddr# :: forall s.
Addr# -> Int# -> Int# -> CodeUnitIndex -> State# s -> State# s
writeOffAddr# :: forall s. Addr# -> Int# -> CodeUnitIndex -> State# s -> State# s
$cwriteOffAddr# :: forall s. Addr# -> Int# -> CodeUnitIndex -> State# s -> State# s
readOffAddr# :: forall s.
Addr# -> Int# -> State# s -> (# State# s, CodeUnitIndex #)
$creadOffAddr# :: forall s.
Addr# -> Int# -> State# s -> (# State# s, CodeUnitIndex #)
indexOffAddr# :: Addr# -> Int# -> CodeUnitIndex
$cindexOffAddr# :: Addr# -> Int# -> CodeUnitIndex
setByteArray# :: forall s.
MutableByteArray# s
-> Int# -> Int# -> CodeUnitIndex -> State# s -> State# s
$csetByteArray# :: forall s.
MutableByteArray# s
-> Int# -> Int# -> CodeUnitIndex -> State# s -> State# s
writeByteArray# :: forall s.
MutableByteArray# s
-> Int# -> CodeUnitIndex -> State# s -> State# s
$cwriteByteArray# :: forall s.
MutableByteArray# s
-> Int# -> CodeUnitIndex -> State# s -> State# s
readByteArray# :: forall s.
MutableByteArray# s
-> Int# -> State# s -> (# State# s, CodeUnitIndex #)
$creadByteArray# :: forall s.
MutableByteArray# s
-> Int# -> State# s -> (# State# s, CodeUnitIndex #)
indexByteArray# :: ByteArray# -> Int# -> CodeUnitIndex
$cindexByteArray# :: ByteArray# -> Int# -> CodeUnitIndex
alignment# :: CodeUnitIndex -> Int#
$calignment# :: CodeUnitIndex -> Int#
sizeOf# :: CodeUnitIndex -> Int#
$csizeOf# :: CodeUnitIndex -> Int#
Prim, Eq CodeUnitIndex
Int -> CodeUnitIndex -> Int
CodeUnitIndex -> Int
forall a. Eq a -> (Int -> a -> Int) -> (a -> Int) -> Hashable a
hash :: CodeUnitIndex -> Int
$chash :: CodeUnitIndex -> Int
hashWithSalt :: Int -> CodeUnitIndex -> Int
$chashWithSalt :: Int -> CodeUnitIndex -> Int
Hashable, Integer -> CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a.
(a -> a -> a)
-> (a -> a -> a)
-> (a -> a -> a)
-> (a -> a)
-> (a -> a)
-> (a -> a)
-> (Integer -> a)
-> Num a
fromInteger :: Integer -> CodeUnitIndex
$cfromInteger :: Integer -> CodeUnitIndex
signum :: CodeUnitIndex -> CodeUnitIndex
$csignum :: CodeUnitIndex -> CodeUnitIndex
abs :: CodeUnitIndex -> CodeUnitIndex
$cabs :: CodeUnitIndex -> CodeUnitIndex
negate :: CodeUnitIndex -> CodeUnitIndex
$cnegate :: CodeUnitIndex -> CodeUnitIndex
* :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c* :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
- :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c- :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
+ :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c+ :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
Num, CodeUnitIndex -> ()
forall a. (a -> ()) -> NFData a
rnf :: CodeUnitIndex -> ()
$crnf :: CodeUnitIndex -> ()
NFData, Value -> Parser [CodeUnitIndex]
Value -> Parser CodeUnitIndex
forall a.
(Value -> Parser a) -> (Value -> Parser [a]) -> FromJSON a
parseJSONList :: Value -> Parser [CodeUnitIndex]
$cparseJSONList :: Value -> Parser [CodeUnitIndex]
parseJSON :: Value -> Parser CodeUnitIndex
$cparseJSON :: Value -> Parser CodeUnitIndex
FromJSON, [CodeUnitIndex] -> Encoding
[CodeUnitIndex] -> Value
CodeUnitIndex -> Encoding
CodeUnitIndex -> Value
forall a.
(a -> Value)
-> (a -> Encoding)
-> ([a] -> Value)
-> ([a] -> Encoding)
-> ToJSON a
toEncodingList :: [CodeUnitIndex] -> Encoding
$ctoEncodingList :: [CodeUnitIndex] -> Encoding
toJSONList :: [CodeUnitIndex] -> Value
$ctoJSONList :: [CodeUnitIndex] -> Value
toEncoding :: CodeUnitIndex -> Encoding
$ctoEncoding :: CodeUnitIndex -> Encoding
toJSON :: CodeUnitIndex -> Value
$ctoJSON :: CodeUnitIndex -> Value
ToJSON)
#else
    deriving newtype (Show, Prim, Hashable, Num, NFData)
#endif

{-# INLINABLE unpackUtf8 #-}
unpackUtf8 :: Text -> [CodeUnit]
unpackUtf8 :: Text -> [CodeUnit]
unpackUtf8 (Text Array
u8data Int
offset Int
len) =
  let
    go :: Int -> t -> [CodeUnit]
go Int
_ t
0 = []
    go Int
i t
n = Array -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' Array
u8data (Int -> CodeUnitIndex
CodeUnitIndex Int
i) forall a. a -> [a] -> [a]
: Int -> t -> [CodeUnit]
go (Int
i forall a. Num a => a -> a -> a
+ Int
1) (t
n forall a. Num a => a -> a -> a
- t
1)
  in
    forall {t}. (Eq t, Num t) => Int -> t -> [CodeUnit]
go Int
offset Int
len

-- | The return value of this function is not really an index.
-- However the signature is supposed to make it clear that the length is returned in terms of code units, not code points.
lengthUtf8 :: Text -> CodeUnitIndex
lengthUtf8 :: Text -> CodeUnitIndex
lengthUtf8 (Text Array
_ Int
_ !Int
len) = Int -> CodeUnitIndex
CodeUnitIndex Int
len

-- | Lower-case the ASCII code points A-Z and leave the rest of ASCII intact.
{-# INLINE toLowerAscii #-}
toLowerAscii :: Char -> Char
toLowerAscii :: Char -> Char
toLowerAscii Char
cp
  | Char -> Bool
Char.isAsciiUpper Char
cp = Int -> Char
Char.chr (Char -> Int
Char.ord Char
cp forall a. Num a => a -> a -> a
+ Int
0x20)
  | Bool
otherwise = Char
cp

-- | Lowercase a 'Text' by applying 'lowerCodePoint' to each 'Char'.
{-# INLINE lowerUtf8 #-}
lowerUtf8 :: Text -> Text
lowerUtf8 :: Text -> Text
lowerUtf8 = (Char -> Char) -> Text -> Text
Text.map Char -> Char
lowerCodePoint

asciiCount :: Int
asciiCount :: Int
asciiCount = Int
128

{-# INLINE lowerCodePoint #-}
-- | Lower-Case a UTF-8 codepoint.
-- Uses 'toLowerAscii' for ASCII and 'Char.toLower' otherwise.
lowerCodePoint :: Char -> Char
lowerCodePoint :: Char -> Char
lowerCodePoint Char
cp
  | Char -> Int
Char.ord Char
cp forall a. Ord a => a -> a -> Bool
< Int
asciiCount = Char -> Char
toLowerAscii Char
cp
  | Bool
otherwise = Char -> Char
Char.toLower Char
cp

-- | Convert a Unicode Code Point 'c' into a list of UTF-8 code units (bytes).
unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a]
{-# INLINE unicode2utf8 #-}
unicode2utf8 :: forall a. (Ord a, Num a, Bits a) => a -> [a]
unicode2utf8 a
c
    | a
c forall a. Ord a => a -> a -> Bool
< a
0x80    = [a
c]
    | a
c forall a. Ord a => a -> a -> Bool
< a
0x800   = [a
0xc0 forall a. Bits a => a -> a -> a
.|. (a
c forall a. Bits a => a -> Int -> a
`shiftR` Int
6), a
0x80 forall a. Bits a => a -> a -> a
.|. (a
0x3f forall a. Bits a => a -> a -> a
.&. a
c)]
    | a
c forall a. Ord a => a -> a -> Bool
< a
0x10000 = [a
0xe0 forall a. Bits a => a -> a -> a
.|. (a
c forall a. Bits a => a -> Int -> a
`shiftR` Int
12), a
0x80 forall a. Bits a => a -> a -> a
.|. (a
0x3f forall a. Bits a => a -> a -> a
.&. (a
c forall a. Bits a => a -> Int -> a
`shiftR` Int
6)), a
0x80 forall a. Bits a => a -> a -> a
.|. (a
0x3f forall a. Bits a => a -> a -> a
.&. a
c)]
    | Bool
otherwise   = [a
0xf0 forall a. Bits a => a -> a -> a
.|. (a
c forall a. Bits a => a -> Int -> a
`shiftR` Int
18), a
0x80 forall a. Bits a => a -> a -> a
.|. (a
0x3f forall a. Bits a => a -> a -> a
.&. (a
c forall a. Bits a => a -> Int -> a
`shiftR` Int
12)), a
0x80 forall a. Bits a => a -> a -> a
.|. (a
0x3f forall a. Bits a => a -> a -> a
.&. (a
c forall a. Bits a => a -> Int -> a
`shiftR` Int
6)), a
0x80 forall a. Bits a => a -> a -> a
.|. (a
0x3f forall a. Bits a => a -> a -> a
.&. a
c)]

fromByteList :: [Word8] -> Text
fromByteList :: [CodeUnit] -> Text
fromByteList [CodeUnit]
byteList = Array -> Int -> Int -> Text
Text (ByteArray# -> Array
TextArray.ByteArray ByteArray#
ba#) Int
0 (forall (t :: * -> *) a. Foldable t => t a -> Int
length [CodeUnit]
byteList)
  where !(ByteArray ByteArray#
ba#) = forall a. Prim a => [a] -> ByteArray
byteArrayFromList [CodeUnit]
byteList

-- | Return whether text has exactly one case variation, such that this function
-- will not return true when Aho–Corasick would differentiate when doing
-- case-insensitive matching.
{-# INLINE isCaseInvariant #-}
isCaseInvariant :: Text -> Bool
isCaseInvariant :: Text -> Bool
isCaseInvariant = (Char -> Bool) -> Text -> Bool
Text.all (\Char
c -> Char -> [Char]
unlowerCodePoint (Char -> Char
lowerCodePoint Char
c) forall a. Eq a => a -> a -> Bool
== [Char
c])

-- $decoding
--
-- Functions that turns code unit sequences into code point sequences.

-- | Decode a single UTF-8 code unit into its code point.
-- The given code unit should have the following format:
--
-- > ┌───────────────┐
-- > │0 x x x x x x x│
-- > └───────────────┘
decode1 :: CodeUnit -> CodePoint
decode1 :: CodeUnit -> Char
decode1 CodeUnit
cu0 =
  Int -> Char
Char.chr forall a b. (a -> b) -> a -> b
$ forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0

-- | Decode 2 UTF-8 code units into their code point.
-- The given code units should have the following format:
--
-- > ┌───────────────┬───────────────┐
-- > │1 1 0 x x x x x│1 0 x x x x x x│
-- > └───────────────┴───────────────┘
{-# INLINE decode2 #-}
decode2 :: CodeUnit -> CodeUnit -> CodePoint
decode2 :: CodeUnit -> CodeUnit -> Char
decode2 CodeUnit
cu0 CodeUnit
cu1 =
  Int -> Char
Char.chr forall a b. (a -> b) -> a -> b
$ (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0 forall a. Bits a => a -> a -> a
.&. Int
0x1f) forall a. Bits a => a -> Int -> a
`shiftL` Int
6 forall a. Bits a => a -> a -> a
.|. forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu1 forall a. Bits a => a -> a -> a
.&. Int
0x3f

-- | Decode 3 UTF-8 code units into their code point.
-- The given code units should have the following format:
--
-- > ┌───────────────┬───────────────┬───────────────┐
-- > │1 1 1 0 x x x x│1 0 x x x x x x│1 0 x x x x x x│
-- > └───────────────┴───────────────┴───────────────┘
{-# INLINE decode3 #-}
decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> Char
decode3 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 =
  Int -> Char
Char.chr forall a b. (a -> b) -> a -> b
$ (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0 forall a. Bits a => a -> a -> a
.&. Int
0xf) forall a. Bits a => a -> Int -> a
`shiftL` Int
12 forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu1 forall a. Bits a => a -> a -> a
.&. Int
0x3f) forall a. Bits a => a -> Int -> a
`shiftL` Int
6 forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu2 forall a. Bits a => a -> a -> a
.&. Int
0x3f)

-- | Decode 4 UTF-8 code units into their code point.
-- The given code units should have the following format:
--
-- > ┌───────────────┬───────────────┬───────────────┬───────────────┐
-- > │1 1 1 1 0 x x x│1 0 x x x x x x│1 0 x x x x x x│1 0 x x x x x x│
-- > └───────────────┴───────────────┴───────────────┴───────────────┘
{-# INLINE decode4 #-}
decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> Char
decode4 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 CodeUnit
cu3 =
  Int -> Char
Char.chr forall a b. (a -> b) -> a -> b
$ (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0 forall a. Bits a => a -> a -> a
.&. Int
0x7) forall a. Bits a => a -> Int -> a
`shiftL` Int
18 forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu1 forall a. Bits a => a -> a -> a
.&. Int
0x3f) forall a. Bits a => a -> Int -> a
`shiftL` Int
12 forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu2 forall a. Bits a => a -> a -> a
.&. Int
0x3f) forall a. Bits a => a -> Int -> a
`shiftL` Int
6 forall a. Bits a => a -> a -> a
.|. (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu3 forall a. Bits a => a -> a -> a
.&. Int
0x3f)

-- | Decode a list of UTF-8 code units into a list of code points.
decodeUtf8 :: [CodeUnit] -> [CodePoint]
decodeUtf8 :: [CodeUnit] -> [Char]
decodeUtf8 [] = []
decodeUtf8 (CodeUnit
cu0 : [CodeUnit]
cus) | CodeUnit
cu0 forall a. Ord a => a -> a -> Bool
< CodeUnit
0xc0 = Int -> Char
Char.chr (forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0) forall a. a -> [a] -> [a]
: [CodeUnit] -> [Char]
decodeUtf8 [CodeUnit]
cus
decodeUtf8 (CodeUnit
cu0 : CodeUnit
cu1 : [CodeUnit]
cus) | CodeUnit
cu0 forall a. Ord a => a -> a -> Bool
< CodeUnit
0xe0 = CodeUnit -> CodeUnit -> Char
decode2 CodeUnit
cu0 CodeUnit
cu1 forall a. a -> [a] -> [a]
: [CodeUnit] -> [Char]
decodeUtf8 [CodeUnit]
cus
decodeUtf8 (CodeUnit
cu0 : CodeUnit
cu1 : CodeUnit
cu2 : [CodeUnit]
cus) | CodeUnit
cu0 forall a. Ord a => a -> a -> Bool
< CodeUnit
0xf0 = CodeUnit -> CodeUnit -> CodeUnit -> Char
decode3 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 forall a. a -> [a] -> [a]
: [CodeUnit] -> [Char]
decodeUtf8 [CodeUnit]
cus
decodeUtf8 (CodeUnit
cu0 : CodeUnit
cu1 : CodeUnit
cu2 : CodeUnit
cu3 : [CodeUnit]
cus) | CodeUnit
cu0 forall a. Ord a => a -> a -> Bool
< CodeUnit
0xf8 = CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> Char
decode4 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 CodeUnit
cu3 forall a. a -> [a] -> [a]
: [CodeUnit] -> [Char]
decodeUtf8 [CodeUnit]
cus
decodeUtf8 [CodeUnit]
cus = forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"Invalid UTF-8 input sequence at " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (forall a. Int -> [a] -> [a]
take Int
4 [CodeUnit]
cus)

-- $indexing
--
-- 'Text' can be indexed by code units or code points.
-- A 'CodePoint' is a 21-bit Unicode code point and can consist of up to four code units.
-- A 'CodeUnit' is a single byte.

-- | Does exactly the same thing as 'unsafeIndexCodePoint'', but on 'Text' values.
{-# INLINE unsafeIndexCodePoint #-}
unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint (Text !Array
u8data !Int
off !Int
_len) !CodeUnitIndex
index =
  Array -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint' Array
u8data forall a b. (a -> b) -> a -> b
$ Int -> CodeUnitIndex
CodeUnitIndex Int
off forall a. Num a => a -> a -> a
+ CodeUnitIndex
index

-- | Get the code unit at the given 'CodeUnitIndex'.
-- Performs bounds checking.
{-# INLINE indexCodeUnit #-}
indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
indexCodeUnit !Text
text !CodeUnitIndex
index
  | CodeUnitIndex
index forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
0 Bool -> Bool -> Bool
|| CodeUnitIndex
index forall a. Ord a => a -> a -> Bool
>= Text -> CodeUnitIndex
lengthUtf8 Text
text = forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"Index out of bounds " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show CodeUnitIndex
index
  | Bool
otherwise = Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit Text
text CodeUnitIndex
index

{-# INLINE unsafeIndexCodeUnit #-}
unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit (Text !Array
u8data !Int
off !Int
_len) !CodeUnitIndex
index =
  Array -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' Array
u8data forall a b. (a -> b) -> a -> b
$ Int -> CodeUnitIndex
CodeUnitIndex Int
off forall a. Num a => a -> a -> a
+ CodeUnitIndex
index

-- | Scan backwards through the text until we've seen the specified number of codepoints. Assumes
-- that the initial CodeUnitIndex is within a codepoint.
{-# INLINE skipCodePointsBackwards #-}
skipCodePointsBackwards :: Text -> CodeUnitIndex -> Int -> CodeUnitIndex
skipCodePointsBackwards :: Text -> CodeUnitIndex -> Int -> CodeUnitIndex
skipCodePointsBackwards (Text !Array
u8data !Int
off !Int
len) !CodeUnitIndex
index0 !Int
n0
  | CodeUnitIndex
index0 forall a. Ord a => a -> a -> Bool
>= Int -> CodeUnitIndex
CodeUnitIndex Int
len = forall a. HasCallStack => [Char] -> a
error [Char]
"Invalid use of skipCodePointsBackwards"
  | Bool
otherwise = forall {t}. (Eq t, Num t) => CodeUnitIndex -> t -> CodeUnitIndex
loop (CodeUnitIndex
index0 forall a. Num a => a -> a -> a
+ Int -> CodeUnitIndex
CodeUnitIndex Int
off) Int
n0
  where
    loop :: CodeUnitIndex -> t -> CodeUnitIndex
loop CodeUnitIndex
index t
n | CodeUnitIndex -> Bool
atTrailingByte CodeUnitIndex
index =
      CodeUnitIndex -> t -> CodeUnitIndex
loop (CodeUnitIndex
indexforall a. Num a => a -> a -> a
-CodeUnitIndex
1) t
n  -- Don't exit before we're at a leading byte
    loop CodeUnitIndex
index t
0 | CodeUnitIndex
index forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
0 =
      -- Throw an error if we've read before the array (e.g. when the data was
      -- not valid UTF-8), this one-time check doesn't prevent undefined
      -- behaviour but may help you locate bugs.
      forall a. HasCallStack => [Char] -> a
error [Char]
"Invalid use of skipCodePointsBackwards"
    loop CodeUnitIndex
index t
0 =
      CodeUnitIndex
index forall a. Num a => a -> a -> a
- Int -> CodeUnitIndex
CodeUnitIndex Int
off
    loop CodeUnitIndex
index t
n =
      CodeUnitIndex -> t -> CodeUnitIndex
loop (CodeUnitIndex
indexforall a. Num a => a -> a -> a
-CodeUnitIndex
1) (t
nforall a. Num a => a -> a -> a
-t
1)

    -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while
    -- the first byte can be 0xxxxxxx or 11yyyyyy.
    atTrailingByte :: CodeUnitIndex -> Bool
atTrailingByte !CodeUnitIndex
index = Array -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' Array
u8data CodeUnitIndex
index forall a. Bits a => a -> a -> a
.&. CodeUnit
0b1100_0000 forall a. Eq a => a -> a -> Bool
== CodeUnit
0b1000_0000

-- $slicingFunctions
--
-- 'unsafeCutUtf8' and 'unsafeSliceUtf8' are used to retrieve slices of 'Text' values.
-- @unsafeSliceUtf8 begin length@ returns a substring of length @length@ starting at @begin@.
-- @unsafeSliceUtf8 begin length@ returns a tuple of the "surrounding" substrings.
--
-- They satisfy the following property:
--
-- > let (prefix, suffix) = unsafeCutUtf8 begin length t
-- > in concat [prefix, unsafeSliceUtf8 begin length t, suffix] == t
--
-- The following diagram visualizes the relevant offsets for @begin = CodeUnitIndex 2@, @length = CodeUnitIndex 6@ and @t = \"BCDEFGHIJKL\"@.
--
-- >  off                 off+len
-- >   │                     │
-- >   ▼                     ▼
-- > ──┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬──
-- >  A│B│C│D│E│F│G│H│I│J│K│L│M│N
-- > ──┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴──
-- >       ▲           ▲
-- >       │           │
-- >  off+begin   off+begin+length
-- >
-- > unsafeSliceUtf8 begin length t == "DEFGHI"
-- > unsafeCutUtf8 begin length t == ("BC", "JKL")
--
-- The shown array is open at each end because in general, @t@ may be a slice as well.
--
-- __WARNING__: As their name implies, these functions are not (necessarily) bounds-checked. Use at your own risk.

unsafeCutUtf8 :: CodeUnitIndex -- ^ Starting position of substring.
  -> CodeUnitIndex -- ^ Length of substring.
  -> Text -- ^ Initial string.
  -> (Text, Text)
unsafeCutUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text)
unsafeCutUtf8 (CodeUnitIndex !Int
begin) (CodeUnitIndex !Int
len) !Text
text =
  ( Int -> Text -> Text
TextUnsafe.takeWord8 Int
begin Text
text
  , Int -> Text -> Text
TextUnsafe.dropWord8 (Int
begin forall a. Num a => a -> a -> a
+ Int
len) Text
text
  )

unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
unsafeSliceUtf8 (CodeUnitIndex !Int
begin) (CodeUnitIndex !Int
len) !Text
text =
  Int -> Text -> Text
TextUnsafe.takeWord8 Int
len forall a b. (a -> b) -> a -> b
$ Int -> Text -> Text
TextUnsafe.dropWord8 Int
begin Text
text

-- $functionsOnArrays
--
-- Functions for working with 'TextArray.Array' values.

-- | See 'Data.Primitive.isByteArrayPinned'.
isArrayPinned :: TextArray.Array -> Bool
isArrayPinned :: Array -> Bool
isArrayPinned (TextArray.ByteArray ByteArray#
ba#) = Int# -> Bool
Exts.isTrue# (ByteArray# -> Int#
Exts.isByteArrayPinned# ByteArray#
ba#)

-- | See 'Data.Primitive.byteArrayContents'.
arrayContents :: TextArray.Array -> Exts.Ptr Word8
arrayContents :: Array -> Ptr CodeUnit
arrayContents (TextArray.ByteArray ByteArray#
ba#) = forall a. Addr# -> Ptr a
Exts.Ptr (ByteArray# -> Addr#
Exts.byteArrayContents# ByteArray#
ba#)

-- | Decode a code point at the given 'CodeUnitIndex'.
-- Returns garbage if there is no valid code point at that position.
-- Does not perform bounds checking.
-- See 'decode2', 'decode3' and 'decode4' for the expected format of multi-byte code points.
unsafeIndexCodePoint' :: TextArray.Array -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
{-# INLINE unsafeIndexCodePoint' #-}
unsafeIndexCodePoint' :: Array -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint' !Array
u8data !CodeUnitIndex
idx =
  CodeUnit
-> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, Char)
decodeN (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
0) (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
1) (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
2) (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
3)
  where
    cuAt :: CodeUnitIndex -> CodeUnit
cuAt !CodeUnitIndex
i = Array -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' Array
u8data forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
idx forall a. Num a => a -> a -> a
+ CodeUnitIndex
i

decodeN :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, CodePoint)
{-# INLINE decodeN #-}
decodeN :: CodeUnit
-> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, Char)
decodeN CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 CodeUnit
cu3
  | CodeUnit
cu0 forall a. Ord a => a -> a -> Bool
< CodeUnit
0xc0 = (CodeUnitIndex
1, CodeUnit -> Char
decode1 CodeUnit
cu0)
  | CodeUnit
cu0 forall a. Ord a => a -> a -> Bool
< CodeUnit
0xe0 = (CodeUnitIndex
2, CodeUnit -> CodeUnit -> Char
decode2 CodeUnit
cu0 CodeUnit
cu1)
  | CodeUnit
cu0 forall a. Ord a => a -> a -> Bool
< CodeUnit
0xf0 = (CodeUnitIndex
3, CodeUnit -> CodeUnit -> CodeUnit -> Char
decode3 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2)
  | Bool
otherwise = (CodeUnitIndex
4, CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> Char
decode4 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 CodeUnit
cu3)



-- | Intermediate state when you're iterating backwards through a Utf8 text.
data BackwardsIter = BackwardsIter
  { BackwardsIter -> CodeUnitIndex
backwardsIterNext :: {-# UNPACK #-} !CodeUnitIndex
    -- ^ First byte to the left of the codepoint that we're focused on. This can
    -- be used with 'unsafeIndexEndOfCodePoint'' to find the next codepoint.
  , BackwardsIter -> Char
backwardsIterChar :: {-# UNPACK #-} !CodePoint
    -- ^ The codepoint that we're focused on
  , BackwardsIter -> CodeUnitIndex
backwardsIterEndOfChar :: {-# UNPACK #-} !CodeUnitIndex
    -- ^ Points to the last byte of the codepoint that we're focused on
  }

-- | Similar to unsafeIndexCodePoint', but assumes that the given index is the
-- end of a utf8 codepoint. It returns the decoded code point and the index
-- _before_ the code point. The resulting index could be passed directly to
-- unsafeIndexEndOfCodePoint' again to decode the _previous_ code point.
unsafeIndexEndOfCodePoint' :: TextArray.Array -> CodeUnitIndex -> BackwardsIter
{-# INLINE unsafeIndexEndOfCodePoint' #-}
unsafeIndexEndOfCodePoint' :: Array -> CodeUnitIndex -> BackwardsIter
unsafeIndexEndOfCodePoint' !Array
u8data !CodeUnitIndex
idx =
  let
    cuAt :: CodeUnitIndex -> CodeUnit
cuAt !CodeUnitIndex
i = Array -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' Array
u8data forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
i
    -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while
    -- the first byte can be 0xxxxxxx or 11yyyyyy.
    isFirstByte :: a -> Bool
isFirstByte !a
cu = a
cu forall a. Bits a => a -> a -> a
.&. a
0b1100_0000 forall a. Eq a => a -> a -> Bool
/= a
0b1000_0000
    cu0 :: CodeUnit
cu0 = CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
0
  in
    if forall {a}. (Bits a, Num a) => a -> Bool
isFirstByte CodeUnit
cu0
    then CodeUnitIndex -> Char -> CodeUnitIndex -> BackwardsIter
BackwardsIter (CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
1) (CodeUnit -> Char
decode1 CodeUnit
cu0) CodeUnitIndex
idx
    else
      let cu1 :: CodeUnit
cu1 = CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
1 in
      if forall {a}. (Bits a, Num a) => a -> Bool
isFirstByte CodeUnit
cu1
      then CodeUnitIndex -> Char -> CodeUnitIndex -> BackwardsIter
BackwardsIter (CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
2) (CodeUnit -> CodeUnit -> Char
decode2 CodeUnit
cu1 CodeUnit
cu0) CodeUnitIndex
idx
      else
        let cu2 :: CodeUnit
cu2 = CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
2 in
        if forall {a}. (Bits a, Num a) => a -> Bool
isFirstByte CodeUnit
cu2
        then CodeUnitIndex -> Char -> CodeUnitIndex -> BackwardsIter
BackwardsIter (CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
3) (CodeUnit -> CodeUnit -> CodeUnit -> Char
decode3 CodeUnit
cu2 CodeUnit
cu1 CodeUnit
cu0) CodeUnitIndex
idx
        else
          let cu3 :: CodeUnit
cu3 = CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
3 in
          if forall {a}. (Bits a, Num a) => a -> Bool
isFirstByte CodeUnit
cu3
          then CodeUnitIndex -> Char -> CodeUnitIndex -> BackwardsIter
BackwardsIter (CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
4) (CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> Char
decode4 CodeUnit
cu3 CodeUnit
cu2 CodeUnit
cu1 CodeUnit
cu0) CodeUnitIndex
idx
          else
            forall a. HasCallStack => [Char] -> a
error [Char]
"unsafeIndexEndOfCodePoint' could not find valid UTF8 codepoint"

unsafeIndexAnywhereInCodePoint' :: TextArray.Array -> CodeUnitIndex -> BackwardsIter
{-# INLINE unsafeIndexAnywhereInCodePoint' #-}
unsafeIndexAnywhereInCodePoint' :: Array -> CodeUnitIndex -> BackwardsIter
unsafeIndexAnywhereInCodePoint' !Array
u8data !CodeUnitIndex
idx =
  let
    cuAt :: CodeUnitIndex -> CodeUnit
cuAt !CodeUnitIndex
i = Array -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' Array
u8data forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
idx forall a. Num a => a -> a -> a
+ CodeUnitIndex
i
    -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while
    -- the first byte can be 0xxxxxxx or 11yyyyyy.
    isFirstByte :: a -> Bool
isFirstByte !a
cu = a
cu forall a. Bits a => a -> a -> a
.&. a
0b1100_0000 forall a. Eq a => a -> a -> Bool
/= a
0b1000_0000
    cu0 :: CodeUnit
cu0 = CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
0

    makeBackwardsIter :: CodeUnitIndex -> (CodeUnitIndex, Char) -> BackwardsIter
makeBackwardsIter CodeUnitIndex
next (CodeUnitIndex
l, Char
cp) = CodeUnitIndex -> Char -> CodeUnitIndex -> BackwardsIter
BackwardsIter CodeUnitIndex
next Char
cp (CodeUnitIndex
next forall a. Num a => a -> a -> a
+ CodeUnitIndex
l)
  in
    if forall {a}. (Bits a, Num a) => a -> Bool
isFirstByte CodeUnit
cu0
    then CodeUnitIndex -> (CodeUnitIndex, Char) -> BackwardsIter
makeBackwardsIter (CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
1) forall a b. (a -> b) -> a -> b
$ CodeUnit
-> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, Char)
decodeN CodeUnit
cu0 (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
1) (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
2) (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
3)
    else
      let cu00 :: CodeUnit
cu00 = CodeUnitIndex -> CodeUnit
cuAt (-CodeUnitIndex
1) in
      if forall {a}. (Bits a, Num a) => a -> Bool
isFirstByte CodeUnit
cu00
      then CodeUnitIndex -> (CodeUnitIndex, Char) -> BackwardsIter
makeBackwardsIter (CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
2) forall a b. (a -> b) -> a -> b
$ CodeUnit
-> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, Char)
decodeN CodeUnit
cu00 CodeUnit
cu0 (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
1) (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
2)
      else
        let cu000 :: CodeUnit
cu000 = CodeUnitIndex -> CodeUnit
cuAt (-CodeUnitIndex
2) in
        if forall {a}. (Bits a, Num a) => a -> Bool
isFirstByte CodeUnit
cu000
        then CodeUnitIndex -> (CodeUnitIndex, Char) -> BackwardsIter
makeBackwardsIter (CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
3) forall a b. (a -> b) -> a -> b
$ CodeUnit
-> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, Char)
decodeN CodeUnit
cu000 CodeUnit
cu00 CodeUnit
cu0 (CodeUnitIndex -> CodeUnit
cuAt CodeUnitIndex
1)
        else
          let cu0000 :: CodeUnit
cu0000 = CodeUnitIndex -> CodeUnit
cuAt (-CodeUnitIndex
3) in
          if forall {a}. (Bits a, Num a) => a -> Bool
isFirstByte CodeUnit
cu0000
          then CodeUnitIndex -> (CodeUnitIndex, Char) -> BackwardsIter
makeBackwardsIter (CodeUnitIndex
idx forall a. Num a => a -> a -> a
- CodeUnitIndex
4) forall a b. (a -> b) -> a -> b
$ CodeUnit
-> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, Char)
decodeN CodeUnit
cu0000 CodeUnit
cu000 CodeUnit
cu00 CodeUnit
cu0
          else
            forall a. HasCallStack => [Char] -> a
error [Char]
"unsafeIndexAnywhereInCodePoint' could not find valid UTF8 codepoint"

{-# INLINE unsafeIndexCodeUnit' #-}
unsafeIndexCodeUnit' :: TextArray.Array -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' :: Array -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' !Array
u8data (CodeUnitIndex !Int
idx) = Array -> Int -> CodeUnit
TextArray.unsafeIndex Array
u8data Int
idx

-- $generalFunctions
--
-- Re-exported from 'Text'.