-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2019 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}

-- | This module provides functions that allow treating Text values as series of UTF-16 codepoints
-- instead of characters.
module Data.Text.Utf16
    ( CodeUnit
    , CodeUnitIndex (..)
    , indexTextArray
    , isCaseInvariant
    , lengthUtf16
    , lowerCodeUnit
    , lowerUtf16
    , unpackUtf16
    , unsafeCutUtf16
    , unsafeIndexUtf16
    , unsafeSliceUtf16
    , upperCodeUnit
    , upperUtf16
    ) where

import Prelude hiding (length)

import Control.DeepSeq (NFData)
import Control.Exception (assert)
import Data.Hashable (Hashable)
import Data.Primitive.ByteArray (ByteArray (..), sizeofByteArray)
import Data.Text.Internal (Text (..))
import Data.Word (Word16)
import GHC.Generics (Generic)

#if defined(HAS_AESON)
import qualified Data.Aeson as AE
#endif

import qualified Data.Char as Char
import qualified Data.Text as Text
import qualified Data.Text.Array as TextArray
import qualified Data.Text.Unsafe as TextUnsafe
import qualified Data.Vector.Primitive as PVector

-- | A code unit is a 16-bit integer from which UTF-16 encoded text is built up.
-- The `Text` type is represented as a UTF-16 string.
type CodeUnit = Word16

-- | An index into the raw UTF-16 data of a `Text`. This is not the code point
-- index as conventionally accepted by `Text`, so we wrap it to avoid confusing
-- the two. Incorrect index manipulation can lead to surrogate pairs being
-- sliced, so manipulate indices with care. This type is also used for lengths.
newtype CodeUnitIndex = CodeUnitIndex
  { CodeUnitIndex -> Int
codeUnitIndex :: Int
  }
  deriving stock (CodeUnitIndex -> CodeUnitIndex -> Bool
(CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool) -> Eq CodeUnitIndex
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c/= :: CodeUnitIndex -> CodeUnitIndex -> Bool
== :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c== :: CodeUnitIndex -> CodeUnitIndex -> Bool
Eq, Eq CodeUnitIndex
Eq CodeUnitIndex
-> (CodeUnitIndex -> CodeUnitIndex -> Ordering)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> Ord CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex -> Bool
CodeUnitIndex -> CodeUnitIndex -> Ordering
CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$cmin :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
max :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$cmax :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
>= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c>= :: CodeUnitIndex -> CodeUnitIndex -> Bool
> :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c> :: CodeUnitIndex -> CodeUnitIndex -> Bool
<= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c<= :: CodeUnitIndex -> CodeUnitIndex -> Bool
< :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c< :: CodeUnitIndex -> CodeUnitIndex -> Bool
compare :: CodeUnitIndex -> CodeUnitIndex -> Ordering
$ccompare :: CodeUnitIndex -> CodeUnitIndex -> Ordering
$cp1Ord :: Eq CodeUnitIndex
Ord, Int -> CodeUnitIndex -> ShowS
[CodeUnitIndex] -> ShowS
CodeUnitIndex -> String
(Int -> CodeUnitIndex -> ShowS)
-> (CodeUnitIndex -> String)
-> ([CodeUnitIndex] -> ShowS)
-> Show CodeUnitIndex
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CodeUnitIndex] -> ShowS
$cshowList :: [CodeUnitIndex] -> ShowS
show :: CodeUnitIndex -> String
$cshow :: CodeUnitIndex -> String
showsPrec :: Int -> CodeUnitIndex -> ShowS
$cshowsPrec :: Int -> CodeUnitIndex -> ShowS
Show, (forall x. CodeUnitIndex -> Rep CodeUnitIndex x)
-> (forall x. Rep CodeUnitIndex x -> CodeUnitIndex)
-> Generic CodeUnitIndex
forall x. Rep CodeUnitIndex x -> CodeUnitIndex
forall x. CodeUnitIndex -> Rep CodeUnitIndex x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep CodeUnitIndex x -> CodeUnitIndex
$cfrom :: forall x. CodeUnitIndex -> Rep CodeUnitIndex x
Generic, CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex -> Bounded CodeUnitIndex
forall a. a -> a -> Bounded a
maxBound :: CodeUnitIndex
$cmaxBound :: CodeUnitIndex
minBound :: CodeUnitIndex
$cminBound :: CodeUnitIndex
Bounded)
#if defined(HAS_AESON)
  deriving newtype (Int -> CodeUnitIndex -> Int
CodeUnitIndex -> Int
(Int -> CodeUnitIndex -> Int)
-> (CodeUnitIndex -> Int) -> Hashable CodeUnitIndex
forall a. (Int -> a -> Int) -> (a -> Int) -> Hashable a
hash :: CodeUnitIndex -> Int
$chash :: CodeUnitIndex -> Int
hashWithSalt :: Int -> CodeUnitIndex -> Int
$chashWithSalt :: Int -> CodeUnitIndex -> Int
Hashable, Integer -> CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
(CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex)
-> (Integer -> CodeUnitIndex)
-> Num CodeUnitIndex
forall a.
(a -> a -> a)
-> (a -> a -> a)
-> (a -> a -> a)
-> (a -> a)
-> (a -> a)
-> (a -> a)
-> (Integer -> a)
-> Num a
fromInteger :: Integer -> CodeUnitIndex
$cfromInteger :: Integer -> CodeUnitIndex
signum :: CodeUnitIndex -> CodeUnitIndex
$csignum :: CodeUnitIndex -> CodeUnitIndex
abs :: CodeUnitIndex -> CodeUnitIndex
$cabs :: CodeUnitIndex -> CodeUnitIndex
negate :: CodeUnitIndex -> CodeUnitIndex
$cnegate :: CodeUnitIndex -> CodeUnitIndex
* :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c* :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
- :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c- :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
+ :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c+ :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
Num, CodeUnitIndex -> ()
(CodeUnitIndex -> ()) -> NFData CodeUnitIndex
forall a. (a -> ()) -> NFData a
rnf :: CodeUnitIndex -> ()
$crnf :: CodeUnitIndex -> ()
NFData, Value -> Parser [CodeUnitIndex]
Value -> Parser CodeUnitIndex
(Value -> Parser CodeUnitIndex)
-> (Value -> Parser [CodeUnitIndex]) -> FromJSON CodeUnitIndex
forall a.
(Value -> Parser a) -> (Value -> Parser [a]) -> FromJSON a
parseJSONList :: Value -> Parser [CodeUnitIndex]
$cparseJSONList :: Value -> Parser [CodeUnitIndex]
parseJSON :: Value -> Parser CodeUnitIndex
$cparseJSON :: Value -> Parser CodeUnitIndex
AE.FromJSON, [CodeUnitIndex] -> Encoding
[CodeUnitIndex] -> Value
CodeUnitIndex -> Encoding
CodeUnitIndex -> Value
(CodeUnitIndex -> Value)
-> (CodeUnitIndex -> Encoding)
-> ([CodeUnitIndex] -> Value)
-> ([CodeUnitIndex] -> Encoding)
-> ToJSON CodeUnitIndex
forall a.
(a -> Value)
-> (a -> Encoding)
-> ([a] -> Value)
-> ([a] -> Encoding)
-> ToJSON a
toEncodingList :: [CodeUnitIndex] -> Encoding
$ctoEncodingList :: [CodeUnitIndex] -> Encoding
toJSONList :: [CodeUnitIndex] -> Value
$ctoJSONList :: [CodeUnitIndex] -> Value
toEncoding :: CodeUnitIndex -> Encoding
$ctoEncoding :: CodeUnitIndex -> Encoding
toJSON :: CodeUnitIndex -> Value
$ctoJSON :: CodeUnitIndex -> Value
AE.ToJSON)
#else
  deriving newtype (Hashable, Num, NFData)
#endif


-- | Return a 'Text' as a list of UTF-16 code units.
{-# INLINABLE unpackUtf16 #-}
unpackUtf16 :: Text -> [CodeUnit]
unpackUtf16 :: Text -> [CodeUnit]
unpackUtf16 (Text Array
u16data Int
offset Int
length) =
  let
    go :: Int -> t -> [CodeUnit]
go Int
_ t
0 = []
    go Int
i t
n = Array -> Int -> CodeUnit
indexTextArray Array
u16data Int
i CodeUnit -> [CodeUnit] -> [CodeUnit]
forall a. a -> [a] -> [a]
: Int -> t -> [CodeUnit]
go (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1)
  in
    Int -> Int -> [CodeUnit]
forall t. (Eq t, Num t) => Int -> t -> [CodeUnit]
go Int
offset Int
length

-- | Return whether the code unit at the given index starts a surrogate pair.
-- Such a code unit must be followed by a low surrogate in valid UTF-16.
-- Returns false on out of bounds indices.
{-# INLINE isHighSurrogate #-}
isHighSurrogate :: Int -> Text -> Bool
isHighSurrogate :: Int -> Text -> Bool
isHighSurrogate !Int
i (Text !Array
u16data !Int
offset !Int
len) =
  let
    w :: CodeUnit
w = Array -> Int -> CodeUnit
indexTextArray Array
u16data (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i)
  in
    Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len Bool -> Bool -> Bool
&& CodeUnit
w CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnit
0xd800 Bool -> Bool -> Bool
&& CodeUnit
w CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
<= CodeUnit
0xdbff

-- | Return whether the code unit at the given index ends a surrogate pair.
-- Such a code unit must be preceded by a high surrogate in valid UTF-16.
-- Returns false on out of bounds indices.
{-# INLINE isLowSurrogate #-}
isLowSurrogate :: Int -> Text -> Bool
isLowSurrogate :: Int -> Text -> Bool
isLowSurrogate !Int
i (Text !Array
u16data !Int
offset !Int
len) =
  let
    w :: CodeUnit
w = Array -> Int -> CodeUnit
indexTextArray Array
u16data (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i)
  in
    Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len Bool -> Bool -> Bool
&& CodeUnit
w CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnit
0xdc00 Bool -> Bool -> Bool
&& CodeUnit
w CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
<= CodeUnit
0xdfff

-- | Extract a substring from a text, at a code unit offset and length.
-- This is similar to `Text.take length . Text.drop begin`, except that the
-- begin and length are in code *units*, not code points, so we can slice the
-- UTF-16 array, and we don't have to walk the entire text to take surrogate
-- pairs into account. It is the responsibility of the user to not slice
-- surrogate pairs, and to ensure that the length is within bounds, hence this
-- function is unsafe.
{-# INLINE unsafeSliceUtf16 #-}
unsafeSliceUtf16 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
unsafeSliceUtf16 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
unsafeSliceUtf16 (CodeUnitIndex !Int
begin) (CodeUnitIndex !Int
length) !Text
text
  = Bool -> Text -> Text
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
begin Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
length Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Text -> Int
TextUnsafe.lengthWord16 Text
text)
  (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Bool -> Text -> Text
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Int -> Text -> Bool
isLowSurrogate Int
begin Text
text)
  (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Bool -> Text -> Text
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Int -> Text -> Bool
isHighSurrogate (Int
begin Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
length Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Text
text)
  (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Int -> Text -> Text
TextUnsafe.takeWord16 Int
length (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Int -> Text -> Text
TextUnsafe.dropWord16 Int
begin Text
text

-- | The complement of `unsafeSliceUtf16`: removes the slice, and returns the
-- part before and after. See `unsafeSliceUtf16` for details.
{-# INLINE unsafeCutUtf16 #-}
unsafeCutUtf16 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text)
unsafeCutUtf16 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text)
unsafeCutUtf16 (CodeUnitIndex !Int
begin) (CodeUnitIndex !Int
length) !Text
text
  = Bool -> (Text, Text) -> (Text, Text)
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
begin Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
length Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Text -> Int
TextUnsafe.lengthWord16 Text
text)
  ((Text, Text) -> (Text, Text)) -> (Text, Text) -> (Text, Text)
forall a b. (a -> b) -> a -> b
$ Bool -> (Text, Text) -> (Text, Text)
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Int -> Text -> Bool
isLowSurrogate Int
begin Text
text)
  ((Text, Text) -> (Text, Text)) -> (Text, Text) -> (Text, Text)
forall a b. (a -> b) -> a -> b
$ Bool -> (Text, Text) -> (Text, Text)
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Int -> Text -> Bool
isHighSurrogate (Int
begin Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
length Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Text
text)
    ( Int -> Text -> Text
TextUnsafe.takeWord16 Int
begin Text
text
    , Int -> Text -> Text
TextUnsafe.dropWord16 (Int
begin Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
length) Text
text
    )

-- | Return the length of the text, in number of code units.
{-# INLINE lengthUtf16 #-}
lengthUtf16 :: Text -> CodeUnitIndex
lengthUtf16 :: Text -> CodeUnitIndex
lengthUtf16 = Int -> CodeUnitIndex
CodeUnitIndex (Int -> CodeUnitIndex) -> (Text -> Int) -> Text -> CodeUnitIndex
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Int
TextUnsafe.lengthWord16

-- | Return the code unit (not character) with the given index.
-- Note: The bounds are not checked.
unsafeIndexUtf16 :: Text -> CodeUnitIndex -> CodeUnit
{-# INLINE unsafeIndexUtf16 #-}
unsafeIndexUtf16 :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexUtf16 (Text Array
arr Int
off Int
_) (CodeUnitIndex Int
pos) = Array -> Int -> CodeUnit
indexTextArray Array
arr (Int
pos Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
off)

-- | Apply a function to each code unit of a text.
{-# INLINABLE mapUtf16 #-}
mapUtf16 :: (CodeUnit -> CodeUnit) -> Text -> Text
mapUtf16 :: (CodeUnit -> CodeUnit) -> Text -> Text
mapUtf16 CodeUnit -> CodeUnit
f (Text Array
u16data Int
offset Int
length) =
  let
    get :: Int -> CodeUnit
get !Int
i = CodeUnit -> CodeUnit
f (CodeUnit -> CodeUnit) -> CodeUnit -> CodeUnit
forall a b. (a -> b) -> a -> b
$ Array -> Int -> CodeUnit
indexTextArray Array
u16data (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i)
    !(PVector.Vector !Int
offset' !Int
length' !(ByteArray !ByteArray#
u16data')) =
      Int -> (Int -> CodeUnit) -> Vector CodeUnit
forall a. Prim a => Int -> (Int -> a) -> Vector a
PVector.generate Int
length Int -> CodeUnit
get
  in
    Array -> Int -> Int -> Text
Text (ByteArray# -> Array
TextArray.Array ByteArray#
u16data') Int
offset' Int
length'

-- | Lowercase each individual code unit of a text without changing their index.
-- This is not a proper case folding, but it does ensure that indices into the
-- lowercased string correspond to indices into the original string.
--
-- Differences from `Text.toLower` include code points in the BMP that lowercase
-- to multiple code points, and code points outside of the BMP.
--
-- For example, \"İ\" (U+0130), which `toLower` converts to \"i\" (U+0069, U+0307),
-- is converted into U+0069 only by `lowerUtf16`.
-- Also, \"𑢢\" (U+118A2), a code point from the Warang City writing system in the
-- Supplementary Multilingual Plane, introduced in 2014 to Unicode 7. It would
-- be lowercased to U+118C2 by `toLower`, but it is left untouched by
-- `lowerUtf16`.
{-# INLINE lowerUtf16 #-}
lowerUtf16 :: Text -> Text
lowerUtf16 :: Text -> Text
lowerUtf16 = (CodeUnit -> CodeUnit) -> Text -> Text
mapUtf16 CodeUnit -> CodeUnit
lowerCodeUnit

-- | Convert CodeUnits that represent a character on their own (i.e. that are not part of a
-- surrogate pair) to their lower case representation.
--
-- This function has a special code path for ASCII characters, because Char.toLower
-- is **incredibly** slow. It's implemented there if you want to see for yourself:
-- (https://github.com/ghc/ghc/blob/ghc-8.6.3-release/libraries/base/cbits/WCsubst.c#L4732)
-- (It does a binary search on 1276 casing rules)
{-# INLINE lowerCodeUnit #-}
lowerCodeUnit :: CodeUnit -> CodeUnit
lowerCodeUnit :: CodeUnit -> CodeUnit
lowerCodeUnit CodeUnit
cu
  -- ASCII letters A..Z and a..z are two contiguous blocks.
  -- Converting to lower case amounts to adding a fixed offset.
  | CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Char -> Int
Char.ord Char
'A' Bool -> Bool -> Bool
&& CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Char -> Int
Char.ord Char
'Z'
    = CodeUnit
cu CodeUnit -> CodeUnit -> CodeUnit
forall a. Num a => a -> a -> a
+ Int -> CodeUnit
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
Char.ord Char
'a' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
Char.ord Char
'A')

    -- Everything else in ASCII is invariant under toLower.
  -- The a..z range is already lower case, and all non-letter characters are case-invariant.
  | CodeUnit
cu CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
<= CodeUnit
127 = CodeUnit
cu

  -- This code unit is part of a surrogate pair. Don't touch those, because
  -- we don't have all information required to decode the code point. Note
  -- that alphabets that need to be encoded as surrogate pairs are mostly
  -- archaic and obscure; all of the languages used by our customers have
  -- alphabets in the Basic Multilingual Plane, which does not need surrogate
  -- pairs. Note that the BMP is not just ascii or extended ascii. See also
  -- https://codepoints.net/basic_multilingual_plane.
  | CodeUnit
cu CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnit
0xd800 Bool -> Bool -> Bool
&& CodeUnit
cu CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xe000 = CodeUnit
cu

  -- The code unit is a code point on its own (not part of a surrogate pair),
  -- lowercase the code point. These code points, which are all in the BMP,
  -- have the important property that lowercasing them is again a code point
  -- in the BMP, so the output can be encoded in exactly one code unit, just
  -- like the input. This property was verified by exhaustive testing; see
  -- also the test in AhoCorasickSpec.hs.
  | Bool
otherwise = Int -> CodeUnit
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> CodeUnit) -> Int -> CodeUnit
forall a b. (a -> b) -> a -> b
$ Char -> Int
Char.ord (Char -> Int) -> Char -> Int
forall a b. (a -> b) -> a -> b
$ Char -> Char
Char.toLower (Char -> Char) -> Char -> Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
Char.chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu

-- | Lowercase each individual code unit of a text without changing their index.
-- See also 'lowerUtf16' and 'lowerCodeUnit'.
{-# INLINE upperUtf16 #-}
upperUtf16 :: Text -> Text
upperUtf16 :: Text -> Text
upperUtf16 = (CodeUnit -> CodeUnit) -> Text -> Text
mapUtf16 CodeUnit -> CodeUnit
upperCodeUnit

-- | Analogous to 'lowerCodeUnit'.
{-# INLINE upperCodeUnit #-}
upperCodeUnit :: CodeUnit -> CodeUnit
upperCodeUnit :: CodeUnit -> CodeUnit
upperCodeUnit CodeUnit
cu
  | CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Char -> Int
Char.ord Char
'a' Bool -> Bool -> Bool
&& CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Char -> Int
Char.ord Char
'z'
    = CodeUnit
cu CodeUnit -> CodeUnit -> CodeUnit
forall a. Num a => a -> a -> a
- Int -> CodeUnit
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
Char.ord Char
'a' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
Char.ord Char
'A')
  | CodeUnit
cu CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
<= CodeUnit
127 = CodeUnit
cu
  | CodeUnit
cu CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnit
0xd800 Bool -> Bool -> Bool
&& CodeUnit
cu CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xe000 = CodeUnit
cu
  | Bool
otherwise = Int -> CodeUnit
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> CodeUnit) -> Int -> CodeUnit
forall a b. (a -> b) -> a -> b
$ Char -> Int
Char.ord (Char -> Int) -> Char -> Int
forall a b. (a -> b) -> a -> b
$ Char -> Char
Char.toUpper (Char -> Char) -> Char -> Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
Char.chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu

-- | Return whether text is the same lowercase as uppercase, such that this
-- function will not return true when Aho–Corasick would differentiate when
-- doing case-insensitive matching.
{-# INLINE isCaseInvariant #-}
isCaseInvariant :: Text -> Bool
isCaseInvariant :: Text -> Bool
isCaseInvariant = (Char -> Bool) -> Text -> Bool
Text.all (\Char
c -> Char -> Char
Char.toLower Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Char
Char.toUpper Char
c)

-- | Retrieve a code unit from 'Text's internal representation.
{-# INLINE indexTextArray #-}
indexTextArray :: TextArray.Array -> Int -> CodeUnit
indexTextArray :: Array -> Int -> CodeUnit
indexTextArray array :: Array
array@(TextArray.Array ByteArray#
byteArray) Int
index
  = Bool -> CodeUnit -> CodeUnit
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
2 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< ByteArray -> Int
sizeofByteArray (ByteArray# -> ByteArray
ByteArray ByteArray#
byteArray))
  (CodeUnit -> CodeUnit) -> CodeUnit -> CodeUnit
forall a b. (a -> b) -> a -> b
$ Bool -> CodeUnit -> CodeUnit
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
0 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
index)
  (CodeUnit -> CodeUnit) -> CodeUnit -> CodeUnit
forall a b. (a -> b) -> a -> b
$ Array -> Int -> CodeUnit
TextArray.unsafeIndex Array
array Int
index