module Language.PureScript.PSString
  ( PSString
  , toUTF16CodeUnits
  , decodeString
  , decodeStringEither
  , decodeStringWithReplacement
  , prettyPrintString
  , prettyPrintStringJS
  , mkString
  ) where

import Prelude
import GHC.Generics (Generic)
import Codec.Serialise (Serialise)
import Control.DeepSeq (NFData)
import Control.Exception (try, evaluate)
import Control.Applicative ((<|>))
import Data.Char qualified as Char
import Data.Bits (shiftR)
import Data.Either (fromRight)
import Data.List (unfoldr)
import Data.Scientific (toBoundedInteger)
import Data.String (IsString(..))
import Data.ByteString (ByteString)
import Data.ByteString qualified as BS
import Data.Text (Text)
import Data.Text qualified as T
import Data.Text.Encoding (decodeUtf16BE)
import Data.Text.Encoding.Error (UnicodeException)
import Data.Vector qualified as V
import Data.Word (Word16, Word8)
import Numeric (showHex)
import System.IO.Unsafe (unsafePerformIO)
import Data.Aeson qualified as A
import Data.Aeson.Types qualified as A

-- |
-- Strings in PureScript are sequences of UTF-16 code units, which do not
-- necessarily represent UTF-16 encoded text. For example, it is permissible
-- for a string to contain *lone surrogates,* i.e. characters in the range
-- U+D800 to U+DFFF which do not appear as a part of a surrogate pair.
--
-- The Show instance for PSString produces a string literal which would
-- represent the same data were it inserted into a PureScript source file.
--
-- Because JSON parsers vary wildly in terms of how they deal with lone
-- surrogates in JSON strings, the ToJSON instance for PSString produces JSON
-- strings where that would be safe (i.e. when there are no lone surrogates),
-- and arrays of UTF-16 code units (integers) otherwise.
--
newtype PSString = PSString { PSString -> [Word16]
toUTF16CodeUnits :: [Word16] }
  deriving (PSString -> PSString -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: PSString -> PSString -> Bool
$c/= :: PSString -> PSString -> Bool
== :: PSString -> PSString -> Bool
$c== :: PSString -> PSString -> Bool
Eq, Eq PSString
PSString -> PSString -> Bool
PSString -> PSString -> Ordering
PSString -> PSString -> PSString
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: PSString -> PSString -> PSString
$cmin :: PSString -> PSString -> PSString
max :: PSString -> PSString -> PSString
$cmax :: PSString -> PSString -> PSString
>= :: PSString -> PSString -> Bool
$c>= :: PSString -> PSString -> Bool
> :: PSString -> PSString -> Bool
$c> :: PSString -> PSString -> Bool
<= :: PSString -> PSString -> Bool
$c<= :: PSString -> PSString -> Bool
< :: PSString -> PSString -> Bool
$c< :: PSString -> PSString -> Bool
compare :: PSString -> PSString -> Ordering
$ccompare :: PSString -> PSString -> Ordering
Ord, NonEmpty PSString -> PSString
PSString -> PSString -> PSString
forall b. Integral b => b -> PSString -> PSString
forall a.
(a -> a -> a)
-> (NonEmpty a -> a)
-> (forall b. Integral b => b -> a -> a)
-> Semigroup a
stimes :: forall b. Integral b => b -> PSString -> PSString
$cstimes :: forall b. Integral b => b -> PSString -> PSString
sconcat :: NonEmpty PSString -> PSString
$csconcat :: NonEmpty PSString -> PSString
<> :: PSString -> PSString -> PSString
$c<> :: PSString -> PSString -> PSString
Semigroup, Semigroup PSString
PSString
[PSString] -> PSString
PSString -> PSString -> PSString
forall a.
Semigroup a -> a -> (a -> a -> a) -> ([a] -> a) -> Monoid a
mconcat :: [PSString] -> PSString
$cmconcat :: [PSString] -> PSString
mappend :: PSString -> PSString -> PSString
$cmappend :: PSString -> PSString -> PSString
mempty :: PSString
$cmempty :: PSString
Monoid, forall x. Rep PSString x -> PSString
forall x. PSString -> Rep PSString x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep PSString x -> PSString
$cfrom :: forall x. PSString -> Rep PSString x
Generic)

instance NFData PSString
instance Serialise PSString

instance Show PSString where
  show :: PSString -> String
show = forall a. Show a => a -> String
show forall b c a. (b -> c) -> (a -> b) -> a -> c
. PSString -> String
codePoints

-- |
-- Decode a PSString to a String, representing any lone surrogates as the
-- reserved code point with that index. Warning: if there are any lone
-- surrogates, converting the result to Text via Data.Text.pack will result in
-- loss of information as those lone surrogates will be replaced with U+FFFD
-- REPLACEMENT CHARACTER. Because this function requires care to use correctly,
-- we do not export it.
--
codePoints :: PSString -> String
codePoints :: PSString -> String
codePoints = forall a b. (a -> b) -> [a] -> [b]
map (forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Int -> Char
Char.chr forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral) forall a. a -> a
id) forall b c a. (b -> c) -> (a -> b) -> a -> c
. PSString -> [Either Word16 Char]
decodeStringEither

-- |
-- Decode a PSString as UTF-16 text. Lone surrogates will be replaced with
-- U+FFFD REPLACEMENT CHARACTER
--
decodeStringWithReplacement :: PSString -> String
decodeStringWithReplacement :: PSString -> String
decodeStringWithReplacement = forall a b. (a -> b) -> [a] -> [b]
map (forall b a. b -> Either a b -> b
fromRight Char
'\xFFFD') forall b c a. (b -> c) -> (a -> b) -> a -> c
. PSString -> [Either Word16 Char]
decodeStringEither

-- |
-- Decode a PSString as UTF-16. Lone surrogates in the input are represented in
-- the output with the Left constructor; characters which were successfully
-- decoded are represented with the Right constructor.
--
decodeStringEither :: PSString -> [Either Word16 Char]
decodeStringEither :: PSString -> [Either Word16 Char]
decodeStringEither = forall b a. (b -> Maybe (a, b)) -> b -> [a]
unfoldr [Word16] -> Maybe (Either Word16 Char, [Word16])
decode forall b c a. (b -> c) -> (a -> b) -> a -> c
. PSString -> [Word16]
toUTF16CodeUnits
  where
  decode :: [Word16] -> Maybe (Either Word16 Char, [Word16])
  decode :: [Word16] -> Maybe (Either Word16 Char, [Word16])
decode (Word16
h:Word16
l:[Word16]
rest) | Word16 -> Bool
isLead Word16
h Bool -> Bool -> Bool
&& Word16 -> Bool
isTrail Word16
l = forall a. a -> Maybe a
Just (forall a b. b -> Either a b
Right (Word16 -> Word16 -> Char
unsurrogate Word16
h Word16
l), [Word16]
rest)
  decode (Word16
c:[Word16]
rest) | Word16 -> Bool
isSurrogate Word16
c = forall a. a -> Maybe a
Just (forall a b. a -> Either a b
Left Word16
c, [Word16]
rest)
  decode (Word16
c:[Word16]
rest) = forall a. a -> Maybe a
Just (forall a b. b -> Either a b
Right (Word16 -> Char
toChar Word16
c), [Word16]
rest)
  decode [] = forall a. Maybe a
Nothing

  unsurrogate :: Word16 -> Word16 -> Char
  unsurrogate :: Word16 -> Word16 -> Char
unsurrogate Word16
h Word16
l = forall a. Enum a => Int -> a
toEnum ((Word16 -> Int
toInt Word16
h forall a. Num a => a -> a -> a
- Int
0xD800) forall a. Num a => a -> a -> a
* Int
0x400 forall a. Num a => a -> a -> a
+ (Word16 -> Int
toInt Word16
l forall a. Num a => a -> a -> a
- Int
0xDC00) forall a. Num a => a -> a -> a
+ Int
0x10000)

-- |
-- Attempt to decode a PSString as UTF-16 text. This will fail (returning
-- Nothing) if the argument contains lone surrogates.
--
decodeString :: PSString -> Maybe Text
decodeString :: PSString -> Maybe Text
decodeString = forall {a} {a}. Either a a -> Maybe a
hush forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Either UnicodeException Text
decodeEither forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Word8] -> ByteString
BS.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Word16 -> [Word8]
unpair forall b c a. (b -> c) -> (a -> b) -> a -> c
. PSString -> [Word16]
toUTF16CodeUnits
  where
  unpair :: Word16 -> [Word8]
unpair Word16
w = [Word16 -> Word8
highByte Word16
w, Word16 -> Word8
lowByte Word16
w]

  lowByte :: Word16 -> Word8
  lowByte :: Word16 -> Word8
lowByte = forall a b. (Integral a, Num b) => a -> b
fromIntegral

  highByte :: Word16 -> Word8
  highByte :: Word16 -> Word8
highByte = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. (forall a. Bits a => a -> Int -> a
`shiftR` Int
8)

  -- Based on a similar function from Data.Text.Encoding for utf8. This is a
  -- safe usage of unsafePerformIO because there are no side effects after
  -- handling any thrown UnicodeExceptions.
  decodeEither :: ByteString -> Either UnicodeException Text
  decodeEither :: ByteString -> Either UnicodeException Text
decodeEither = forall a. IO a -> a
unsafePerformIO forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall e a. Exception e => IO a -> IO (Either e a)
try forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. a -> IO a
evaluate forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
decodeUtf16BE

  hush :: Either a a -> Maybe a
hush = forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (forall a b. a -> b -> a
const forall a. Maybe a
Nothing) forall a. a -> Maybe a
Just

instance IsString PSString where
  fromString :: String -> PSString
fromString String
a = [Word16] -> PSString
PSString forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Char -> [Word16]
encodeUTF16 String
a
    where
    surrogates :: Char -> (Word16, Word16)
    surrogates :: Char -> (Word16, Word16)
surrogates Char
c = (Int -> Word16
toWord (Int
h forall a. Num a => a -> a -> a
+ Int
0xD800), Int -> Word16
toWord (Int
l forall a. Num a => a -> a -> a
+ Int
0xDC00))
      where (Int
h, Int
l) = forall a. Integral a => a -> a -> (a, a)
divMod (forall a. Enum a => a -> Int
fromEnum Char
c forall a. Num a => a -> a -> a
- Int
0x10000) Int
0x400

    encodeUTF16 :: Char -> [Word16]
    encodeUTF16 :: Char -> [Word16]
encodeUTF16 Char
c | forall a. Enum a => a -> Int
fromEnum Char
c forall a. Ord a => a -> a -> Bool
> Int
0xFFFF = [Word16
high, Word16
low]
      where (Word16
high, Word16
low) = Char -> (Word16, Word16)
surrogates Char
c
    encodeUTF16 Char
c = [Int -> Word16
toWord forall a b. (a -> b) -> a -> b
$ forall a. Enum a => a -> Int
fromEnum Char
c]

instance A.ToJSON PSString where
  toJSON :: PSString -> Value
toJSON PSString
str =
    case PSString -> Maybe Text
decodeString PSString
str of
      Just Text
t -> forall a. ToJSON a => a -> Value
A.toJSON Text
t
      Maybe Text
Nothing -> forall a. ToJSON a => a -> Value
A.toJSON (PSString -> [Word16]
toUTF16CodeUnits PSString
str)

instance A.FromJSON PSString where
  parseJSON :: Value -> Parser PSString
parseJSON Value
a = Parser PSString
jsonString forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser PSString
arrayOfCodeUnits
    where
    jsonString :: Parser PSString
jsonString = forall a. IsString a => String -> a
fromString forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall a. FromJSON a => Value -> Parser a
A.parseJSON Value
a

    arrayOfCodeUnits :: Parser PSString
arrayOfCodeUnits = [Word16] -> PSString
PSString forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Value -> Parser [Word16]
parseArrayOfCodeUnits Value
a

    parseArrayOfCodeUnits :: A.Value -> A.Parser [Word16]
    parseArrayOfCodeUnits :: Value -> Parser [Word16]
parseArrayOfCodeUnits = forall a. String -> (Array -> Parser a) -> Value -> Parser a
A.withArray String
"array of UTF-16 code units" (forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
traverse Value -> Parser Word16
parseCodeUnit forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Vector a -> [a]
V.toList)

    parseCodeUnit :: A.Value -> A.Parser Word16
    parseCodeUnit :: Value -> Parser Word16
parseCodeUnit Value
b = forall a. String -> (Scientific -> Parser a) -> Value -> Parser a
A.withScientific String
"two-byte non-negative integer" (forall b a. b -> (a -> b) -> Maybe a -> b
maybe (forall a. String -> Value -> Parser a
A.typeMismatch String
"" Value
b) forall (m :: * -> *) a. Monad m => a -> m a
return forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall i. (Integral i, Bounded i) => Scientific -> Maybe i
toBoundedInteger) Value
b

-- |
-- Pretty print a PSString, using PureScript escape sequences.
--
prettyPrintString :: PSString -> Text
prettyPrintString :: PSString -> Text
prettyPrintString PSString
s = Text
"\"" forall a. Semigroup a => a -> a -> a
<> forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap Either Word16 Char -> Text
encodeChar (PSString -> [Either Word16 Char]
decodeStringEither PSString
s) forall a. Semigroup a => a -> a -> a
<> Text
"\""
  where
  encodeChar :: Either Word16 Char -> Text
  encodeChar :: Either Word16 Char -> Text
encodeChar (Left Word16
c) = Text
"\\x" forall a. Semigroup a => a -> a -> a
<> forall a. Enum a => Int -> a -> Text
showHex' Int
6 Word16
c
  encodeChar (Right Char
c)
    | Char
c forall a. Eq a => a -> a -> Bool
== Char
'\t' = Text
"\\t"
    | Char
c forall a. Eq a => a -> a -> Bool
== Char
'\r' = Text
"\\r"
    | Char
c forall a. Eq a => a -> a -> Bool
== Char
'\n' = Text
"\\n"
    | Char
c forall a. Eq a => a -> a -> Bool
== Char
'"'  = Text
"\\\""
    | Char
c forall a. Eq a => a -> a -> Bool
== Char
'\''  = Text
"\\\'"
    | Char
c forall a. Eq a => a -> a -> Bool
== Char
'\\' = Text
"\\\\"
    | Char -> Bool
shouldPrint Char
c = Char -> Text
T.singleton Char
c
    | Bool
otherwise = Text
"\\x" forall a. Semigroup a => a -> a -> a
<> forall a. Enum a => Int -> a -> Text
showHex' Int
6 (Char -> Int
Char.ord Char
c)

  -- Note we do not use Data.Char.isPrint here because that includes things
  -- like zero-width spaces and combining punctuation marks, which could be
  -- confusing to print unescaped.
  shouldPrint :: Char -> Bool
  -- The standard space character, U+20 SPACE, is the only space char we should
  -- print without escaping
  shouldPrint :: Char -> Bool
shouldPrint Char
' ' = Bool
True
  shouldPrint Char
c =
    Char -> GeneralCategory
Char.generalCategory Char
c forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`
      [ GeneralCategory
Char.UppercaseLetter
      , GeneralCategory
Char.LowercaseLetter
      , GeneralCategory
Char.TitlecaseLetter
      , GeneralCategory
Char.OtherLetter
      , GeneralCategory
Char.DecimalNumber
      , GeneralCategory
Char.LetterNumber
      , GeneralCategory
Char.OtherNumber
      , GeneralCategory
Char.ConnectorPunctuation
      , GeneralCategory
Char.DashPunctuation
      , GeneralCategory
Char.OpenPunctuation
      , GeneralCategory
Char.ClosePunctuation
      , GeneralCategory
Char.InitialQuote
      , GeneralCategory
Char.FinalQuote
      , GeneralCategory
Char.OtherPunctuation
      , GeneralCategory
Char.MathSymbol
      , GeneralCategory
Char.CurrencySymbol
      , GeneralCategory
Char.ModifierSymbol
      , GeneralCategory
Char.OtherSymbol
      ]

-- |
-- Pretty print a PSString, using JavaScript escape sequences. Intended for
-- use in compiled JS output.
--
prettyPrintStringJS :: PSString -> Text
prettyPrintStringJS :: PSString -> Text
prettyPrintStringJS PSString
s = Text
"\"" forall a. Semigroup a => a -> a -> a
<> forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap Word16 -> Text
encodeChar (PSString -> [Word16]
toUTF16CodeUnits PSString
s) forall a. Semigroup a => a -> a -> a
<> Text
"\""
  where
  encodeChar :: Word16 -> Text
  encodeChar :: Word16 -> Text
encodeChar Word16
c | Word16
c forall a. Ord a => a -> a -> Bool
> Word16
0xFF = Text
"\\u" forall a. Semigroup a => a -> a -> a
<> forall a. Enum a => Int -> a -> Text
showHex' Int
4 Word16
c
  encodeChar Word16
c | Word16
c forall a. Ord a => a -> a -> Bool
> Word16
0x7E Bool -> Bool -> Bool
|| Word16
c forall a. Ord a => a -> a -> Bool
< Word16
0x20 = Text
"\\x" forall a. Semigroup a => a -> a -> a
<> forall a. Enum a => Int -> a -> Text
showHex' Int
2 Word16
c
  encodeChar Word16
c | Word16 -> Char
toChar Word16
c forall a. Eq a => a -> a -> Bool
== Char
'\b' = Text
"\\b"
  encodeChar Word16
c | Word16 -> Char
toChar Word16
c forall a. Eq a => a -> a -> Bool
== Char
'\t' = Text
"\\t"
  encodeChar Word16
c | Word16 -> Char
toChar Word16
c forall a. Eq a => a -> a -> Bool
== Char
'\n' = Text
"\\n"
  encodeChar Word16
c | Word16 -> Char
toChar Word16
c forall a. Eq a => a -> a -> Bool
== Char
'\v' = Text
"\\v"
  encodeChar Word16
c | Word16 -> Char
toChar Word16
c forall a. Eq a => a -> a -> Bool
== Char
'\f' = Text
"\\f"
  encodeChar Word16
c | Word16 -> Char
toChar Word16
c forall a. Eq a => a -> a -> Bool
== Char
'\r' = Text
"\\r"
  encodeChar Word16
c | Word16 -> Char
toChar Word16
c forall a. Eq a => a -> a -> Bool
== Char
'"'  = Text
"\\\""
  encodeChar Word16
c | Word16 -> Char
toChar Word16
c forall a. Eq a => a -> a -> Bool
== Char
'\\' = Text
"\\\\"
  encodeChar Word16
c = Char -> Text
T.singleton forall a b. (a -> b) -> a -> b
$ Word16 -> Char
toChar Word16
c

showHex' :: Enum a => Int -> a -> Text
showHex' :: forall a. Enum a => Int -> a -> Text
showHex' Int
width a
c =
  let hs :: String
hs = forall a. (Integral a, Show a) => a -> ShowS
showHex (forall a. Enum a => a -> Int
fromEnum a
c) String
"" in
  String -> Text
T.pack (forall a. Int -> a -> [a]
replicate (Int
width forall a. Num a => a -> a -> a
- forall (t :: * -> *) a. Foldable t => t a -> Int
length String
hs) Char
'0' forall a. Semigroup a => a -> a -> a
<> String
hs)

isLead :: Word16 -> Bool
isLead :: Word16 -> Bool
isLead Word16
h = Word16
h forall a. Ord a => a -> a -> Bool
>= Word16
0xD800 Bool -> Bool -> Bool
&& Word16
h forall a. Ord a => a -> a -> Bool
<= Word16
0xDBFF

isTrail :: Word16 -> Bool
isTrail :: Word16 -> Bool
isTrail Word16
l = Word16
l forall a. Ord a => a -> a -> Bool
>= Word16
0xDC00 Bool -> Bool -> Bool
&& Word16
l forall a. Ord a => a -> a -> Bool
<= Word16
0xDFFF

isSurrogate :: Word16 -> Bool
isSurrogate :: Word16 -> Bool
isSurrogate Word16
c = Word16 -> Bool
isLead Word16
c Bool -> Bool -> Bool
|| Word16 -> Bool
isTrail Word16
c

toChar :: Word16 -> Char
toChar :: Word16 -> Char
toChar = forall a. Enum a => Int -> a
toEnum forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral

toWord :: Int -> Word16
toWord :: Int -> Word16
toWord = forall a b. (Integral a, Num b) => a -> b
fromIntegral

toInt :: Word16 -> Int
toInt :: Word16 -> Int
toInt = forall a b. (Integral a, Num b) => a -> b
fromIntegral

mkString :: Text -> PSString
mkString :: Text -> PSString
mkString = forall a. IsString a => String -> a
fromString forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
T.unpack