{-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveGeneric #-} module Language.PureScript.PSString ( PSString , toUTF16CodeUnits , decodeString , decodeStringEither , decodeStringWithReplacement , prettyPrintString , prettyPrintStringJS , mkString ) where import Prelude.Compat import GHC.Generics (Generic) import Control.DeepSeq (NFData) import Control.Exception (try, evaluate) import Control.Applicative ((<|>)) import Data.Char (chr) import Data.Bits (shiftR) import Data.List (unfoldr) import Data.Scientific (toBoundedInteger) import Data.String (IsString(..)) import Data.ByteString (ByteString) import qualified Data.ByteString as BS import Data.Text (Text) import qualified Data.Text as T import Data.Text.Encoding (decodeUtf16BE) import Data.Text.Encoding.Error (UnicodeException) import qualified Data.Vector as V import Data.Word (Word16, Word8) import Numeric (showHex) import System.IO.Unsafe (unsafePerformIO) import qualified Data.Aeson as A import qualified Data.Aeson.Types as A -- | -- Strings in PureScript are sequences of UTF-16 code units, which do not -- necessarily represent UTF-16 encoded text. For example, it is permissible -- for a string to contain *lone surrogates,* i.e. characters in the range -- U+D800 to U+DFFF which do not appear as a part of a surrogate pair. -- -- The Show instance for PSString produces a string literal which would -- represent the same data were it inserted into a PureScript source file. -- -- Because JSON parsers vary wildly in terms of how they deal with lone -- surrogates in JSON strings, the ToJSON instance for PSString produces JSON -- strings where that would be safe (i.e. when there are no lone surrogates), -- and arrays of UTF-16 code units (integers) otherwise. -- newtype PSString = PSString { toUTF16CodeUnits :: [Word16] } deriving (Eq, Ord, Semigroup, Monoid, Generic) instance NFData PSString instance Show PSString where show = show . codePoints -- | -- Decode a PSString to a String, representing any lone surrogates as the -- reserved code point with that index. Warning: if there are any lone -- surrogates, converting the result to Text via Data.Text.pack will result in -- loss of information as those lone surrogates will be replaced with U+FFFD -- REPLACEMENT CHARACTER. Because this function requires care to use correctly, -- we do not export it. -- codePoints :: PSString -> String codePoints = map (either (chr . fromIntegral) id) . decodeStringEither -- | -- Decode a PSString as UTF-16 text. Lone surrogates will be replaced with -- U+FFFD REPLACEMENT CHARACTER -- decodeStringWithReplacement :: PSString -> String decodeStringWithReplacement = map (either (const '\xFFFD') id) . decodeStringEither -- | -- Decode a PSString as UTF-16. Lone surrogates in the input are represented in -- the output with the Left constructor; characters which were successfully -- decoded are represented with the Right constructor. -- decodeStringEither :: PSString -> [Either Word16 Char] decodeStringEither = unfoldr decode . toUTF16CodeUnits where decode :: [Word16] -> Maybe (Either Word16 Char, [Word16]) decode (h:l:rest) | isLead h && isTrail l = Just (Right (unsurrogate h l), rest) decode (c:rest) | isSurrogate c = Just (Left c, rest) decode (c:rest) = Just (Right (toChar c), rest) decode [] = Nothing unsurrogate :: Word16 -> Word16 -> Char unsurrogate h l = toEnum ((toInt h - 0xD800) * 0x400 + (toInt l - 0xDC00) + 0x10000) -- | -- Pretty print a PSString, using Haskell/PureScript escape sequences. -- This is identical to the Show instance except that we get a Text out instead -- of a String. -- prettyPrintString :: PSString -> Text prettyPrintString = T.pack . show -- | -- Attempt to decode a PSString as UTF-16 text. This will fail (returning -- Nothing) if the argument contains lone surrogates. -- decodeString :: PSString -> Maybe Text decodeString = hush . decodeEither . BS.pack . concatMap unpair . toUTF16CodeUnits where unpair w = [highByte w, lowByte w] lowByte :: Word16 -> Word8 lowByte = fromIntegral highByte :: Word16 -> Word8 highByte = fromIntegral . (`shiftR` 8) -- Based on a similar function from Data.Text.Encoding for utf8. This is a -- safe usage of unsafePerformIO because there are no side effects after -- handling any thrown UnicodeExceptions. decodeEither :: ByteString -> Either UnicodeException Text decodeEither = unsafePerformIO . try . evaluate . decodeUtf16BE hush = either (const Nothing) Just instance IsString PSString where fromString a = PSString $ concatMap encodeUTF16 a where surrogates :: Char -> (Word16, Word16) surrogates c = (toWord (h + 0xD800), toWord (l + 0xDC00)) where (h, l) = divMod (fromEnum c - 0x10000) 0x400 encodeUTF16 :: Char -> [Word16] encodeUTF16 c | fromEnum c > 0xFFFF = [high, low] where (high, low) = surrogates c encodeUTF16 c = [toWord $ fromEnum c] instance A.ToJSON PSString where toJSON str = case decodeString str of Just t -> A.toJSON t Nothing -> A.toJSON (toUTF16CodeUnits str) instance A.FromJSON PSString where parseJSON a = jsonString <|> arrayOfCodeUnits where jsonString = fromString <$> A.parseJSON a arrayOfCodeUnits = PSString <$> parseArrayOfCodeUnits a parseArrayOfCodeUnits :: A.Value -> A.Parser [Word16] parseArrayOfCodeUnits = A.withArray "array of UTF-16 code units" (traverse parseCodeUnit . V.toList) parseCodeUnit :: A.Value -> A.Parser Word16 parseCodeUnit b = A.withScientific "two-byte non-negative integer" (maybe (A.typeMismatch "" b) return . toBoundedInteger) b -- | -- Pretty print a PSString, using JavaScript escape sequences. Intended for -- use in compiled JS output. -- prettyPrintStringJS :: PSString -> Text prettyPrintStringJS s = "\"" <> foldMap encodeChar (toUTF16CodeUnits s) <> "\"" where encodeChar :: Word16 -> Text encodeChar c | c > 0xFF = "\\u" <> hex 4 c encodeChar c | c > 0x7E || c < 0x20 = "\\x" <> hex 2 c encodeChar c | toChar c == '\b' = "\\b" encodeChar c | toChar c == '\t' = "\\t" encodeChar c | toChar c == '\n' = "\\n" encodeChar c | toChar c == '\v' = "\\v" encodeChar c | toChar c == '\f' = "\\f" encodeChar c | toChar c == '\r' = "\\r" encodeChar c | toChar c == '"' = "\\\"" encodeChar c | toChar c == '\\' = "\\\\" encodeChar c = T.singleton $ toChar c hex :: (Enum a) => Int -> a -> Text hex width c = let hs = showHex (fromEnum c) "" in T.pack (replicate (width - length hs) '0' <> hs) isLead :: Word16 -> Bool isLead h = h >= 0xD800 && h <= 0xDBFF isTrail :: Word16 -> Bool isTrail l = l >= 0xDC00 && l <= 0xDFFF isSurrogate :: Word16 -> Bool isSurrogate c = isLead c || isTrail c toChar :: Word16 -> Char toChar = toEnum . fromIntegral toWord :: Int -> Word16 toWord = fromIntegral toInt :: Word16 -> Int toInt = fromIntegral mkString :: Text -> PSString mkString = fromString . T.unpack