{-# LANGUAGE CPP #-}
-- |
-- Module      : Streamly.Internal.Unicode.Stream
-- Copyright   : (c) 2018 Composewell Technologies
--               (c) Bjoern Hoehrmann 2008-2009
--
-- License     : BSD-3-Clause
-- Maintainer  : streamly@composewell.com
-- Stability   : experimental
-- Portability : GHC
--

module Streamly.Internal.Unicode.Stream
    (
    -- * Setup
    -- | To execute the code examples provided in this module in ghci, please
    -- run the following commands first.
    --
    -- $setup

    -- * Construction (Decoding)
      decodeLatin1

    -- ** UTF-8 Decoding
    , CodingFailureMode(..)
    , writeCharUtf8'
    , parseCharUtf8With
    , decodeUtf8
    , decodeUtf8'
    , decodeUtf8_

    -- ** Resumable UTF-8 Decoding
    , DecodeError(..)
    , DecodeState
    , CodePoint
    , decodeUtf8Either
    , resumeDecodeUtf8Either

    -- ** UTF-8 Array Stream Decoding
    , decodeUtf8Chunks
    , decodeUtf8Chunks'
    , decodeUtf8Chunks_

    -- * Elimination (Encoding)
    -- ** Latin1 Encoding
    , encodeLatin1
    , encodeLatin1'
    , encodeLatin1_

    -- ** UTF-8 Encoding
    , readCharUtf8'
    , readCharUtf8
    , readCharUtf8_
    , encodeUtf8
    , encodeUtf8'
    , encodeUtf8_
    , encodeStrings
    {-
    -- * Operations on character strings
    , strip -- (dropAround isSpace)
    , stripEnd
    -}

    -- * Transformation
    , stripHead
    , lines
    , words
    , unlines
    , unwords

    -- * StreamD UTF8 Encoding / Decoding transformations.
    , decodeUtf8D
    , decodeUtf8D'
    , decodeUtf8D_
    , encodeUtf8D
    , encodeUtf8D'
    , encodeUtf8D_
    , decodeUtf8EitherD
    , resumeDecodeUtf8EitherD

    -- * Decoding String Literals
    , fromStr#

    -- * Deprecations
    , decodeUtf8Lax
    , encodeLatin1Lax
    , encodeUtf8Lax
    )
where

#include "inline.hs"

import Control.Monad (void)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Bits (shiftR, shiftL, (.|.), (.&.))
import Data.Char (chr, ord)
#if MIN_VERSION_base(4,17,0)
import Data.Char (generalCategory, GeneralCategory(Space))
#endif
import Data.Word (Word8)
import Foreign.Marshal.Alloc (mallocBytes)
import Foreign.Storable (Storable(..))
#ifndef __GHCJS__
import Fusion.Plugin.Types (Fuse(..))
#endif
import GHC.Base (assert, unsafeChr)
import GHC.Exts (Addr#)
import GHC.IO.Encoding.Failure (isSurrogate)
import GHC.Ptr (Ptr (..), plusPtr)
import System.IO.Unsafe (unsafePerformIO)
import Streamly.Internal.Data.Array.Type (Array(..))
import Streamly.Internal.Data.MutArray.Type (MutByteArray)
import Streamly.Internal.Data.Fold (Fold)
import Streamly.Internal.Data.Parser (Parser)
import Streamly.Internal.Data.Stream (Stream)
import Streamly.Internal.Data.Stream (Step (..))
import Streamly.Internal.Data.SVar.Type (adaptState)
import Streamly.Internal.Data.Tuple.Strict (Tuple'(..))
import Streamly.Internal.Data.Unbox (Unbox(peekAt))
import Streamly.Internal.Data.Unfold.Type (Unfold(..))
import Streamly.Internal.System.IO (unsafeInlineIO)

import qualified Streamly.Data.Fold as Fold
import qualified Streamly.Data.Unfold as Unfold
import qualified Streamly.Internal.Data.Array.Type as Array
import qualified Streamly.Internal.Data.Parser as Parser (Parser)
import qualified Streamly.Internal.Data.Parser as ParserD
import qualified Streamly.Internal.Data.Stream as Stream
import qualified Streamly.Internal.Data.Stream as D

import Prelude hiding (lines, words, unlines, unwords)

#include "DocTestUnicodeStream.hs"

-------------------------------------------------------------------------------
-- Latin1 decoding
-------------------------------------------------------------------------------

-- | Decode a stream of bytes to Unicode characters by mapping each byte to a
-- corresponding Unicode 'Char' in 0-255 range.
--
{-# INLINE decodeLatin1 #-}
decodeLatin1 :: Monad m => Stream m Word8 -> Stream m Char
decodeLatin1 :: forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeLatin1 = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Int -> Char
unsafeChr forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral)

-------------------------------------------------------------------------------
-- Latin1 encoding
-------------------------------------------------------------------------------

-- | Encode a stream of Unicode characters to bytes by mapping each character
-- to a byte in 0-255 range. Throws an error if the input stream contains
-- characters beyond 255.
--
{-# INLINE encodeLatin1' #-}
encodeLatin1' :: Monad m => Stream m Char -> Stream m Word8
encodeLatin1' :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeLatin1' = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall {a}. Num a => Char -> a
convert
    where
    convert :: Char -> a
convert Char
c =
        let codepoint :: Int
codepoint = Char -> Int
ord Char
c
        in if Int
codepoint forall a. Ord a => a -> a -> Bool
> Int
255
           then forall a. (?callStack::CallStack) => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"Streamly.Unicode.encodeLatin1 invalid " forall a. [a] -> [a] -> [a]
++
                      [Char]
"input char codepoint " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Int
codepoint
           else forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
codepoint

-- XXX Should we instead replace the invalid chars by NUL or whitespace or some
-- other control char? That may affect the perf a bit but may be a better
-- behavior.
--
-- | Like 'encodeLatin1'' but silently maps input codepoints beyond 255 to
-- arbitrary Latin1 chars in 0-255 range. No error or exception is thrown when
-- such mapping occurs.
--
{-# INLINE encodeLatin1 #-}
encodeLatin1 :: Monad m => Stream m Char -> Stream m Word8
encodeLatin1 :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeLatin1 = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord)

-- | Like 'encodeLatin1' but drops the input characters beyond 255.
--
{-# INLINE encodeLatin1_ #-}
encodeLatin1_ :: Monad m => Stream m Char -> Stream m Word8
encodeLatin1_ :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeLatin1_ = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (m :: * -> *) a.
Monad m =>
(a -> Bool) -> Stream m a -> Stream m a
Stream.filter (forall a. Ord a => a -> a -> Bool
<= Int -> Char
chr Int
255)

-- | Same as 'encodeLatin1'
--
{-# DEPRECATED encodeLatin1Lax "Please use 'encodeLatin1' instead" #-}
{-# INLINE encodeLatin1Lax #-}
encodeLatin1Lax :: Monad m => Stream m Char -> Stream m Word8
encodeLatin1Lax :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeLatin1Lax = forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeLatin1

-------------------------------------------------------------------------------
-- UTF-8 decoding
-------------------------------------------------------------------------------

-- Int helps in cheaper conversion from Int to Char
type CodePoint = Int
type DecodeState = Word8

-- We can divide the errors in three general categories:
-- * A non-starter was encountered in a begin state
-- * A starter was encountered without completing a codepoint
-- * The last codepoint was not complete (input underflow)
--
-- Need to separate resumable and non-resumable error. In case of non-resumable
-- error we can also provide the failing byte. In case of resumable error the
-- state can be opaque.
--
data DecodeError = DecodeError !DecodeState !CodePoint deriving Int -> DecodeError -> ShowS
[DecodeError] -> ShowS
DecodeError -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [DecodeError] -> ShowS
$cshowList :: [DecodeError] -> ShowS
show :: DecodeError -> [Char]
$cshow :: DecodeError -> [Char]
showsPrec :: Int -> DecodeError -> ShowS
$cshowsPrec :: Int -> DecodeError -> ShowS
Show

-- See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

-- XXX Use names decodeSuccess = 0, decodeFailure = 12

decodeTable :: [Word8]
decodeTable :: [Word8]
decodeTable = [
   -- The first part of the table maps bytes to character classes that
   -- to reduce the size of the transition table and create bitmasks.
   Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,  Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,
   Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,  Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,
   Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,  Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,
   Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,  Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,Word8
0,
   Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,Word8
1,  Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,Word8
9,
   Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,  Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,Word8
7,
   Word8
8,Word8
8,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,  Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,Word8
2,
  Word8
10,Word8
3,Word8
3,Word8
3,Word8
3,Word8
3,Word8
3,Word8
3,Word8
3,Word8
3,Word8
3,Word8
3,Word8
3,Word8
4,Word8
3,Word8
3, Word8
11,Word8
6,Word8
6,Word8
6,Word8
5,Word8
8,Word8
8,Word8
8,Word8
8,Word8
8,Word8
8,Word8
8,Word8
8,Word8
8,Word8
8,Word8
8,

   -- The second part is a transition table that maps a combination
   -- of a state of the automaton and a character class to a state.
   Word8
0,Word8
12,Word8
24,Word8
36,Word8
60,Word8
96,Word8
84,Word8
12,Word8
12,Word8
12,Word8
48,Word8
72, Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,
  Word8
12, Word8
0,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12, Word8
0,Word8
12, Word8
0,Word8
12,Word8
12, Word8
12,Word8
24,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
24,Word8
12,Word8
24,Word8
12,Word8
12,
  Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
24,Word8
12,Word8
12,Word8
12,Word8
12, Word8
12,Word8
24,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
24,Word8
12,Word8
12,
  Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
36,Word8
12,Word8
36,Word8
12,Word8
12, Word8
12,Word8
36,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
36,Word8
12,Word8
36,Word8
12,Word8
12,
  Word8
12,Word8
36,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12,Word8
12
  ]

{-# INLINE utf8dLength #-}
utf8dLength :: Int
utf8dLength :: Int
utf8dLength = forall (t :: * -> *) a. Foldable t => t a -> Int
length [Word8]
decodeTable

-- | We do not want to garbage collect this and free the memory, we want to
-- keep this persistent. We don't know how to do that with GHC without having a
-- reference in some global structure. So we use a hack, use mallocBytes so
-- that the GC has no way to free it.
{-# NOINLINE utf8d #-}
utf8d :: Ptr Word8
utf8d :: Ptr Word8
utf8d = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$ do
    let size :: Int
size = Int
utf8dLength
    Ptr Word8
p <- forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall a b. (a -> b) -> a -> b
$ forall a. Int -> IO (Ptr a)
mallocBytes Int
size
    forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) a b.
Monad m =>
Fold m a b -> Stream m a -> m b
D.fold
        (forall (m :: * -> *) b a.
Monad m =>
(b -> a -> m b) -> m b -> Fold m a b
Fold.foldlM' (\Ptr Word8
b Word8
a -> forall a. Storable a => Ptr a -> a -> IO ()
poke Ptr Word8
b Word8
a forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Ptr Word8
b forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
1)) (forall (m :: * -> *) a. Monad m => a -> m a
return Ptr Word8
p))
        (forall (m :: * -> *) a. Applicative m => [a] -> Stream m a
D.fromList [Word8]
decodeTable)
    forall (m :: * -> *) a. Monad m => a -> m a
return Ptr Word8
p

-- | Return element at the specified index without checking the bounds.
-- and without touching the foreign ptr.
{-# INLINE_NORMAL unsafePeekElemOff #-}
unsafePeekElemOff :: forall a. Storable a => Ptr a -> Int -> a
unsafePeekElemOff :: forall a. Storable a => Ptr a -> Int -> a
unsafePeekElemOff Ptr a
p Int
i =
    let !x :: a
x = forall a. IO a -> a
unsafeInlineIO forall a b. (a -> b) -> a -> b
$ forall a. Storable a => Ptr a -> Int -> IO a
peekElemOff Ptr a
p Int
i
     in a
x

-- XXX We can use a fromPtr stream to implement it.
{-# INLINE showMemory #-}
showMemory ::
    forall a. (Show a, Storable a) => Ptr a -> Ptr a -> String
showMemory :: forall a. (Show a, Storable a) => Ptr a -> Ptr a -> [Char]
showMemory Ptr a
cur Ptr a
end
    | Ptr a
cur forall a. Ord a => a -> a -> Bool
< Ptr a
end =
        let cur1 :: Ptr b
cur1 = Ptr a
cur forall a b. Ptr a -> Int -> Ptr b
`plusPtr` forall a. Storable a => a -> Int
sizeOf (forall a. (?callStack::CallStack) => a
undefined :: a)
         in forall a. Show a => a -> [Char]
show (forall a. IO a -> a
unsafeInlineIO forall a b. (a -> b) -> a -> b
$ forall a. Storable a => Ptr a -> IO a
peek Ptr a
cur) forall a. [a] -> [a] -> [a]
++ [Char]
" " forall a. [a] -> [a] -> [a]
++ forall a. (Show a, Storable a) => Ptr a -> Ptr a -> [Char]
showMemory forall {b}. Ptr b
cur1 Ptr a
end
showMemory Ptr a
_ Ptr a
_ = [Char]
""

-- decode is split into two separate cases to avoid branching instructions.
-- From the higher level flow we already know which case we are in so we can
-- call the appropriate decode function.
--
-- When the state is 0
{-# INLINE decode0 #-}
decode0 :: Ptr Word8 -> Word8 -> Tuple' DecodeState CodePoint
decode0 :: Ptr Word8 -> Word8 -> Tuple' Word8 Int
decode0 Ptr Word8
table Word8
byte =
    let !t :: Word8
t = Ptr Word8
table forall a. Storable a => Ptr a -> Int -> a
`unsafePeekElemOff` forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
byte
        !codep' :: Int
codep' = (Int
0xff forall a. Bits a => a -> Int -> a
`shiftR` forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
t) forall a. Bits a => a -> a -> a
.&. forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
byte
        !state' :: Word8
state' = Ptr Word8
table forall a. Storable a => Ptr a -> Int -> a
`unsafePeekElemOff` (Int
256 forall a. Num a => a -> a -> a
+ forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
t)
     in forall a. (?callStack::CallStack) => Bool -> a -> a
assert ((Word8
byte forall a. Ord a => a -> a -> Bool
> Word8
0x7f Bool -> Bool -> Bool
|| forall a. (?callStack::CallStack) => [Char] -> a
error [Char]
showByte)
                Bool -> Bool -> Bool
&& (Word8
state' forall a. Eq a => a -> a -> Bool
/= Word8
0 Bool -> Bool -> Bool
|| forall a. (?callStack::CallStack) => [Char] -> a
error ([Char]
showByte forall a. [a] -> [a] -> [a]
++ [Char]
showTable)))
               (forall a b. a -> b -> Tuple' a b
Tuple' Word8
state' Int
codep')

    where

    utf8tableEnd :: Ptr b
utf8tableEnd = Ptr Word8
table forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
364
    showByte :: [Char]
showByte = [Char]
"Streamly: decode0: byte: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Word8
byte
    showTable :: [Char]
showTable = [Char]
" table: " forall a. [a] -> [a] -> [a]
++ forall a. (Show a, Storable a) => Ptr a -> Ptr a -> [Char]
showMemory Ptr Word8
table forall {b}. Ptr b
utf8tableEnd

-- When the state is not 0
{-# INLINE decode1 #-}
decode1
    :: Ptr Word8
    -> DecodeState
    -> CodePoint
    -> Word8
    -> Tuple' DecodeState CodePoint
decode1 :: Ptr Word8 -> Word8 -> Int -> Word8 -> Tuple' Word8 Int
decode1 Ptr Word8
table Word8
state Int
codep Word8
byte =
    -- Remember codep is Int type!
    -- Can it be unsafe to convert the resulting Int to Char?
    let !t :: Word8
t = Ptr Word8
table forall a. Storable a => Ptr a -> Int -> a
`unsafePeekElemOff` forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
byte
        !codep' :: Int
codep' = (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
byte forall a. Bits a => a -> a -> a
.&. Int
0x3f) forall a. Bits a => a -> a -> a
.|. (Int
codep forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
        !state' :: Word8
state' = Ptr Word8
table forall a. Storable a => Ptr a -> Int -> a
`unsafePeekElemOff`
                    (Int
256 forall a. Num a => a -> a -> a
+ forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
state forall a. Num a => a -> a -> a
+ forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
t)
     in forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
codep' forall a. Ord a => a -> a -> Bool
<= Int
0x10FFFF
                    Bool -> Bool -> Bool
|| forall a. (?callStack::CallStack) => [Char] -> a
error ([Char]
showByte forall a. [a] -> [a] -> [a]
++ forall {a} {a}. (Show a, Show a) => a -> a -> [Char]
showState Word8
state Int
codep))
               (forall a b. a -> b -> Tuple' a b
Tuple' Word8
state' Int
codep')
    where

    utf8tableEnd :: Ptr b
utf8tableEnd = Ptr Word8
table forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
364
    showByte :: [Char]
showByte = [Char]
"Streamly: decode1: byte: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Word8
byte
    showState :: a -> a -> [Char]
showState a
st a
cp =
        [Char]
" state: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show a
st forall a. [a] -> [a] -> [a]
++
        [Char]
" codepoint: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show a
cp forall a. [a] -> [a] -> [a]
++
        [Char]
" table: " forall a. [a] -> [a] -> [a]
++ forall a. (Show a, Storable a) => Ptr a -> Ptr a -> [Char]
showMemory Ptr Word8
table forall {b}. Ptr b
utf8tableEnd

-------------------------------------------------------------------------------
-- Resumable UTF-8 decoding
-------------------------------------------------------------------------------

-- Strangely, GHCJS hangs linking template-haskell with this
#ifndef __GHCJS__
{-# ANN type UTF8DecodeState Fuse #-}
#endif
data UTF8DecodeState s a
    = UTF8DecodeInit s
    | UTF8DecodeInit1 s Word8
    | UTF8DecodeFirst s Word8
    | UTF8Decoding s !DecodeState !CodePoint
    | YieldAndContinue a (UTF8DecodeState s a)
    | Done

{-# INLINE_NORMAL resumeDecodeUtf8EitherD #-}
resumeDecodeUtf8EitherD
    :: Monad m
    => DecodeState
    -> CodePoint
    -> D.Stream m Word8
    -> D.Stream m (Either DecodeError Char)
resumeDecodeUtf8EitherD :: forall (m :: * -> *).
Monad m =>
Word8
-> Int -> Stream m Word8 -> Stream m (Either DecodeError Char)
resumeDecodeUtf8EitherD Word8
dst Int
codep (D.Stream State StreamK m Word8 -> s -> m (Step s Word8)
step s
state) =
    let stt :: UTF8DecodeState s a
stt =
            if Word8
dst forall a. Eq a => a -> a -> Bool
== Word8
0
            then forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
state
            else forall s a. s -> Word8 -> Int -> UTF8DecodeState s a
UTF8Decoding s
state Word8
dst Int
codep
    in forall (m :: * -> *) a s.
(State StreamK m a -> s -> m (Step s a)) -> s -> Stream m a
D.Stream (forall {m :: * -> *} {a}.
Ptr Word8
-> State StreamK m a
-> UTF8DecodeState s (Either DecodeError Char)
-> m (Step
        (UTF8DecodeState s (Either DecodeError Char))
        (Either DecodeError Char))
step' Ptr Word8
utf8d) forall {a}. UTF8DecodeState s a
stt
  where
    {-# INLINE_LATE step' #-}
    step' :: Ptr Word8
-> State StreamK m a
-> UTF8DecodeState s (Either DecodeError Char)
-> m (Step
        (UTF8DecodeState s (Either DecodeError Char))
        (Either DecodeError Char))
step' Ptr Word8
_ State StreamK m a
gst (UTF8DecodeInit s
st) = do
        Step s Word8
r <- State StreamK m Word8 -> s -> m (Step s Word8)
step (forall (t :: (* -> *) -> * -> *) (m :: * -> *) a (n :: * -> *) b.
State t m a -> State t n b
adaptState State StreamK m a
gst) s
st
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case Step s Word8
r of
            Yield Word8
x s
s -> forall s a. s -> Step s a
Skip (forall s a. s -> Word8 -> UTF8DecodeState s a
UTF8DecodeInit1 s
s Word8
x)
            Skip s
s -> forall s a. s -> Step s a
Skip (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
s)
            Step s Word8
Stop   -> forall s a. s -> Step s a
Skip forall s a. UTF8DecodeState s a
Done

    step' Ptr Word8
_ State StreamK m a
_ (UTF8DecodeInit1 s
st Word8
x) = do
        -- Note: It is important to use a ">" instead of a "<=" test
        -- here for GHC to generate code layout for default branch
        -- prediction for the common case. This is fragile and might
        -- change with the compiler versions, we need a more reliable
        -- "likely" primitive to control branch predication.
        case Word8
x forall a. Ord a => a -> a -> Bool
> Word8
0x7f of
            Bool
False ->
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue
                    (forall a b. b -> Either a b
Right forall a b. (a -> b) -> a -> b
$ Int -> Char
unsafeChr (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
x))
                    (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
st)
            -- Using a separate state here generates a jump to a
            -- separate code block in the core which seems to perform
            -- slightly better for the non-ascii case.
            Bool
True -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. s -> Word8 -> UTF8DecodeState s a
UTF8DecodeFirst s
st Word8
x

    -- XXX should we merge it with UTF8DecodeInit1?
    step' Ptr Word8
table State StreamK m a
_ (UTF8DecodeFirst s
st Word8
x) = do
        let (Tuple' Word8
sv Int
cp) = Ptr Word8 -> Word8 -> Tuple' Word8 Int
decode0 Ptr Word8
table Word8
x
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
            case Word8
sv of
                Word8
12 ->
                    forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue (forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ Word8 -> Int -> DecodeError
DecodeError Word8
0 (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
x))
                                            (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
st)
                Word8
0 -> forall a. (?callStack::CallStack) => [Char] -> a
error [Char]
"unreachable state"
                Word8
_ -> forall s a. s -> Step s a
Skip (forall s a. s -> Word8 -> Int -> UTF8DecodeState s a
UTF8Decoding s
st Word8
sv Int
cp)

    -- We recover by trying the new byte x a starter of a new codepoint.
    -- XXX on error need to report the next byte "x" as well.
    -- XXX need to use the same recovery in array decoding routine as well
    step' Ptr Word8
table State StreamK m a
gst (UTF8Decoding s
st Word8
statePtr Int
codepointPtr) = do
        Step s Word8
r <- State StreamK m Word8 -> s -> m (Step s Word8)
step (forall (t :: (* -> *) -> * -> *) (m :: * -> *) a (n :: * -> *) b.
State t m a -> State t n b
adaptState State StreamK m a
gst) s
st
        case Step s Word8
r of
            Yield Word8
x s
s -> do
                let (Tuple' Word8
sv Int
cp) = Ptr Word8 -> Word8 -> Int -> Word8 -> Tuple' Word8 Int
decode1 Ptr Word8
table Word8
statePtr Int
codepointPtr Word8
x
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
                    case Word8
sv of
                        Word8
0 -> forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue (forall a b. b -> Either a b
Right forall a b. (a -> b) -> a -> b
$ Int -> Char
unsafeChr Int
cp)
                                        (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
s)
                        Word8
12 ->
                            forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue (forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ Word8 -> Int -> DecodeError
DecodeError Word8
statePtr Int
codepointPtr)
                                        (forall s a. s -> Word8 -> UTF8DecodeState s a
UTF8DecodeInit1 s
s Word8
x)
                        Word8
_ -> forall s a. s -> Step s a
Skip (forall s a. s -> Word8 -> Int -> UTF8DecodeState s a
UTF8Decoding s
s Word8
sv Int
cp)
            Skip s
s -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip (forall s a. s -> Word8 -> Int -> UTF8DecodeState s a
UTF8Decoding s
s Word8
statePtr Int
codepointPtr)
            Step s Word8
Stop -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue (forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ Word8 -> Int -> DecodeError
DecodeError Word8
statePtr Int
codepointPtr) forall s a. UTF8DecodeState s a
Done

    step' Ptr Word8
_ State StreamK m a
_ (YieldAndContinue Either DecodeError Char
c UTF8DecodeState s (Either DecodeError Char)
s) = forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. a -> s -> Step s a
Yield Either DecodeError Char
c UTF8DecodeState s (Either DecodeError Char)
s
    step' Ptr Word8
_ State StreamK m a
_ UTF8DecodeState s (Either DecodeError Char)
Done = forall (m :: * -> *) a. Monad m => a -> m a
return forall s a. Step s a
Stop

-- XXX We can use just one API, and define InitState = 0 and InitCodePoint = 0
-- to use as starting state.
--
{-# INLINE_NORMAL decodeUtf8EitherD #-}
decodeUtf8EitherD :: Monad m
    => D.Stream m Word8 -> D.Stream m (Either DecodeError Char)
decodeUtf8EitherD :: forall (m :: * -> *).
Monad m =>
Stream m Word8 -> Stream m (Either DecodeError Char)
decodeUtf8EitherD = forall (m :: * -> *).
Monad m =>
Word8
-> Int -> Stream m Word8 -> Stream m (Either DecodeError Char)
resumeDecodeUtf8EitherD Word8
0 Int
0

-- |
--
-- /Pre-release/
{-# INLINE decodeUtf8Either #-}
decodeUtf8Either :: Monad m
    => Stream m Word8 -> Stream m (Either DecodeError Char)
decodeUtf8Either :: forall (m :: * -> *).
Monad m =>
Stream m Word8 -> Stream m (Either DecodeError Char)
decodeUtf8Either = forall (m :: * -> *).
Monad m =>
Stream m Word8 -> Stream m (Either DecodeError Char)
decodeUtf8EitherD

-- |
--
-- /Pre-release/
{-# INLINE resumeDecodeUtf8Either #-}
resumeDecodeUtf8Either
    :: Monad m
    => DecodeState
    -> CodePoint
    -> Stream m Word8
    -> Stream m (Either DecodeError Char)
resumeDecodeUtf8Either :: forall (m :: * -> *).
Monad m =>
Word8
-> Int -> Stream m Word8 -> Stream m (Either DecodeError Char)
resumeDecodeUtf8Either = forall (m :: * -> *).
Monad m =>
Word8
-> Int -> Stream m Word8 -> Stream m (Either DecodeError Char)
resumeDecodeUtf8EitherD

-------------------------------------------------------------------------------
-- One shot decoding
-------------------------------------------------------------------------------

data CodingFailureMode
    = TransliterateCodingFailure
    | ErrorOnCodingFailure
    | DropOnCodingFailure
    deriving (Int -> CodingFailureMode -> ShowS
[CodingFailureMode] -> ShowS
CodingFailureMode -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [CodingFailureMode] -> ShowS
$cshowList :: [CodingFailureMode] -> ShowS
show :: CodingFailureMode -> [Char]
$cshow :: CodingFailureMode -> [Char]
showsPrec :: Int -> CodingFailureMode -> ShowS
$cshowsPrec :: Int -> CodingFailureMode -> ShowS
Show)

{-# INLINE replacementChar #-}
replacementChar :: Char
replacementChar :: Char
replacementChar = Char
'\xFFFD'

data UTF8CharDecodeState a
    = UTF8CharDecodeInit
    | UTF8CharDecoding !DecodeState !CodePoint

{-# INLINE parseCharUtf8WithD #-}
parseCharUtf8WithD ::
       Monad m => CodingFailureMode -> ParserD.Parser Word8 m Char
parseCharUtf8WithD :: forall (m :: * -> *).
Monad m =>
CodingFailureMode -> Parser Word8 m Char
parseCharUtf8WithD CodingFailureMode
cfm = forall a (m :: * -> *) b s.
(s -> a -> m (Step s b))
-> m (Initial s b) -> (s -> m (Step s b)) -> Parser a m b
ParserD.Parser (forall {m :: * -> *} {a} {a}.
Monad m =>
Ptr Word8
-> UTF8CharDecodeState a
-> Word8
-> m (Step (UTF8CharDecodeState a) Char)
step' Ptr Word8
utf8d) forall {a} {b}. m (Initial (UTF8CharDecodeState a) b)
initial forall {m :: * -> *} {a} {s}.
Monad m =>
UTF8CharDecodeState a -> m (Step s Char)
extract

    where

    prefix :: [Char]
prefix = [Char]
"Streamly.Internal.Data.Stream.parseCharUtf8WithD:"

    {-# INLINE initial #-}
    initial :: m (Initial (UTF8CharDecodeState a) b)
initial = forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s b. s -> Initial s b
ParserD.IPartial forall a. UTF8CharDecodeState a
UTF8CharDecodeInit

    handleError :: [Char] -> Bool -> Step (UTF8CharDecodeState a) Char
handleError [Char]
err Bool
souldBackTrack =
        case CodingFailureMode
cfm of
            CodingFailureMode
ErrorOnCodingFailure -> forall s b. [Char] -> Step s b
ParserD.Error [Char]
err
            CodingFailureMode
TransliterateCodingFailure ->
                case Bool
souldBackTrack of
                    Bool
True -> forall s b. Int -> b -> Step s b
ParserD.Done Int
1 Char
replacementChar
                    Bool
False -> forall s b. Int -> b -> Step s b
ParserD.Done Int
0 Char
replacementChar
            CodingFailureMode
DropOnCodingFailure ->
                case Bool
souldBackTrack of
                    Bool
True -> forall s b. Int -> s -> Step s b
ParserD.Continue Int
1 forall a. UTF8CharDecodeState a
UTF8CharDecodeInit
                    Bool
False -> forall s b. Int -> s -> Step s b
ParserD.Continue Int
0 forall a. UTF8CharDecodeState a
UTF8CharDecodeInit

    {-# INLINE step' #-}
    step' :: Ptr Word8
-> UTF8CharDecodeState a
-> Word8
-> m (Step (UTF8CharDecodeState a) Char)
step' Ptr Word8
table UTF8CharDecodeState a
UTF8CharDecodeInit Word8
x =
        -- Note: It is important to use a ">" instead of a "<=" test
        -- here for GHC to generate code layout for default branch
        -- prediction for the common case. This is fragile and might
        -- change with the compiler versions, we need a more reliable
        -- "likely" primitive to control branch predication.
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case Word8
x forall a. Ord a => a -> a -> Bool
> Word8
0x7f of
            Bool
False -> forall s b. Int -> b -> Step s b
ParserD.Done Int
0 forall a b. (a -> b) -> a -> b
$ Int -> Char
unsafeChr forall a b. (a -> b) -> a -> b
$ forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
x
            Bool
True ->
                let (Tuple' Word8
sv Int
cp) = Ptr Word8 -> Word8 -> Tuple' Word8 Int
decode0 Ptr Word8
table Word8
x
                 in case Word8
sv of
                        Word8
12 ->
                            let msg :: [Char]
msg = [Char]
prefix
                                    forall a. [a] -> [a] -> [a]
++ [Char]
"Invalid first UTF8 byte" forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Word8
x
                             in forall {a}. [Char] -> Bool -> Step (UTF8CharDecodeState a) Char
handleError [Char]
msg Bool
False
                        Word8
0 -> forall a. (?callStack::CallStack) => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
prefix forall a. [a] -> [a] -> [a]
++ [Char]
"unreachable state"
                        Word8
_ -> forall s b. Int -> s -> Step s b
ParserD.Continue Int
0 (forall a. Word8 -> Int -> UTF8CharDecodeState a
UTF8CharDecoding Word8
sv Int
cp)

    step' Ptr Word8
table (UTF8CharDecoding Word8
statePtr Int
codepointPtr) Word8
x = forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
        let (Tuple' Word8
sv Int
cp) = Ptr Word8 -> Word8 -> Int -> Word8 -> Tuple' Word8 Int
decode1 Ptr Word8
table Word8
statePtr Int
codepointPtr Word8
x
         in case Word8
sv of
            Word8
0 -> forall s b. Int -> b -> Step s b
ParserD.Done Int
0 forall a b. (a -> b) -> a -> b
$ Int -> Char
unsafeChr Int
cp
            Word8
12 ->
                let msg :: [Char]
msg = [Char]
prefix
                        forall a. [a] -> [a] -> [a]
++ [Char]
"Invalid subsequent UTF8 byte"
                        forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Word8
x
                        forall a. [a] -> [a] -> [a]
++ [Char]
"in state"
                        forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Word8
statePtr
                        forall a. [a] -> [a] -> [a]
++ [Char]
"accumulated value"
                        forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Int
codepointPtr
                 in forall {a}. [Char] -> Bool -> Step (UTF8CharDecodeState a) Char
handleError [Char]
msg Bool
True
            Word8
_ -> forall s b. Int -> s -> Step s b
ParserD.Continue Int
0 (forall a. Word8 -> Int -> UTF8CharDecodeState a
UTF8CharDecoding Word8
sv Int
cp)

    {-# INLINE extract #-}
    extract :: UTF8CharDecodeState a -> m (Step s Char)
extract UTF8CharDecodeState a
UTF8CharDecodeInit =  forall a. (?callStack::CallStack) => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
prefix forall a. [a] -> [a] -> [a]
++ [Char]
"Not enough input"
    extract (UTF8CharDecoding Word8
_ Int
_) =
        case CodingFailureMode
cfm of
            CodingFailureMode
ErrorOnCodingFailure ->
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s b. [Char] -> Step s b
ParserD.Error forall a b. (a -> b) -> a -> b
$ [Char]
prefix forall a. [a] -> [a] -> [a]
++ [Char]
"Not enough input"
            CodingFailureMode
TransliterateCodingFailure ->
                forall (m :: * -> *) a. Monad m => a -> m a
return (forall s b. Int -> b -> Step s b
ParserD.Done Int
0 Char
replacementChar)
            -- XXX We shouldn't error out here. There is no way to represent an
            -- empty parser result unless we return a "Maybe" type.
            CodingFailureMode
DropOnCodingFailure -> forall a. (?callStack::CallStack) => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
prefix forall a. [a] -> [a] -> [a]
++ [Char]
"Not enough input"

-- XXX This should ideally accept a "CodingFailureMode" and perform appropriate
-- error handling. This isn't possible now as "TransliterateCodingFailure"'s
-- workflow requires backtracking 1 element. This can be revisited once "Fold"
-- supports backtracking.
{-# INLINE writeCharUtf8' #-}
writeCharUtf8' :: Monad m => Parser Word8 m Char
writeCharUtf8' :: forall (m :: * -> *). Monad m => Parser Word8 m Char
writeCharUtf8' =  forall (m :: * -> *).
Monad m =>
CodingFailureMode -> Parser Word8 m Char
parseCharUtf8WithD CodingFailureMode
ErrorOnCodingFailure

-- XXX The initial idea was to have "parseCharUtf8" and offload the error
-- handling to another parser. So, say we had "parseCharUtf8'",
--
-- >>> parseCharUtf8Smart = parseCharUtf8' <|> Parser.fromPure replacementChar
--
-- But unfortunately parseCharUtf8Smart used in conjunction with "parseMany" -
-- that is "parseMany parseCharUtf8Smart" on a stream causes the heap to
-- overflow. Even a heap size of 500 MB was not sufficient.
--
-- This needs to be investigated futher.
{-# INLINE parseCharUtf8With #-}
parseCharUtf8With ::
       Monad m => CodingFailureMode -> Parser.Parser Word8 m Char
parseCharUtf8With :: forall (m :: * -> *).
Monad m =>
CodingFailureMode -> Parser Word8 m Char
parseCharUtf8With = forall (m :: * -> *).
Monad m =>
CodingFailureMode -> Parser Word8 m Char
parseCharUtf8WithD

-- XXX write it as a parser and use parseMany to decode a stream, need to check
-- if that preserves the same performance. Or we can use a resumable parser
-- that parses a chunk at a time.
--
-- XXX Implement this in terms of decodeUtf8Either. Need to make sure that
-- decodeUtf8Either preserves the performance characterstics.
--
{-# INLINE_NORMAL decodeUtf8WithD #-}
decodeUtf8WithD :: Monad m
    => CodingFailureMode -> D.Stream m Word8 -> D.Stream m Char
decodeUtf8WithD :: forall (m :: * -> *).
Monad m =>
CodingFailureMode -> Stream m Word8 -> Stream m Char
decodeUtf8WithD CodingFailureMode
cfm (D.Stream State StreamK m Word8 -> s -> m (Step s Word8)
step s
state) =
    forall (m :: * -> *) a s.
(State StreamK m a -> s -> m (Step s a)) -> s -> Stream m a
D.Stream (forall {m :: * -> *} {a}.
Ptr Word8
-> State StreamK m a
-> UTF8DecodeState s Char
-> m (Step (UTF8DecodeState s Char) Char)
step' Ptr Word8
utf8d) (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
state)

    where

    prefix :: [Char]
prefix = [Char]
"Streamly.Internal.Data.Stream.decodeUtf8With: "

    {-# INLINE handleError #-}
    handleError :: [Char] -> UTF8DecodeState s Char -> UTF8DecodeState s Char
handleError [Char]
e UTF8DecodeState s Char
s =
        case CodingFailureMode
cfm of
            CodingFailureMode
ErrorOnCodingFailure -> forall a. (?callStack::CallStack) => [Char] -> a
error [Char]
e
            CodingFailureMode
TransliterateCodingFailure -> forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue Char
replacementChar UTF8DecodeState s Char
s
            CodingFailureMode
DropOnCodingFailure -> UTF8DecodeState s Char
s

    {-# INLINE handleUnderflow #-}
    handleUnderflow :: UTF8DecodeState s Char
handleUnderflow =
        case CodingFailureMode
cfm of
            CodingFailureMode
ErrorOnCodingFailure -> forall a. (?callStack::CallStack) => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
prefix forall a. [a] -> [a] -> [a]
++ [Char]
"Not enough input"
            CodingFailureMode
TransliterateCodingFailure -> forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue Char
replacementChar forall s a. UTF8DecodeState s a
Done
            CodingFailureMode
DropOnCodingFailure -> forall s a. UTF8DecodeState s a
Done

    {-# INLINE_LATE step' #-}
    step' :: Ptr Word8
-> State StreamK m a
-> UTF8DecodeState s Char
-> m (Step (UTF8DecodeState s Char) Char)
step' Ptr Word8
_ State StreamK m a
gst (UTF8DecodeInit s
st) = do
        Step s Word8
r <- State StreamK m Word8 -> s -> m (Step s Word8)
step (forall (t :: (* -> *) -> * -> *) (m :: * -> *) a (n :: * -> *) b.
State t m a -> State t n b
adaptState State StreamK m a
gst) s
st
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case Step s Word8
r of
            Yield Word8
x s
s -> forall s a. s -> Step s a
Skip (forall s a. s -> Word8 -> UTF8DecodeState s a
UTF8DecodeInit1 s
s Word8
x)
            Skip s
s -> forall s a. s -> Step s a
Skip (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
s)
            Step s Word8
Stop   -> forall s a. s -> Step s a
Skip forall s a. UTF8DecodeState s a
Done

    step' Ptr Word8
_ State StreamK m a
_ (UTF8DecodeInit1 s
st Word8
x) = do
        -- Note: It is important to use a ">" instead of a "<=" test
        -- here for GHC to generate code layout for default branch
        -- prediction for the common case. This is fragile and might
        -- change with the compiler versions, we need a more reliable
        -- "likely" primitive to control branch predication.
        case Word8
x forall a. Ord a => a -> a -> Bool
> Word8
0x7f of
            Bool
False ->
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue
                    (Int -> Char
unsafeChr (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
x))
                    (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
st)
            -- Using a separate state here generates a jump to a
            -- separate code block in the core which seems to perform
            -- slightly better for the non-ascii case.
            Bool
True -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. s -> Word8 -> UTF8DecodeState s a
UTF8DecodeFirst s
st Word8
x

    -- XXX should we merge it with UTF8DecodeInit1?
    step' Ptr Word8
table State StreamK m a
_ (UTF8DecodeFirst s
st Word8
x) = do
        let (Tuple' Word8
sv Int
cp) = Ptr Word8 -> Word8 -> Tuple' Word8 Int
decode0 Ptr Word8
table Word8
x
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
            case Word8
sv of
                Word8
12 ->
                    let msg :: [Char]
msg = [Char]
prefix forall a. [a] -> [a] -> [a]
++ [Char]
"Invalid first UTF8 byte " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Word8
x
                     in forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall {s}.
[Char] -> UTF8DecodeState s Char -> UTF8DecodeState s Char
handleError [Char]
msg (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
st)
                Word8
0 -> forall a. (?callStack::CallStack) => [Char] -> a
error [Char]
"unreachable state"
                Word8
_ -> forall s a. s -> Step s a
Skip (forall s a. s -> Word8 -> Int -> UTF8DecodeState s a
UTF8Decoding s
st Word8
sv Int
cp)

    -- We recover by trying the new byte x as a starter of a new codepoint.
    -- XXX need to use the same recovery in array decoding routine as well
    step' Ptr Word8
table State StreamK m a
gst (UTF8Decoding s
st Word8
statePtr Int
codepointPtr) = do
        Step s Word8
r <- State StreamK m Word8 -> s -> m (Step s Word8)
step (forall (t :: (* -> *) -> * -> *) (m :: * -> *) a (n :: * -> *) b.
State t m a -> State t n b
adaptState State StreamK m a
gst) s
st
        case Step s Word8
r of
            Yield Word8
x s
s -> do
                let (Tuple' Word8
sv Int
cp) = Ptr Word8 -> Word8 -> Int -> Word8 -> Tuple' Word8 Int
decode1 Ptr Word8
table Word8
statePtr Int
codepointPtr Word8
x
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case Word8
sv of
                    Word8
0 -> forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s a. a -> UTF8DecodeState s a -> UTF8DecodeState s a
YieldAndContinue
                            (Int -> Char
unsafeChr Int
cp) (forall s a. s -> UTF8DecodeState s a
UTF8DecodeInit s
s)
                    Word8
12 ->
                        let msg :: [Char]
msg = [Char]
prefix
                                forall a. [a] -> [a] -> [a]
++ [Char]
"Invalid subsequent UTF8 byte "
                                forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Word8
x
                                forall a. [a] -> [a] -> [a]
++ [Char]
" in state "
                                forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Word8
statePtr
                                forall a. [a] -> [a] -> [a]
++ [Char]
" accumulated value "
                                forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Int
codepointPtr
                         in forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall {s}.
[Char] -> UTF8DecodeState s Char -> UTF8DecodeState s Char
handleError [Char]
msg (forall s a. s -> Word8 -> UTF8DecodeState s a
UTF8DecodeInit1 s
s Word8
x)
                    Word8
_ -> forall s a. s -> Step s a
Skip (forall s a. s -> Word8 -> Int -> UTF8DecodeState s a
UTF8Decoding s
s Word8
sv Int
cp)
            Skip s
s -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
                forall s a. s -> Step s a
Skip (forall s a. s -> Word8 -> Int -> UTF8DecodeState s a
UTF8Decoding s
s Word8
statePtr Int
codepointPtr)
            Step s Word8
Stop -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall {s}. UTF8DecodeState s Char
handleUnderflow

    step' Ptr Word8
_ State StreamK m a
_ (YieldAndContinue Char
c UTF8DecodeState s Char
s) = forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. a -> s -> Step s a
Yield Char
c UTF8DecodeState s Char
s
    step' Ptr Word8
_ State StreamK m a
_ UTF8DecodeState s Char
Done = forall (m :: * -> *) a. Monad m => a -> m a
return forall s a. Step s a
Stop

{-# INLINE decodeUtf8D #-}
decodeUtf8D :: Monad m => D.Stream m Word8 -> D.Stream m Char
decodeUtf8D :: forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8D = forall (m :: * -> *).
Monad m =>
CodingFailureMode -> Stream m Word8 -> Stream m Char
decodeUtf8WithD CodingFailureMode
TransliterateCodingFailure

-- | Decode a UTF-8 encoded bytestream to a stream of Unicode characters.
-- Any invalid codepoint encountered is replaced with the unicode replacement
-- character.
--
{-# INLINE decodeUtf8 #-}
decodeUtf8 :: Monad m => Stream m Word8 -> Stream m Char
decodeUtf8 :: forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8 = forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8D

{-# INLINE decodeUtf8D' #-}
decodeUtf8D' :: Monad m => D.Stream m Word8 -> D.Stream m Char
decodeUtf8D' :: forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8D' = forall (m :: * -> *).
Monad m =>
CodingFailureMode -> Stream m Word8 -> Stream m Char
decodeUtf8WithD CodingFailureMode
ErrorOnCodingFailure

-- | Decode a UTF-8 encoded bytestream to a stream of Unicode characters.
-- The function throws an error if an invalid codepoint is encountered.
--
{-# INLINE decodeUtf8' #-}
decodeUtf8' :: Monad m => Stream m Word8 -> Stream m Char
decodeUtf8' :: forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8' = forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8D'

{-# INLINE decodeUtf8D_ #-}
decodeUtf8D_ :: Monad m => D.Stream m Word8 -> D.Stream m Char
decodeUtf8D_ :: forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8D_ = forall (m :: * -> *).
Monad m =>
CodingFailureMode -> Stream m Word8 -> Stream m Char
decodeUtf8WithD CodingFailureMode
DropOnCodingFailure

-- | Decode a UTF-8 encoded bytestream to a stream of Unicode characters.
-- Any invalid codepoint encountered is dropped.
--
{-# INLINE decodeUtf8_ #-}
decodeUtf8_ :: Monad m => Stream m Word8 -> Stream m Char
decodeUtf8_ :: forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8_ = forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8D_

-- | Same as 'decodeUtf8'
--
{-# DEPRECATED decodeUtf8Lax "Please use 'decodeUtf8' instead" #-}
{-# INLINE decodeUtf8Lax #-}
decodeUtf8Lax :: Monad m => Stream m Word8 -> Stream m Char
decodeUtf8Lax :: forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8Lax = forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8

-------------------------------------------------------------------------------
-- Decoding Array Streams
-------------------------------------------------------------------------------

#ifndef __GHCJS__
{-# ANN type FlattenState Fuse #-}
#endif
data FlattenState s
    = OuterLoop s !(Maybe (DecodeState, CodePoint))
    | InnerLoopDecodeInit s MutByteArray !Int !Int
    | InnerLoopDecodeFirst s MutByteArray !Int !Int Word8
    | InnerLoopDecoding s MutByteArray !Int !Int
        !DecodeState !CodePoint
    | YAndC !Char (FlattenState s)   -- These constructors can be
                                     -- encoded in the UTF8DecodeState
                                     -- type, I prefer to keep these
                                     -- flat even though that means
                                     -- coming up with new names
    | D

-- The normal decodeUtf8 above should fuse with flattenArrays
-- to create this exact code but it doesn't for some reason, as of now this
-- remains the fastest way I could figure out to decodeUtf8.
--
-- XXX Add Proper error messages
{-# INLINE_NORMAL decodeUtf8ArraysWithD #-}
decodeUtf8ArraysWithD ::
       MonadIO m
    => CodingFailureMode
    -> D.Stream m (Array Word8)
    -> D.Stream m Char
decodeUtf8ArraysWithD :: forall (m :: * -> *).
MonadIO m =>
CodingFailureMode -> Stream m (Array Word8) -> Stream m Char
decodeUtf8ArraysWithD CodingFailureMode
cfm (D.Stream State StreamK m (Array Word8) -> s -> m (Step s (Array Word8))
step s
state) =
    forall (m :: * -> *) a s.
(State StreamK m a -> s -> m (Step s a)) -> s -> Stream m a
D.Stream (forall {m :: * -> *} {a}.
Ptr Word8
-> State StreamK m a
-> FlattenState s
-> m (Step (FlattenState s) Char)
step' Ptr Word8
utf8d) (forall s. s -> Maybe (Word8, Int) -> FlattenState s
OuterLoop s
state forall a. Maybe a
Nothing)
  where
    {-# INLINE transliterateOrError #-}
    transliterateOrError :: [Char] -> FlattenState s -> FlattenState s
transliterateOrError [Char]
e FlattenState s
s =
        case CodingFailureMode
cfm of
            CodingFailureMode
ErrorOnCodingFailure -> forall a. (?callStack::CallStack) => [Char] -> a
error [Char]
e
            CodingFailureMode
TransliterateCodingFailure -> forall s. Char -> FlattenState s -> FlattenState s
YAndC Char
replacementChar FlattenState s
s
            CodingFailureMode
DropOnCodingFailure -> FlattenState s
s
    {-# INLINE inputUnderflow #-}
    inputUnderflow :: FlattenState s
inputUnderflow =
        case CodingFailureMode
cfm of
            CodingFailureMode
ErrorOnCodingFailure ->
                forall a. (?callStack::CallStack) => [Char] -> a
error forall a b. (a -> b) -> a -> b
$
                forall a. Show a => a -> [Char]
show [Char]
"Streamly.Internal.Data.Stream."
                forall a. [a] -> [a] -> [a]
++ [Char]
"decodeUtf8ArraysWith: Input Underflow"
            CodingFailureMode
TransliterateCodingFailure -> forall s. Char -> FlattenState s -> FlattenState s
YAndC Char
replacementChar forall s. FlattenState s
D
            CodingFailureMode
DropOnCodingFailure -> forall s. FlattenState s
D
    {-# INLINE_LATE step' #-}
    step' :: Ptr Word8
-> State StreamK m a
-> FlattenState s
-> m (Step (FlattenState s) Char)
step' Ptr Word8
_ State StreamK m a
gst (OuterLoop s
st Maybe (Word8, Int)
Nothing) = do
        Step s (Array Word8)
r <- State StreamK m (Array Word8) -> s -> m (Step s (Array Word8))
step (forall (t :: (* -> *) -> * -> *) (m :: * -> *) a (n :: * -> *) b.
State t m a -> State t n b
adaptState State StreamK m a
gst) s
st
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
            case Step s (Array Word8)
r of
                Yield Array {Int
MutByteArray
arrEnd :: forall a. Array a -> Int
arrStart :: forall a. Array a -> Int
arrContents :: forall a. Array a -> MutByteArray
arrEnd :: Int
arrStart :: Int
arrContents :: MutByteArray
..} s
s ->
                     forall s a. s -> Step s a
Skip (forall s. s -> MutByteArray -> Int -> Int -> FlattenState s
InnerLoopDecodeInit s
s MutByteArray
arrContents Int
arrStart Int
arrEnd)
                Skip s
s -> forall s a. s -> Step s a
Skip (forall s. s -> Maybe (Word8, Int) -> FlattenState s
OuterLoop s
s forall a. Maybe a
Nothing)
                Step s (Array Word8)
Stop -> forall s a. s -> Step s a
Skip forall s. FlattenState s
D
    step' Ptr Word8
_ State StreamK m a
gst (OuterLoop s
st dst :: Maybe (Word8, Int)
dst@(Just (Word8
ds, Int
cp))) = do
        Step s (Array Word8)
r <- State StreamK m (Array Word8) -> s -> m (Step s (Array Word8))
step (forall (t :: (* -> *) -> * -> *) (m :: * -> *) a (n :: * -> *) b.
State t m a -> State t n b
adaptState State StreamK m a
gst) s
st
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
            case Step s (Array Word8)
r of
                Yield Array {Int
MutByteArray
arrEnd :: Int
arrStart :: Int
arrContents :: MutByteArray
arrEnd :: forall a. Array a -> Int
arrStart :: forall a. Array a -> Int
arrContents :: forall a. Array a -> MutByteArray
..} s
s ->
                     forall s a. s -> Step s a
Skip (forall s.
s -> MutByteArray -> Int -> Int -> Word8 -> Int -> FlattenState s
InnerLoopDecoding s
s MutByteArray
arrContents Int
arrStart Int
arrEnd Word8
ds Int
cp)
                Skip s
s -> forall s a. s -> Step s a
Skip (forall s. s -> Maybe (Word8, Int) -> FlattenState s
OuterLoop s
s Maybe (Word8, Int)
dst)
                Step s (Array Word8)
Stop -> forall s a. s -> Step s a
Skip forall s. FlattenState s
inputUnderflow
    step' Ptr Word8
_ State StreamK m a
_ (InnerLoopDecodeInit s
st MutByteArray
_ Int
p Int
end)
        | Int
p forall a. Eq a => a -> a -> Bool
== Int
end = do
            forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s. s -> Maybe (Word8, Int) -> FlattenState s
OuterLoop s
st forall a. Maybe a
Nothing
    step' Ptr Word8
_ State StreamK m a
_ (InnerLoopDecodeInit s
st MutByteArray
contents Int
p Int
end) = do
        Word8
x <- forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall a b. (a -> b) -> a -> b
$ forall a. Unbox a => Int -> MutByteArray -> IO a
peekAt Int
p MutByteArray
contents
        -- Note: It is important to use a ">" instead of a "<=" test here for
        -- GHC to generate code layout for default branch prediction for the
        -- common case. This is fragile and might change with the compiler
        -- versions, we need a more reliable "likely" primitive to control
        -- branch predication.
        case Word8
x forall a. Ord a => a -> a -> Bool
> Word8
0x7f of
            Bool
False ->
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s. Char -> FlattenState s -> FlattenState s
YAndC
                    (Int -> Char
unsafeChr (forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
x))
                    (forall s. s -> MutByteArray -> Int -> Int -> FlattenState s
InnerLoopDecodeInit s
st MutByteArray
contents (Int
p forall a. Num a => a -> a -> a
+ Int
1) Int
end)
            -- Using a separate state here generates a jump to a separate code
            -- block in the core which seems to perform slightly better for the
            -- non-ascii case.
            Bool
True -> forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s.
s -> MutByteArray -> Int -> Int -> Word8 -> FlattenState s
InnerLoopDecodeFirst s
st MutByteArray
contents Int
p Int
end Word8
x

    step' Ptr Word8
table State StreamK m a
_ (InnerLoopDecodeFirst s
st MutByteArray
contents Int
p Int
end Word8
x) = do
        let (Tuple' Word8
sv Int
cp) = Ptr Word8 -> Word8 -> Tuple' Word8 Int
decode0 Ptr Word8
table Word8
x
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
            case Word8
sv of
                Word8
12 ->
                    forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$
                    forall {s}. [Char] -> FlattenState s -> FlattenState s
transliterateOrError
                        (
                           [Char]
"Streamly.Internal.Data.Stream."
                        forall a. [a] -> [a] -> [a]
++ [Char]
"decodeUtf8ArraysWith: Invalid UTF8"
                        forall a. [a] -> [a] -> [a]
++ [Char]
" codepoint encountered"
                        )
                        (forall s. s -> MutByteArray -> Int -> Int -> FlattenState s
InnerLoopDecodeInit s
st MutByteArray
contents (Int
p forall a. Num a => a -> a -> a
+ Int
1) Int
end)
                Word8
0 -> forall a. (?callStack::CallStack) => [Char] -> a
error [Char]
"unreachable state"
                Word8
_ -> forall s a. s -> Step s a
Skip (forall s.
s -> MutByteArray -> Int -> Int -> Word8 -> Int -> FlattenState s
InnerLoopDecoding s
st MutByteArray
contents (Int
p forall a. Num a => a -> a -> a
+ Int
1) Int
end Word8
sv Int
cp)
    step' Ptr Word8
_ State StreamK m a
_ (InnerLoopDecoding s
st MutByteArray
_ Int
p Int
end Word8
sv Int
cp)
        | Int
p forall a. Eq a => a -> a -> Bool
== Int
end = forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$ forall s. s -> Maybe (Word8, Int) -> FlattenState s
OuterLoop s
st (forall a. a -> Maybe a
Just (Word8
sv, Int
cp))
    step' Ptr Word8
table State StreamK m a
_ (InnerLoopDecoding s
st MutByteArray
contents Int
p Int
end Word8
statePtr Int
codepointPtr) = do
        Word8
x <- forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall a b. (a -> b) -> a -> b
$ forall a. Unbox a => Int -> MutByteArray -> IO a
peekAt Int
p MutByteArray
contents
        let (Tuple' Word8
sv Int
cp) = Ptr Word8 -> Word8 -> Int -> Word8 -> Tuple' Word8 Int
decode1 Ptr Word8
table Word8
statePtr Int
codepointPtr Word8
x
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$
            case Word8
sv of
                Word8
0 ->
                    forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$
                    forall s. Char -> FlattenState s -> FlattenState s
YAndC
                        (Int -> Char
unsafeChr Int
cp)
                        (forall s. s -> MutByteArray -> Int -> Int -> FlattenState s
InnerLoopDecodeInit s
st MutByteArray
contents (Int
p forall a. Num a => a -> a -> a
+ Int
1) Int
end)
                Word8
12 ->
                    forall s a. s -> Step s a
Skip forall a b. (a -> b) -> a -> b
$
                    forall {s}. [Char] -> FlattenState s -> FlattenState s
transliterateOrError
                        (
                           [Char]
"Streamly.Internal.Data.Stream."
                        forall a. [a] -> [a] -> [a]
++ [Char]
"decodeUtf8ArraysWith: Invalid UTF8"
                        forall a. [a] -> [a] -> [a]
++ [Char]
" codepoint encountered"
                        )
                        (forall s. s -> MutByteArray -> Int -> Int -> FlattenState s
InnerLoopDecodeInit s
st MutByteArray
contents (Int
p forall a. Num a => a -> a -> a
+ Int
1) Int
end)
                Word8
_ ->
                    forall s a. s -> Step s a
Skip
                    (forall s.
s -> MutByteArray -> Int -> Int -> Word8 -> Int -> FlattenState s
InnerLoopDecoding s
st MutByteArray
contents (Int
p forall a. Num a => a -> a -> a
+ Int
1) Int
end Word8
sv Int
cp)
    step' Ptr Word8
_ State StreamK m a
_ (YAndC Char
c FlattenState s
s) = forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. a -> s -> Step s a
Yield Char
c FlattenState s
s
    step' Ptr Word8
_ State StreamK m a
_ FlattenState s
D = forall (m :: * -> *) a. Monad m => a -> m a
return forall s a. Step s a
Stop

-- | Like 'decodeUtf8' but for a chunked stream. It may be slightly faster than
-- flattening the stream and then decoding with 'decodeUtf8'.
{-# INLINE decodeUtf8Chunks #-}
decodeUtf8Chunks ::
       MonadIO m
    => D.Stream m (Array Word8)
    -> D.Stream m Char
decodeUtf8Chunks :: forall (m :: * -> *).
MonadIO m =>
Stream m (Array Word8) -> Stream m Char
decodeUtf8Chunks = forall (m :: * -> *).
MonadIO m =>
CodingFailureMode -> Stream m (Array Word8) -> Stream m Char
decodeUtf8ArraysWithD CodingFailureMode
TransliterateCodingFailure

-- | Like 'decodeUtf8\'' but for a chunked stream. It may be slightly faster
-- than flattening the stream and then decoding with 'decodeUtf8\''.
{-# INLINE decodeUtf8Chunks' #-}
decodeUtf8Chunks' ::
       MonadIO m
    => D.Stream m (Array Word8)
    -> D.Stream m Char
decodeUtf8Chunks' :: forall (m :: * -> *).
MonadIO m =>
Stream m (Array Word8) -> Stream m Char
decodeUtf8Chunks' = forall (m :: * -> *).
MonadIO m =>
CodingFailureMode -> Stream m (Array Word8) -> Stream m Char
decodeUtf8ArraysWithD CodingFailureMode
ErrorOnCodingFailure

-- | Like 'decodeUtf8_' but for a chunked stream. It may be slightly faster
-- than flattening the stream and then decoding with 'decodeUtf8_'.
{-# INLINE decodeUtf8Chunks_ #-}
decodeUtf8Chunks_ ::
       MonadIO m
    => D.Stream m (Array Word8)
    -> D.Stream m Char
decodeUtf8Chunks_ :: forall (m :: * -> *).
MonadIO m =>
Stream m (Array Word8) -> Stream m Char
decodeUtf8Chunks_ = forall (m :: * -> *).
MonadIO m =>
CodingFailureMode -> Stream m (Array Word8) -> Stream m Char
decodeUtf8ArraysWithD CodingFailureMode
DropOnCodingFailure

-------------------------------------------------------------------------------
-- Encoding Unicode (UTF-8) Characters
-------------------------------------------------------------------------------

data WList = WCons !Word8 !WList | WNil

-- UTF-8 primitives, Lifted from GHC.IO.Encoding.UTF8.

{-# INLINE ord2 #-}
ord2 :: Char -> WList
ord2 :: Char -> WList
ord2 Char
c = forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
n forall a. Ord a => a -> a -> Bool
>= Int
0x80 Bool -> Bool -> Bool
&& Int
n forall a. Ord a => a -> a -> Bool
<= Int
0x07ff) (Word8 -> WList -> WList
WCons Word8
x1 (Word8 -> WList -> WList
WCons Word8
x2 WList
WNil))
  where
    n :: Int
n = Char -> Int
ord Char
c
    x1 :: Word8
x1 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ (Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
6) forall a. Num a => a -> a -> a
+ Int
0xC0
    x2 :: Word8
x2 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ (Int
n forall a. Bits a => a -> a -> a
.&. Int
0x3F) forall a. Num a => a -> a -> a
+ Int
0x80

{-# INLINE ord3 #-}
ord3 :: Char -> WList
ord3 :: Char -> WList
ord3 Char
c = forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
n forall a. Ord a => a -> a -> Bool
>= Int
0x0800 Bool -> Bool -> Bool
&& Int
n forall a. Ord a => a -> a -> Bool
<= Int
0xffff) (Word8 -> WList -> WList
WCons Word8
x1 (Word8 -> WList -> WList
WCons Word8
x2 (Word8 -> WList -> WList
WCons Word8
x3 WList
WNil)))
  where
    n :: Int
n = Char -> Int
ord Char
c
    x1 :: Word8
x1 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ (Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
12) forall a. Num a => a -> a -> a
+ Int
0xE0
    x2 :: Word8
x2 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ ((Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
6) forall a. Bits a => a -> a -> a
.&. Int
0x3F) forall a. Num a => a -> a -> a
+ Int
0x80
    x3 :: Word8
x3 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ (Int
n forall a. Bits a => a -> a -> a
.&. Int
0x3F) forall a. Num a => a -> a -> a
+ Int
0x80

{-# INLINE ord4 #-}
ord4 :: Char -> WList
ord4 :: Char -> WList
ord4 Char
c = forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
n forall a. Ord a => a -> a -> Bool
>= Int
0x10000)  (Word8 -> WList -> WList
WCons Word8
x1 (Word8 -> WList -> WList
WCons Word8
x2 (Word8 -> WList -> WList
WCons Word8
x3 (Word8 -> WList -> WList
WCons Word8
x4 WList
WNil))))
  where
    n :: Int
n = Char -> Int
ord Char
c
    x1 :: Word8
x1 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ (Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
18) forall a. Num a => a -> a -> a
+ Int
0xF0
    x2 :: Word8
x2 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ ((Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
12) forall a. Bits a => a -> a -> a
.&. Int
0x3F) forall a. Num a => a -> a -> a
+ Int
0x80
    x3 :: Word8
x3 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ ((Int
n forall a. Bits a => a -> Int -> a
`shiftR` Int
6) forall a. Bits a => a -> a -> a
.&. Int
0x3F) forall a. Num a => a -> a -> a
+ Int
0x80
    x4 :: Word8
x4 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall a b. (a -> b) -> a -> b
$ (Int
n forall a. Bits a => a -> a -> a
.&. Int
0x3F) forall a. Num a => a -> a -> a
+ Int
0x80

{-# INLINE_NORMAL readCharUtf8With #-}
readCharUtf8With :: Monad m => WList -> Unfold m Char Word8
readCharUtf8With :: forall (m :: * -> *). Monad m => WList -> Unfold m Char Word8
readCharUtf8With WList
surr = forall (m :: * -> *) a b s.
(s -> m (Step s b)) -> (a -> m s) -> Unfold m a b
Unfold forall {m :: * -> *}. Monad m => WList -> m (Step WList Word8)
step forall {m :: * -> *}. Monad m => Char -> m WList
inject

    where

    inject :: Char -> m WList
inject Char
c =
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case Char -> Int
ord Char
c of
            Int
x | Int
x forall a. Ord a => a -> a -> Bool
<= Int
0x7F -> forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
x Word8 -> WList -> WList
`WCons` WList
WNil
              | Int
x forall a. Ord a => a -> a -> Bool
<= Int
0x7FF -> Char -> WList
ord2 Char
c
              | Int
x forall a. Ord a => a -> a -> Bool
<= Int
0xFFFF -> if Char -> Bool
isSurrogate Char
c then WList
surr else Char -> WList
ord3 Char
c
              | Bool
otherwise -> Char -> WList
ord4 Char
c

    {-# INLINE_LATE step #-}
    step :: WList -> m (Step WList Word8)
step WList
WNil = forall (m :: * -> *) a. Monad m => a -> m a
return forall s a. Step s a
Stop
    step (WCons Word8
x WList
xs) = forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall s a. a -> s -> Step s a
Yield Word8
x WList
xs

{-# INLINE_NORMAL readCharUtf8' #-}
readCharUtf8' :: Monad m => Unfold m Char Word8
readCharUtf8' :: forall (m :: * -> *). Monad m => Unfold m Char Word8
readCharUtf8' =
    forall (m :: * -> *). Monad m => WList -> Unfold m Char Word8
readCharUtf8With forall a b. (a -> b) -> a -> b
$
        forall a. (?callStack::CallStack) => [Char] -> a
error [Char]
"Streamly.Internal.Unicode.readCharUtf8': Encountered a surrogate"

-- More yield points improve performance, but I am not sure if they can cause
-- too much code bloat or some trouble with fusion. So keeping only two yield
-- points for now, one for the ascii chars (fast path) and one for all other
-- paths (slow path).
{-# INLINE_NORMAL encodeUtf8D' #-}
encodeUtf8D' :: Monad m => D.Stream m Char -> D.Stream m Word8
encodeUtf8D' :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8D' = forall (m :: * -> *) a b.
Monad m =>
Unfold m a b -> Stream m a -> Stream m b
D.unfoldMany forall (m :: * -> *). Monad m => Unfold m Char Word8
readCharUtf8'

-- | Encode a stream of Unicode characters to a UTF-8 encoded bytestream. When
-- any invalid character (U+D800-U+D8FF) is encountered in the input stream the
-- function errors out.
--
{-# INLINE encodeUtf8' #-}
encodeUtf8' :: Monad m => Stream m Char -> Stream m Word8
encodeUtf8' :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8' = forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8D'

{-# INLINE_NORMAL readCharUtf8 #-}
readCharUtf8 :: Monad m => Unfold m Char Word8
readCharUtf8 :: forall (m :: * -> *). Monad m => Unfold m Char Word8
readCharUtf8 = forall (m :: * -> *). Monad m => WList -> Unfold m Char Word8
readCharUtf8With forall a b. (a -> b) -> a -> b
$ Word8 -> WList -> WList
WCons Word8
239 (Word8 -> WList -> WList
WCons Word8
191 (Word8 -> WList -> WList
WCons Word8
189 WList
WNil))

-- | See section "3.9 Unicode Encoding Forms" in
-- https://www.unicode.org/versions/Unicode13.0.0/UnicodeStandard-13.0.pdf
--
{-# INLINE_NORMAL encodeUtf8D #-}
encodeUtf8D :: Monad m => D.Stream m Char -> D.Stream m Word8
encodeUtf8D :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8D = forall (m :: * -> *) a b.
Monad m =>
Unfold m a b -> Stream m a -> Stream m b
D.unfoldMany forall (m :: * -> *). Monad m => Unfold m Char Word8
readCharUtf8

-- | Encode a stream of Unicode characters to a UTF-8 encoded bytestream. Any
-- Invalid characters (U+D800-U+D8FF) in the input stream are replaced by the
-- Unicode replacement character U+FFFD.
--
{-# INLINE encodeUtf8 #-}
encodeUtf8 :: Monad m => Stream m Char -> Stream m Word8
encodeUtf8 :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8 = forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8D

{-# INLINE_NORMAL readCharUtf8_ #-}
readCharUtf8_ :: Monad m => Unfold m Char Word8
readCharUtf8_ :: forall (m :: * -> *). Monad m => Unfold m Char Word8
readCharUtf8_ = forall (m :: * -> *). Monad m => WList -> Unfold m Char Word8
readCharUtf8With WList
WNil

{-# INLINE_NORMAL encodeUtf8D_ #-}
encodeUtf8D_ :: Monad m => D.Stream m Char -> D.Stream m Word8
encodeUtf8D_ :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8D_ = forall (m :: * -> *) a b.
Monad m =>
Unfold m a b -> Stream m a -> Stream m b
D.unfoldMany forall (m :: * -> *). Monad m => Unfold m Char Word8
readCharUtf8_

-- | Encode a stream of Unicode characters to a UTF-8 encoded bytestream. Any
-- Invalid characters (U+D800-U+D8FF) in the input stream are dropped.
--
{-# INLINE encodeUtf8_ #-}
encodeUtf8_ :: Monad m => Stream m Char -> Stream m Word8
encodeUtf8_ :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8_ = forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8D_

-- | Same as 'encodeUtf8'
--
{-# DEPRECATED encodeUtf8Lax "Please use 'encodeUtf8' instead" #-}
{-# INLINE encodeUtf8Lax #-}
encodeUtf8Lax :: Monad m => Stream m Char -> Stream m Word8
encodeUtf8Lax :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8Lax = forall (m :: * -> *). Monad m => Stream m Char -> Stream m Word8
encodeUtf8

-------------------------------------------------------------------------------
-- Decoding string literals
-------------------------------------------------------------------------------

-- | Read UTF-8 encoded bytes as chars from an 'Addr#' until a 0 byte is
-- encountered, the 0 byte is not included in the stream.
--
-- /Unsafe:/ The caller is responsible for safe addressing.
--
-- Note that this is completely safe when reading from Haskell string
-- literals because they are guaranteed to be NULL terminated:
--
-- >>> Stream.fold Fold.toList (Unicode.fromStr# "Haskell"#)
-- "Haskell"
--
{-# INLINE fromStr# #-}
fromStr# :: MonadIO m => Addr# -> Stream m Char
fromStr# :: forall (m :: * -> *). MonadIO m => Addr# -> Stream m Char
fromStr# Addr#
addr = forall (m :: * -> *). Monad m => Stream m Word8 -> Stream m Char
decodeUtf8 forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *). Monad m => Addr# -> Stream m Word8
Stream.fromByteStr# Addr#
addr

-------------------------------------------------------------------------------
-- Encode streams of containers
-------------------------------------------------------------------------------

-- | Encode a container to @Array Word8@ provided an unfold to covert it to a
-- Char stream and an encoding function.
--
-- /Internal/
{-# INLINE encodeObject #-}
encodeObject :: MonadIO m =>
       (Stream m Char -> Stream m Word8)
    -> Unfold m a Char
    -> a
    -> m (Array Word8)
encodeObject :: forall (m :: * -> *) a.
MonadIO m =>
(Stream m Char -> Stream m Word8)
-> Unfold m a Char -> a -> m (Array Word8)
encodeObject Stream m Char -> Stream m Word8
encode Unfold m a Char
u = forall (m :: * -> *) a b.
Monad m =>
Fold m a b -> Stream m a -> m b
Stream.fold forall (m :: * -> *) a. (MonadIO m, Unbox a) => Fold m a (Array a)
Array.write forall b c a. (b -> c) -> (a -> b) -> a -> c
. Stream m Char -> Stream m Word8
encode forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (m :: * -> *) a b.
Applicative m =>
Unfold m a b -> a -> Stream m b
Stream.unfold Unfold m a Char
u

-- | Encode a stream of container objects using the supplied encoding scheme.
-- Each object is encoded as an @Array Word8@.
--
-- /Internal/
{-# INLINE encodeObjects #-}
encodeObjects :: MonadIO m =>
       (Stream m Char -> Stream m Word8)
    -> Unfold m a Char
    -> Stream m a
    -> Stream m (Array Word8)
encodeObjects :: forall (m :: * -> *) a.
MonadIO m =>
(Stream m Char -> Stream m Word8)
-> Unfold m a Char -> Stream m a -> Stream m (Array Word8)
encodeObjects Stream m Char -> Stream m Word8
encode Unfold m a Char
u = forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Stream m a -> Stream m b
Stream.mapM (forall (m :: * -> *) a.
MonadIO m =>
(Stream m Char -> Stream m Word8)
-> Unfold m a Char -> a -> m (Array Word8)
encodeObject Stream m Char -> Stream m Word8
encode Unfold m a Char
u)

-- | Encode a stream of 'String' using the supplied encoding scheme. Each
-- string is encoded as an @Array Word8@.
--
{-# INLINE encodeStrings #-}
encodeStrings :: MonadIO m =>
       (Stream m Char -> Stream m Word8)
    -> Stream m String
    -> Stream m (Array Word8)
encodeStrings :: forall (m :: * -> *).
MonadIO m =>
(Stream m Char -> Stream m Word8)
-> Stream m [Char] -> Stream m (Array Word8)
encodeStrings Stream m Char -> Stream m Word8
encode = forall (m :: * -> *) a.
MonadIO m =>
(Stream m Char -> Stream m Word8)
-> Unfold m a Char -> Stream m a -> Stream m (Array Word8)
encodeObjects Stream m Char -> Stream m Word8
encode forall (m :: * -> *) a. Applicative m => Unfold m [a] a
Unfold.fromList

{-
-------------------------------------------------------------------------------
-- Utility operations on strings
-------------------------------------------------------------------------------

strip :: IsStream t => Stream m Char -> Stream m Char
strip = undefined

stripTail :: IsStream t => Stream m Char -> Stream m Char
stripTail = undefined
-}

-- | Remove leading whitespace from a string.
--
-- > stripHead = Stream.dropWhile isSpace
--
-- /Pre-release/
{-# INLINE stripHead #-}
stripHead :: Monad m => Stream m Char -> Stream m Char
stripHead :: forall (m :: * -> *). Monad m => Stream m Char -> Stream m Char
stripHead = forall (m :: * -> *) a.
Monad m =>
(a -> Bool) -> Stream m a -> Stream m a
Stream.dropWhile Char -> Bool
isSpace

-- | Fold each line of the stream using the supplied 'Fold'
-- and stream the result.
--
-- >>> Stream.fold Fold.toList $ Unicode.lines Fold.toList (Stream.fromList "lines\nthis\nstring\n\n\n")
-- ["lines","this","string","",""]
--
-- > lines = Stream.splitOnSuffix (== '\n')
--
-- /Pre-release/
{-# INLINE lines #-}
lines :: Monad m => Fold m Char b -> Stream m Char -> Stream m b
lines :: forall (m :: * -> *) b.
Monad m =>
Fold m Char b -> Stream m Char -> Stream m b
lines Fold m Char b
f = forall (m :: * -> *) a b.
Monad m =>
Fold m a b -> Stream m a -> Stream m b
Stream.foldMany (forall (m :: * -> *) a b.
Monad m =>
(a -> Bool) -> Fold m a b -> Fold m a b
Fold.takeEndBy_ (forall a. Eq a => a -> a -> Bool
== Char
'\n') Fold m Char b
f)

#if !MIN_VERSION_base(4,17,0)
foreign import ccall unsafe "u_iswspace"
  iswspace :: Int -> Int
#endif

-- | Code copied from base/Data.Char to INLINE it
{-# INLINE isSpace #-}
isSpace :: Char -> Bool
isSpace :: Char -> Bool
isSpace Char
c
  | Word
uc forall a. Ord a => a -> a -> Bool
<= Word
0x377 = Word
uc forall a. Eq a => a -> a -> Bool
== Word
32 Bool -> Bool -> Bool
|| Word
uc forall a. Num a => a -> a -> a
- Word
0x9 forall a. Ord a => a -> a -> Bool
<= Word
4 Bool -> Bool -> Bool
|| Word
uc forall a. Eq a => a -> a -> Bool
== Word
0xa0
#if MIN_VERSION_base(4,17,0)
  | otherwise = generalCategory c == Space
#else
  | Bool
otherwise = Int -> Int
iswspace (Char -> Int
ord Char
c) forall a. Eq a => a -> a -> Bool
/= Int
0
#endif
  where
    uc :: Word
uc = forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c) :: Word

-- | Fold each word of the stream using the supplied 'Fold'
-- and stream the result.
--
-- >>>  Stream.fold Fold.toList $ Unicode.words Fold.toList (Stream.fromList "fold these     words")
-- ["fold","these","words"]
--
-- > words = Stream.wordsBy isSpace
--
-- /Pre-release/
{-# INLINE words #-}
words :: Monad m => Fold m Char b -> Stream m Char -> Stream m b
words :: forall (m :: * -> *) b.
Monad m =>
Fold m Char b -> Stream m Char -> Stream m b
words Fold m Char b
f = forall (m :: * -> *) a b.
Monad m =>
(a -> Bool) -> Fold m a b -> Stream m a -> Stream m b
D.wordsBy Char -> Bool
isSpace Fold m Char b
f

-- | Unfold a stream to character streams using the supplied 'Unfold'
-- and concat the results suffixing a newline character @\\n@ to each stream.
--
-- @
-- unlines = Stream.interposeSuffix '\n'
-- unlines = Stream.intercalateSuffix Unfold.fromList "\n"
-- @
--
-- /Pre-release/
{-# INLINE unlines #-}
unlines :: MonadIO m => Unfold m a Char -> Stream m a -> Stream m Char
unlines :: forall (m :: * -> *) a.
MonadIO m =>
Unfold m a Char -> Stream m a -> Stream m Char
unlines = forall (m :: * -> *) c b.
Monad m =>
c -> Unfold m b c -> Stream m b -> Stream m c
Stream.interposeSuffix Char
'\n'

-- | Unfold the elements of a stream to character streams using the supplied
-- 'Unfold' and concat the results with a whitespace character infixed between
-- the streams.
--
-- @
-- unwords = Stream.interpose ' '
-- unwords = Stream.intercalate Unfold.fromList " "
-- @
--
-- /Pre-release/
{-# INLINE unwords #-}
unwords :: MonadIO m => Unfold m a Char -> Stream m a -> Stream m Char
unwords :: forall (m :: * -> *) a.
MonadIO m =>
Unfold m a Char -> Stream m a -> Stream m Char
unwords = forall (m :: * -> *) c b.
Monad m =>
c -> Unfold m b c -> Stream m b -> Stream m c
Stream.interpose Char
' '