{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP          #-}

-- | Stream primitives for decoding and encoding 'Text' values in UTF-8 format.
module System.IO.Streams.Text
  ( -- * Decoders and Encoders
    decodeUtf8
  , decodeUtf8With
  , encodeUtf8
  ) where

------------------------------------------------------------------------------
import           Control.Monad                 (when)
import           Control.Monad.IO.Class        (MonadIO (..))
import           Data.ByteString               (ByteString)
import qualified Data.ByteString               as S
import qualified Data.ByteString.Unsafe        as S
#if !MIN_VERSION_base(4,8,0)
import           Data.Monoid                   (mappend)
#endif
import           Data.Text                     (Text)
import qualified Data.Text.Encoding            as T
import           Data.Text.Encoding.Error      (OnDecodeError)
import           Data.Word                     (Word8)
------------------------------------------------------------------------------
import qualified System.IO.Streams.Combinators as Streams
import           System.IO.Streams.Internal    (InputStream, OutputStream)
import qualified System.IO.Streams.Internal    as Streams


------------------------------------------------------------------------------
-- | Convert an 'OutputStream' taking 'ByteString's to an 'OutputStream' that
-- takes 'Text', encoding the data as UTF-8. See
-- @Data.Text.Encoding.'T.encodeUtf8'@.
encodeUtf8 :: OutputStream ByteString -> IO (OutputStream Text)
encodeUtf8 :: OutputStream ByteString -> IO (OutputStream Text)
encodeUtf8 = (Text -> ByteString)
-> OutputStream ByteString -> IO (OutputStream Text)
forall a b. (a -> b) -> OutputStream b -> IO (OutputStream a)
Streams.contramap Text -> ByteString
T.encodeUtf8


------------------------------------------------------------------------------
-- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an
-- 'InputStream' of 'Text' values. If decoding fails, will throw an exception.
-- See @Data.Text.Encoding.'T.decodeUtf8'@.
decodeUtf8 :: InputStream ByteString -> IO (InputStream Text)
decodeUtf8 :: InputStream ByteString -> IO (InputStream Text)
decodeUtf8 = (ByteString -> Text)
-> InputStream ByteString -> IO (InputStream Text)
decode ByteString -> Text
T.decodeUtf8
{-# INLINE decodeUtf8 #-}


------------------------------------------------------------------------------
-- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an
-- 'InputStream' of 'Text' values. If decoding fails, invokes the given
-- 'OnDecodeError' function to decide what to do. See
-- @Data.Text.Encoding.'T.decodeUtf8With'@.
decodeUtf8With :: OnDecodeError
               -> InputStream ByteString
               -> IO (InputStream Text)
decodeUtf8With :: OnDecodeError -> InputStream ByteString -> IO (InputStream Text)
decodeUtf8With OnDecodeError
e = (ByteString -> Text)
-> InputStream ByteString -> IO (InputStream Text)
decode (OnDecodeError -> ByteString -> Text
T.decodeUtf8With OnDecodeError
e)
{-# INLINE decodeUtf8With #-}


------------------------------------------------------------------------------
decode :: (ByteString -> Text)
       -> InputStream ByteString
       -> IO (InputStream Text)
decode :: (ByteString -> Text)
-> InputStream ByteString -> IO (InputStream Text)
decode ByteString -> Text
decodeFunc InputStream ByteString
input = Generator Text () -> IO (InputStream Text)
forall r a. Generator r a -> IO (InputStream r)
Streams.fromGenerator (Generator Text () -> IO (InputStream Text))
-> Generator Text () -> IO (InputStream Text)
forall a b. (a -> b) -> a -> b
$ Maybe ByteString -> Generator Text ()
go Maybe ByteString
forall a. Maybe a
Nothing
  where
    go :: Maybe ByteString -> Generator Text ()
go !Maybe ByteString
soFar = IO (Maybe ByteString) -> Generator Text (Maybe ByteString)
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (InputStream ByteString -> IO (Maybe ByteString)
forall a. InputStream a -> IO (Maybe a)
Streams.read InputStream ByteString
input) Generator Text (Maybe ByteString)
-> (Maybe ByteString -> Generator Text ()) -> Generator Text ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>=
                Generator Text ()
-> (ByteString -> Generator Text ())
-> Maybe ByteString
-> Generator Text ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Maybe ByteString -> Generator Text ()
finish Maybe ByteString
soFar) (Maybe ByteString -> ByteString -> Generator Text ()
chunk Maybe ByteString
soFar)

    finish :: Maybe ByteString -> Generator Text ()
finish Maybe ByteString
Nothing  = () -> Generator Text ()
forall (m :: * -> *) a. Monad m => a -> m a
return (() -> Generator Text ()) -> () -> Generator Text ()
forall a b. (a -> b) -> a -> b
$! ()
    finish (Just ByteString
x) = Text -> Generator Text ()
forall r. r -> Generator r ()
Streams.yield (Text -> Generator Text ()) -> Text -> Generator Text ()
forall a b. (a -> b) -> a -> b
$! ByteString -> Text
decodeFunc ByteString
x

    chunk :: Maybe ByteString -> ByteString -> Generator Text ()
chunk Maybe ByteString
Nothing  ByteString
s = ByteString -> Generator Text ()
process ByteString
s
    chunk (Just ByteString
a) ByteString
b = ByteString -> Generator Text ()
process (ByteString -> Generator Text ())
-> ByteString -> Generator Text ()
forall a b. (a -> b) -> a -> b
$ ByteString
a ByteString -> ByteString -> ByteString
forall a. Monoid a => a -> a -> a
`mappend` ByteString
b

    process :: ByteString -> Generator Text ()
process !ByteString
s =
      case ByteString -> FindOutput
findLastFullCode ByteString
s of
        LastCodeIsComplete ByteString
x -> (Text -> Generator Text ()
forall r. r -> Generator r ()
Streams.yield (Text -> Generator Text ()) -> Text -> Generator Text ()
forall a b. (a -> b) -> a -> b
$! ByteString -> Text
decodeFunc ByteString
x) Generator Text () -> Generator Text () -> Generator Text ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Maybe ByteString -> Generator Text ()
go Maybe ByteString
forall a. Maybe a
Nothing
        Split ByteString
a ByteString
b            -> do
                                  Bool -> Generator Text () -> Generator Text ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ ByteString -> Bool
S.null ByteString
a) (Generator Text () -> Generator Text ())
-> Generator Text () -> Generator Text ()
forall a b. (a -> b) -> a -> b
$
                                      Text -> Generator Text ()
forall r. r -> Generator r ()
Streams.yield (Text -> Generator Text ()) -> Text -> Generator Text ()
forall a b. (a -> b) -> a -> b
$! ByteString -> Text
decodeFunc ByteString
a
                                  Maybe ByteString -> Generator Text ()
go (ByteString -> Maybe ByteString
forall a. a -> Maybe a
Just ByteString
b)
        NoCodesAreComplete ByteString
x -> Maybe ByteString -> Generator Text ()
go (ByteString -> Maybe ByteString
forall a. a -> Maybe a
Just ByteString
x)


------------------------------------------------------------------------------
data ByteType = Regular
              | Continuation
              | Start !Int


------------------------------------------------------------------------------
between :: Word8 -> Word8 -> Word8 -> Bool
between :: Word8 -> Word8 -> Word8 -> Bool
between Word8
x Word8
y Word8
z = Word8
x Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
y Bool -> Bool -> Bool
&& Word8
x Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
z
{-# INLINE between #-}


------------------------------------------------------------------------------
characterizeByte :: Word8 -> ByteType
characterizeByte :: Word8 -> ByteType
characterizeByte Word8
c | Word8 -> Word8 -> Word8 -> Bool
between Word8
c Word8
0 Word8
0x7F    = ByteType
Regular
                   | Word8 -> Word8 -> Word8 -> Bool
between Word8
c Word8
0x80 Word8
0xBF = ByteType
Continuation
                   | Word8 -> Word8 -> Word8 -> Bool
between Word8
c Word8
0xC0 Word8
0xDF = Int -> ByteType
Start Int
1
                   | Word8 -> Word8 -> Word8 -> Bool
between Word8
c Word8
0xE0 Word8
0xEF = Int -> ByteType
Start Int
2
                   -- Technically utf-8 ends after 0xf4, but those sequences
                   -- won't decode anyways.
                   | Bool
otherwise           = Int -> ByteType
Start Int
3


------------------------------------------------------------------------------
data FindOutput = LastCodeIsComplete !ByteString
                | Split !ByteString !ByteString
                | NoCodesAreComplete !ByteString   -- should be impossibly rare
                                                   -- in real data


------------------------------------------------------------------------------
findLastFullCode :: ByteString -> FindOutput
findLastFullCode :: ByteString -> FindOutput
findLastFullCode ByteString
b | Int
len Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0  = ByteString -> FindOutput
LastCodeIsComplete ByteString
b
                   | Bool
otherwise = FindOutput
go
  where
    len :: Int
len = ByteString -> Int
S.length ByteString
b

    go :: FindOutput
go = let !idx :: Int
idx = Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
             !c :: Word8
c   = ByteString -> Int -> Word8
S.unsafeIndex ByteString
b Int
idx
         in case Word8 -> ByteType
characterizeByte Word8
c of
              ByteType
Regular      -> ByteString -> FindOutput
LastCodeIsComplete ByteString
b
              ByteType
Continuation -> Int -> FindOutput
cont (Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
2)
              ByteType
_            -> ByteString -> ByteString -> FindOutput
Split (Int -> ByteString -> ByteString
S.unsafeTake Int
idx ByteString
b) (Int -> ByteString -> ByteString
S.unsafeDrop Int
idx ByteString
b)

    cont :: Int -> FindOutput
cont !Int
idx | Int
idx Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0 = ByteString -> FindOutput
NoCodesAreComplete ByteString
b
              | Bool
otherwise =
                  let !c :: Word8
c = ByteString -> Int -> Word8
S.unsafeIndex ByteString
b Int
idx
                  in case Word8 -> ByteType
characterizeByte Word8
c of
                       -- what do we do with this? decoding will fail. give up
                       -- and lie, the text decoder will deal with it..
                       ByteType
Regular      -> ByteString -> FindOutput
LastCodeIsComplete ByteString
b
                       ByteType
Continuation -> Int -> FindOutput
cont (Int
idx Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
                       Start Int
n      -> if Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
idx Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
                                         then ByteString -> FindOutput
LastCodeIsComplete ByteString
b
                                         else ByteString -> ByteString -> FindOutput
Split (Int -> ByteString -> ByteString
S.unsafeTake Int
idx ByteString
b)
                                                    (Int -> ByteString -> ByteString
S.unsafeDrop Int
idx ByteString
b)
{-# INLINE findLastFullCode #-}