{-# LANGUAGE BangPatterns #-}

-- | Stream primitives for decoding and encoding 'Text' values in UTF-8 format.
module System.IO.Streams.Text
  ( -- * Decoders and Encoders
    decodeUtf8
  , decodeUtf8With
  , encodeUtf8
  ) where

------------------------------------------------------------------------------
import           Control.Monad                 (when)
import           Control.Monad.IO.Class        (MonadIO (..))
import           Data.ByteString               (ByteString)
import qualified Data.ByteString               as S
import qualified Data.ByteString.Unsafe        as S
import           Data.Monoid                   (mappend)
import           Data.Text                     (Text)
import qualified Data.Text.Encoding            as T
import           Data.Text.Encoding.Error      (OnDecodeError)
import           Data.Word                     (Word8)
------------------------------------------------------------------------------
import qualified System.IO.Streams.Combinators as Streams
import           System.IO.Streams.Internal    (InputStream, OutputStream)
import qualified System.IO.Streams.Internal    as Streams


------------------------------------------------------------------------------
-- | Convert an 'OutputStream' taking 'ByteString's to an 'OutputStream' that
-- takes 'Text', encoding the data as UTF-8. See
-- @Data.Text.Encoding.'T.encodeUtf8'@.
encodeUtf8 :: OutputStream ByteString -> IO (OutputStream Text)
encodeUtf8 = Streams.contramap T.encodeUtf8


------------------------------------------------------------------------------
-- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an
-- 'InputStream' of 'Text' values. If decoding fails, will throw an exception.
-- See @Data.Text.Encoding.'T.decodeUtf8'@.
decodeUtf8 :: InputStream ByteString -> IO (InputStream Text)
decodeUtf8 = decode T.decodeUtf8
{-# INLINE decodeUtf8 #-}


------------------------------------------------------------------------------
-- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an
-- 'InputStream' of 'Text' values. If decoding fails, invokes the given
-- 'OnDecodeError' function to decide what to do. See
-- @Data.Text.Encoding.'T.decodeUtf8With'@.
decodeUtf8With :: OnDecodeError
               -> InputStream ByteString
               -> IO (InputStream Text)
decodeUtf8With e = decode (T.decodeUtf8With e)
{-# INLINE decodeUtf8With #-}


------------------------------------------------------------------------------
decode :: (ByteString -> Text)
       -> InputStream ByteString
       -> IO (InputStream Text)
decode decodeFunc input = Streams.fromGenerator $ go Nothing
  where
    go !soFar = liftIO (Streams.read input) >>=
                maybe (finish soFar) (chunk soFar)

    finish Nothing  = return $! ()
    finish (Just x) = Streams.yield $! decodeFunc x

    chunk Nothing  s = process s
    chunk (Just a) b = process $ a `mappend` b

    process !s =
      case findLastFullCode s of
        LastCodeIsComplete x -> (Streams.yield $! decodeFunc x) >> go Nothing
        Split a b            -> do
                                  when (not $ S.null a) $
                                      Streams.yield $! decodeFunc a
                                  go (Just b)
        NoCodesAreComplete x -> go (Just x)


------------------------------------------------------------------------------
data ByteType = Regular
              | Continuation
              | Start !Int


------------------------------------------------------------------------------
between :: Word8 -> Word8 -> Word8 -> Bool
between x y z = x >= y && x <= z
{-# INLINE between #-}


------------------------------------------------------------------------------
characterizeByte :: Word8 -> ByteType
characterizeByte c | between c 0 0x7F    = Regular
                   | between c 0x80 0xBF = Continuation
                   | between c 0xC0 0xDF = Start 1
                   | between c 0xE0 0xEF = Start 2
                   -- Technically utf-8 ends after 0xf4, but those sequences
                   -- won't decode anyways.
                   | otherwise           = Start 3


------------------------------------------------------------------------------
data FindOutput = LastCodeIsComplete !ByteString
                | Split !ByteString !ByteString
                | NoCodesAreComplete !ByteString   -- should be impossibly rare
                                                   -- in real data


------------------------------------------------------------------------------
findLastFullCode :: ByteString -> FindOutput
findLastFullCode b | len == 0  = LastCodeIsComplete b
                   | otherwise = go
  where
    len = S.length b

    go = let !idx = len - 1
             !c   = S.unsafeIndex b idx
         in case characterizeByte c of
              Regular      -> LastCodeIsComplete b
              Continuation -> cont (len - 2)
              _            -> Split (S.unsafeTake idx b) (S.unsafeDrop idx b)

    cont !idx | idx < 0 = NoCodesAreComplete b
              | otherwise =
                  let !c = S.unsafeIndex b idx
                  in case characterizeByte c of
                       -- what do we do with this? decoding will fail. give up
                       -- and lie, the text decoder will deal with it..
                       Regular      -> LastCodeIsComplete b
                       Continuation -> cont (idx - 1)
                       Start n      -> if n + idx == len - 1
                                         then LastCodeIsComplete b
                                         else Split (S.unsafeTake idx b)
                                                    (S.unsafeDrop idx b)
{-# INLINE findLastFullCode #-}