{-# LANGUAGE BangPatterns #-} -- | Stream primitives for decoding and encoding 'Text' values in UTF-8 format. module System.IO.Streams.Text ( -- * Decoders and Encoders decodeUtf8 , decodeUtf8With , encodeUtf8 ) where ------------------------------------------------------------------------------ import Control.Monad (when) import Control.Monad.IO.Class (MonadIO (..)) import Data.ByteString (ByteString) import qualified Data.ByteString as S import qualified Data.ByteString.Unsafe as S import Data.Monoid (mappend) import Data.Text (Text) import qualified Data.Text.Encoding as T import Data.Text.Encoding.Error (OnDecodeError) import Data.Word (Word8) ------------------------------------------------------------------------------ import qualified System.IO.Streams.Combinators as Streams import System.IO.Streams.Internal (InputStream, OutputStream) import qualified System.IO.Streams.Internal as Streams ------------------------------------------------------------------------------ -- | Convert an 'OutputStream' taking 'ByteString's to an 'OutputStream' that -- takes 'Text', encoding the data as UTF-8. See -- @Data.Text.Encoding.'T.encodeUtf8'@. encodeUtf8 :: OutputStream ByteString -> IO (OutputStream Text) encodeUtf8 = Streams.contramap T.encodeUtf8 ------------------------------------------------------------------------------ -- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an -- 'InputStream' of 'Text' values. If decoding fails, will throw an exception. -- See @Data.Text.Encoding.'T.decodeUtf8'@. decodeUtf8 :: InputStream ByteString -> IO (InputStream Text) decodeUtf8 = decode T.decodeUtf8 {-# INLINE decodeUtf8 #-} ------------------------------------------------------------------------------ -- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an -- 'InputStream' of 'Text' values. If decoding fails, invokes the given -- 'OnDecodeError' function to decide what to do. See -- @Data.Text.Encoding.'T.decodeUtf8With'@. decodeUtf8With :: OnDecodeError -> InputStream ByteString -> IO (InputStream Text) decodeUtf8With e = decode (T.decodeUtf8With e) {-# INLINE decodeUtf8With #-} ------------------------------------------------------------------------------ decode :: (ByteString -> Text) -> InputStream ByteString -> IO (InputStream Text) decode decodeFunc input = Streams.fromGenerator $ go Nothing where go !soFar = liftIO (Streams.read input) >>= maybe (finish soFar) (chunk soFar) finish Nothing = return $! () finish (Just x) = Streams.yield $! decodeFunc x chunk Nothing s = process s chunk (Just a) b = process $ a `mappend` b process !s = case findLastFullCode s of LastCodeIsComplete x -> (Streams.yield $! decodeFunc x) >> go Nothing Split a b -> do when (not $ S.null a) $ Streams.yield $! decodeFunc a go (Just b) NoCodesAreComplete x -> go (Just x) ------------------------------------------------------------------------------ data ByteType = Regular | Continuation | Start !Int ------------------------------------------------------------------------------ between :: Word8 -> Word8 -> Word8 -> Bool between x y z = x >= y && x <= z {-# INLINE between #-} ------------------------------------------------------------------------------ characterizeByte :: Word8 -> ByteType characterizeByte c | between c 0 0x7F = Regular | between c 0x80 0xBF = Continuation | between c 0xC0 0xDF = Start 1 | between c 0xE0 0xEF = Start 2 -- Technically utf-8 ends after 0xf4, but those sequences -- won't decode anyways. | otherwise = Start 3 ------------------------------------------------------------------------------ data FindOutput = LastCodeIsComplete !ByteString | Split !ByteString !ByteString | NoCodesAreComplete !ByteString -- should be impossibly rare -- in real data ------------------------------------------------------------------------------ findLastFullCode :: ByteString -> FindOutput findLastFullCode b | len == 0 = LastCodeIsComplete b | otherwise = go where len = S.length b go = let !idx = len - 1 !c = S.unsafeIndex b idx in case characterizeByte c of Regular -> LastCodeIsComplete b Continuation -> cont (len - 2) _ -> Split (S.unsafeTake idx b) (S.unsafeDrop idx b) cont !idx | idx < 0 = NoCodesAreComplete b | otherwise = let !c = S.unsafeIndex b idx in case characterizeByte c of -- what do we do with this? decoding will fail. give up -- and lie, the text decoder will deal with it.. Regular -> LastCodeIsComplete b Continuation -> cont (idx - 1) Start n -> if n + idx == len - 1 then LastCodeIsComplete b else Split (S.unsafeTake idx b) (S.unsafeDrop idx b) {-# INLINE findLastFullCode #-}