module Data.Text.Lazy.Encoding.Fusion
(
streamUtf8
, unstream
, module Data.Text.Encoding.Fusion.Common
) where
import Data.ByteString.Lazy.Internal (ByteString(..), defaultChunkSize)
import qualified Data.ByteString as B
import qualified Data.ByteString.Unsafe as B
import Data.Text.Encoding.Fusion.Common
import Data.Text.Encoding.Error
import Data.Text.Fusion (Step(..), Stream(..))
import Data.Text.Fusion.Internal (M(..), PairS(..), S(..))
import Data.Text.UnsafeChar (unsafeChr8)
import Data.Word (Word8)
import qualified Data.Text.Encoding.Utf8 as U8
import System.IO.Unsafe (unsafePerformIO)
import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
import Foreign.Storable (pokeByteOff)
import Data.ByteString.Internal (mallocByteString, memcpy)
import Control.Exception (assert)
import qualified Data.ByteString.Internal as B
unknownLength :: Int
unknownLength = 4
streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
streamUtf8 onErr bs0 = Stream next (bs0 :!: empty :!: 0) unknownLength
where
empty = S N N N N
next (bs@(Chunk ps _) :!: S N _ _ _ :!: i)
| i < len && U8.validate1 a =
Yield (unsafeChr8 a) (bs :!: empty :!: i+1)
| i + 1 < len && U8.validate2 a b =
Yield (U8.chr2 a b) (bs :!: empty :!: i+2)
| i + 2 < len && U8.validate3 a b c =
Yield (U8.chr3 a b c) (bs :!: empty :!: i+3)
| i + 4 < len && U8.validate4 a b c d =
Yield (U8.chr4 a b c d) (bs :!: empty :!: i+4)
where len = B.length ps
a = B.unsafeIndex ps i
b = B.unsafeIndex ps (i+1)
c = B.unsafeIndex ps (i+2)
d = B.unsafeIndex ps (i+3)
next st@(bs :!: s :!: i) =
case s of
S (J a) N _ _ | U8.validate1 a ->
Yield (unsafeChr8 a) es
S (J a) (J b) N _ | U8.validate2 a b ->
Yield (U8.chr2 a b) es
S (J a) (J b) (J c) N | U8.validate3 a b c ->
Yield (U8.chr3 a b c) es
S (J a) (J b) (J c) (J d) | U8.validate4 a b c d ->
Yield (U8.chr4 a b c d) es
_ -> consume st
where es = bs :!: empty :!: i
consume (bs@(Chunk ps rest) :!: s :!: i)
| i >= B.length ps = consume (rest :!: s :!: 0)
| otherwise =
case s of
S N _ _ _ -> next (bs :!: S x N N N :!: i+1)
S a N _ _ -> next (bs :!: S a x N N :!: i+1)
S a b N _ -> next (bs :!: S a b x N :!: i+1)
S a b c N -> next (bs :!: S a b c x :!: i+1)
S (J a) b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
(bs :!: S b c d N :!: i+1)
where x = J (B.unsafeIndex ps i)
consume (Empty :!: S N _ _ _ :!: _) = Done
consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
unstreamChunks :: Int -> Stream Word8 -> ByteString
unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 len0
where chunk s1 len1 = unsafePerformIO $ do
let len = min (max len1 unknownLength) chunkSize
mallocByteString len >>= loop len 0 s1
where
loop !n !off !s fp = case next s of
Done | off == 0 -> return Empty
| otherwise -> do
bs <- trimUp fp off
return $! Chunk bs Empty
Skip s' -> loop n off s' fp
Yield x s'
| off == chunkSize -> do
bs <- trimUp fp off
return (Chunk bs (chunk s (n B.length bs)))
| off == n -> realloc fp n off s' x
| otherwise -> do
withForeignPtr fp $ \p -> pokeByteOff p off x
loop n (off+1) s' fp
realloc fp n off s x = do
let n' = min (n+n) chunkSize
fp' <- copy0 fp n n'
withForeignPtr fp' $ \p -> pokeByteOff p off x
loop n' (off+1) s fp'
trimUp fp off = return $! B.PS fp 0 off
copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
copy0 !src !srcLen !destLen = assert (srcLen <= destLen) $ do
dest <- mallocByteString destLen
withForeignPtr src $ \src' ->
withForeignPtr dest $ \dest' ->
memcpy dest' src' (fromIntegral srcLen)
return dest
unstream :: Stream Word8 -> ByteString
unstream = unstreamChunks defaultChunkSize
decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
-> s -> Step s Char
decodeError func kind onErr mb i =
case onErr desc mb of
Nothing -> Skip i
Just c -> Yield c i
where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
kind ++ " stream"