{-# language BangPatterns #-} {-# language BlockArguments #-} {-# language MagicHash #-} {-# language NamedFieldPuns #-} {-# language ScopedTypeVariables #-} {-# language TypeApplications #-} {-# language UnliftedFFITypes #-} {-# language UnboxedTuples #-} -- This internal module has functions for splitting strings -- on a particular byte and for counting occurences of that -- byte. module Data.Bytes.Byte ( count , split , splitU , splitNonEmpty , splitInit , splitInitU ) where import Prelude hiding (length) import Control.Monad.ST (runST) import Control.Monad.ST.Run (runPrimArrayST) import Data.Bytes.Types (Bytes(..)) import Data.List.NonEmpty (NonEmpty((:|))) import Data.Primitive (PrimArray(..),MutablePrimArray(..),ByteArray(..)) import Data.Primitive.Unlifted.Array (UnliftedArray) import Data.Word (Word8) import GHC.Exts (ByteArray#,MutableByteArray#) import GHC.IO (unsafeIOToST) import qualified Data.Primitive as PM import qualified Data.Primitive.Unlifted.Array as PM import qualified GHC.Exts as Exts -- | Count the number of times the byte appears in the sequence. count :: Word8 -> Bytes -> Int count !b (Bytes{array=ByteArray arr,offset,length}) = count_ba arr offset length b -- | Variant of 'split' that returns an array of unsliced byte sequences. -- Unlike 'split', this is not a good producer for list fusion. (It does -- not return a list, so it could not be.) Prefer 'split' if the result -- is going to be consumed exactly once by a good consumer. Prefer 'splitU' -- if the result of the split is going to be around for a while and -- inspected multiple times. splitU :: Word8 -> Bytes -> UnliftedArray ByteArray splitU !w !bs = let !lens = splitLengthsAlt w bs !lensSz = PM.sizeofPrimArray lens in splitCommonU lens lensSz bs -- | Variant of 'splitU' that drops the trailing element. See 'splitInit' -- for an explanation of why this may be useful. splitInitU :: Word8 -> Bytes -> UnliftedArray ByteArray splitInitU !w !bs = let !lens = splitLengthsAlt w bs !lensSz = PM.sizeofPrimArray lens in splitCommonU lens (lensSz - 1) bs -- Internal function splitCommonU :: PrimArray Int -- array of segment lengths -> Int -- number of lengths to consider -> Bytes -> UnliftedArray ByteArray splitCommonU !lens !lensSz Bytes{array,offset=arrIx0} = runST do dst <- PM.unsafeNewUnliftedArray lensSz let go !lenIx !arrIx = if lenIx < lensSz then do let !len = PM.indexPrimArray lens lenIx buf <- PM.newByteArray len PM.copyByteArray buf 0 array arrIx len buf' <- PM.unsafeFreezeByteArray buf PM.writeUnliftedArray dst lenIx buf' go (lenIx + 1) (arrIx + len + 1) else pure () go 0 arrIx0 PM.unsafeFreezeUnliftedArray dst -- | Break a byte sequence into pieces separated by the byte argument, -- consuming the delimiter. This function is a good producer for list -- fusion. It is common to immidiately consume the results of @split@ -- with @foldl'@, @traverse_@, @foldlM@, and being a good producer helps -- in this situation. -- -- Note: this function differs from its counterpart in @bytestring@. -- If the byte sequence is empty, this returns a singleton list with -- the empty byte sequence. split :: Word8 -> Bytes -> [Bytes] {-# inline split #-} split !w !bs@Bytes{array,offset=arrIx0} = Exts.build (\g x0 -> let go !lenIx !arrIx = if lenIx < lensSz then let !len = PM.indexPrimArray lens lenIx in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1)) else x0 in go 0 arrIx0 ) where !lens = splitLengthsAlt w bs !lensSz = PM.sizeofPrimArray lens -- | Variant of 'split' that returns the result as a 'NonEmpty' -- instead of @[]@. This is also eligible for stream fusion. splitNonEmpty :: Word8 -> Bytes -> NonEmpty Bytes {-# inline splitNonEmpty #-} splitNonEmpty !w !bs@Bytes{array,offset=arrIx0} = Bytes array arrIx0 len0 :| Exts.build (\g x0 -> let go !lenIx !arrIx = if lenIx < lensSz then let !len = PM.indexPrimArray lens lenIx in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1)) else x0 in go 1 (1 + (arrIx0 + len0)) ) where !lens = splitLengthsAlt w bs !lensSz = PM.sizeofPrimArray lens !len0 = PM.indexPrimArray lens 0 :: Int -- | Variant of 'split' that drops the trailing element. This behaves -- correctly even if the byte sequence is empty. This is a good producer -- for list fusion. This is useful when splitting a text file -- into lines. -- -- mandates that text files end with a newline, so the list resulting -- from 'split' always has an empty byte sequence as its last element. -- With 'splitInit', that unwanted element is discarded. splitInit :: Word8 -> Bytes -> [Bytes] {-# inline splitInit #-} splitInit !w !bs@Bytes{array,offset=arrIx0} = Exts.build (\g x0 -> let go !lenIx !arrIx = if lenIx < lensSz then let !len = PM.indexPrimArray lens lenIx in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1)) else x0 in go 0 arrIx0 ) where -- Remember, the resulting array from splitLengthsAlt always has -- a length of at least one. !lens = splitLengthsAlt w bs !lensSz = PM.sizeofPrimArray lens - 1 -- Internal function. This is just like splitLengths except that -- it does not treat the empty byte sequences specially. The result -- for that byte sequence is a singleton array with the element zero. splitLengthsAlt :: Word8 -> Bytes -> PrimArray Int splitLengthsAlt b Bytes{array=ByteArray arr#,offset=off,length=len} = runPrimArrayST do let !n = count_ba arr# off len b dst@(MutablePrimArray dst# ) :: MutablePrimArray s Int <- PM.newPrimArray (n + 1) total <- unsafeIOToST (memchr_ba_many arr# off len dst# n b) PM.writePrimArray dst n (len - total) PM.unsafeFreezePrimArray dst foreign import ccall unsafe "bs_custom.h memchr_ba_many" memchr_ba_many :: ByteArray# -> Int -> Int -> MutableByteArray# s -> Int -> Word8 -> IO Int foreign import ccall unsafe "bs_custom.h count_ba" count_ba :: ByteArray# -> Int -> Int -> Word8 -> Int