{-# LANGUAGE UnboxedTuples #-} -- | -- Module : Streamly.Internal.Data.Array.Foreign.Mut.Type -- Copyright : (c) 2020 Composewell Technologies -- License : BSD-3-Clause -- Maintainer : streamly@composewell.com -- Stability : experimental -- Portability : GHC -- -- Mutable arrays and file system files are quite similar: they can grow and -- their content is mutable. Therefore, both have similar APIs as well. We -- strive to keep the API consistent for both. Ideally, you should be able to -- replace one with another with few changes to the code. module Streamly.Internal.Data.Array.Foreign.Mut.Type ( -- * Type -- $arrayNotes Array (..) , ArrayContents , arrayToFptrContents , fptrToArrayContents , nilArrayContents , touch -- * Constructing and Writing -- ** Construction , nil -- *** Uninitialized Arrays , newArray , newArrayAligned , newArrayAlignedUnmanaged , newArrayWith -- *** Initialized Arrays , withNewArrayUnsafe -- *** From streams , ArrayUnsafe (..) , writeNWithUnsafe , writeNWith , writeNUnsafe , writeN , writeNAligned , writeNAlignedUnmanaged , writeWith , write -- , writeRevN -- , writeRev -- ** From containers , fromForeignPtrUnsafe , fromListN , fromList , fromStreamDN , fromStreamD -- * Random writes , putIndex , putIndexUnsafe , putIndices -- , putFromThenTo -- , putFrom -- start writing at the given position -- , putUpto -- write from beginning up to the given position -- , putFromTo -- , putFromRev -- , putUptoRev , modifyIndexUnsafe , modifyIndex , modifyIndices , modify , swapIndices , unsafeSwapIndices -- * Growing and Shrinking -- Arrays grow only at the end, though it is possible to grow on both sides -- and therefore have a cons as well as snoc. But that will require two -- bounds in the array representation. -- ** Appending elements , snocWith , snoc , snocLinear , snocMay , snocUnsafe -- ** Appending streams , appendNUnsafe , appendN , appendWith , append -- * Eliminating and Reading -- ** To streams , ReadUState(..) , read , readRev -- ** To containers , toStreamD , toStreamDRev , toStreamK , toStreamKRev , toList -- experimental , producer -- ** Random reads , getIndex , getIndexUnsafe , getIndices , getIndicesD -- , getFromThenTo , getIndexRev -- * Memory Management , blockSize , arrayChunkBytes , allocBytesToElemCount , realloc , resize , resizeExp , rightSize -- * Size , length , byteLength -- , capacity , byteCapacity , bytesFree -- * In-place Mutation Algorithms , reverse , permute , partitionBy , shuffleBy , divideBy , mergeBy -- * Casting , cast , castUnsafe , asBytes , asPtrUnsafe -- * Folding , foldl' , foldr , cmp -- * Arrays of arrays -- We can add a dimensionality parameter to the array type to get -- multidimensional arrays. Multidimensional arrays would just be a -- convenience wrapper on top of single dimensional arrays. -- | Operations dealing with multiple arrays, streams of arrays or -- multidimensional array representations.
-- ** Construct from streams , arraysOf , arrayStreamKFromStreamD , writeChunks -- ** Eliminate to streams , flattenArrays , flattenArraysRev , fromArrayStreamK -- ** Construct from arrays -- get chunks without copying , getSliceUnsafe , getSlice -- , getSlicesFromLenN , splitAt -- XXX should be able to express using getSlice , breakOn -- ** Appending arrays , spliceCopy , spliceWith , splice , spliceExp -- , putSlice -- , appendSlice -- , appendSliceFrom -- * Utilities , roundUpToPower2 , memcpy , memcmp , c_memchr ) where #include "inline.hs" #include "ArrayMacros.h" #include "MachDeps.h" #ifdef USE_C_MALLOC #define USE_FOREIGN_PTR #endif import Control.Exception (assert) import Control.DeepSeq (NFData(..)) import Control.Monad (when, void) import Control.Monad.IO.Class (MonadIO(..)) import Data.Bits (shiftR, (.|.), (.&.)) #if __GLASGOW_HASKELL__ < 808 import Data.Semigroup (Semigroup(..)) #endif import Data.Word (Word8) import Foreign.C.Types (CSize(..), CInt(..)) import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) #ifndef USE_FOREIGN_PTR import Foreign.Marshal.Alloc (mallocBytes) #endif import Foreign.Ptr (plusPtr, minusPtr, castPtr, nullPtr) import Foreign.Storable (Storable(..)) import GHC.Base ( touch#, IO(..), byteArrayContents# , Int(..), newAlignedPinnedByteArray# ) #ifndef USE_FOREIGN_PTR import GHC.Base (RealWorld, MutableByteArray#) #endif #if __GLASGOW_HASKELL__ < 802 #define noinline #else import GHC.Base (noinline) #endif import GHC.Exts (unsafeCoerce#) import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(..)) #ifdef USE_C_MALLOC import GHC.ForeignPtr (mallocForeignPtrAlignedBytes) #endif import GHC.Ptr (Ptr(..)) import Streamly.Internal.BaseCompat import Streamly.Internal.Data.Fold.Type (Fold(..)) import Streamly.Internal.Data.Producer.Type (Producer (..)) import Streamly.Internal.Data.Stream.Serial (SerialT(..)) import Streamly.Internal.Data.SVar.Type (adaptState, defState) import Streamly.Internal.Data.Unfold.Type (Unfold(..)) import Streamly.Internal.System.IO (arrayPayloadSize, defaultChunkSize) import System.IO.Unsafe (unsafePerformIO) #ifdef DEVBUILD import qualified Data.Foldable as F #endif import qualified Streamly.Internal.Data.Fold.Type as FL import qualified Streamly.Internal.Data.Producer as Producer import qualified Streamly.Internal.Data.Stream.StreamD.Type as D import qualified Streamly.Internal.Data.Stream.StreamK.Type as K #ifdef USE_FOREIGN_PTR import qualified Streamly.Internal.Foreign.Malloc as Malloc #endif import Prelude hiding (length, foldr, read, unlines, splitAt, reverse, truncate) -- $setup -- >>> :m -- >>> import qualified Streamly.Internal.Data.Array.Foreign.Mut.Type as Array -- >>> import qualified Streamly.Internal.Data.Stream.IsStream as Stream -- >>> import qualified Streamly.Internal.Data.Stream.StreamD as StreamD -- >>> import qualified Streamly.Internal.Data.Fold as Fold -- >>> import qualified Streamly.Internal.Data.Fold.Type as Fold ------------------------------------------------------------------------------- -- Foreign helpers ------------------------------------------------------------------------------- foreign import ccall unsafe "string.h memcpy" c_memcpy :: Ptr Word8 -> Ptr Word8 -> CSize -> IO (Ptr Word8) foreign import ccall unsafe "string.h memchr" c_memchr :: Ptr Word8 -> Word8 -> CSize -> IO (Ptr Word8) foreign import ccall unsafe "string.h memcmp" c_memcmp :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt -- | Given a 'Storable' type (unused first arg) and a number of bytes, return -- how many elements of that type 
will completely fit in those bytes. -- {-# INLINE bytesToElemCount #-} bytesToElemCount :: forall a. Storable a => a -> Int -> Int bytesToElemCount _ n = n `div` SIZE_OF(a) -- XXX we are converting Int to CSize memcpy :: Ptr Word8 -> Ptr Word8 -> Int -> IO () memcpy dst src len = void (c_memcpy dst src (fromIntegral len)) -- XXX we are converting Int to CSize -- return True if the memory locations have identical contents {-# INLINE memcmp #-} memcmp :: Ptr Word8 -> Ptr Word8 -> Int -> IO Bool memcmp p1 p2 len = do r <- c_memcmp p1 p2 (fromIntegral len) return $ r == 0 ------------------------------------------------------------------------------- -- Array Contents ------------------------------------------------------------------------------- -- We support using ForeignPtrContents or MutableByteArray. #ifdef USE_FOREIGN_PTR newtype ArrayContents = ArrayContents ForeignPtrContents #define UNPACKIF #else -- XXX can use UnliftedNewtypes data ArrayContents = ArrayContents !(MutableByteArray# RealWorld) #define UNPACKIF {-# UNPACK #-} #endif {-# INLINE touch #-} touch :: ArrayContents -> IO () touch (ArrayContents contents) = IO $ \s -> case touch# contents s of s' -> (# s', () #) fptrToArrayContents :: ForeignPtrContents -> ArrayContents arrayToFptrContents :: ArrayContents -> ForeignPtrContents #ifdef USE_FOREIGN_PTR fptrToArrayContents = ArrayContents arrayToFptrContents (ArrayContents contents) = contents #else fptrToArrayContents (PlainPtr mbarr) = ArrayContents mbarr fptrToArrayContents _ = error "Unsupported foreign ptr" arrayToFptrContents (ArrayContents contents) = PlainPtr contents #endif ------------------------------------------------------------------------------- -- Array Data Type ------------------------------------------------------------------------------- -- $arrayNotes -- -- We can use a 'Storable' constraint in the Array type and the constraint can -- be automatically provided to a function that pattern matches on the Array -- type. However, it has a huge performance cost, so we do not use it. -- Investigate a GHC improvement possibility. -- XXX Rename the fields to better names. -- | An unboxed, pinned mutable array. An array is created with a given length -- and capacity. Length is the number of valid elements in the array. Capacity -- is the maximum number of elements that the array can be expanded to without -- having to reallocate the memory. -- -- The elements in the array can be mutated in-place without changing the -- reference (constructor). However, the length of the array cannot be mutated -- in-place. A new array reference is generated when the length changes. When -- the length is increased (up to the maximum reserved capacity of the array), -- the array is not reallocated and the new reference uses the same underlying -- memory as the old one. -- -- Several routines in this module allow the programmer to control the capacity -- of the array. The programmer can control the trade-off between memory usage -- and performance impact due to reallocations when growing or shrinking the -- array.
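--
-- A minimal sketch of the length and capacity behavior described above
-- (illustrative only; it relies on the doctest imports in the $setup section
-- and assumes an 'Int' element type):
--
-- >>> arr <- Array.newArray 16 :: IO (Array.Array Int)
-- >>> Array.length arr
-- 0
-- >>> arr1 <- Array.snoc arr (42 :: Int)
-- >>> Array.length arr1
-- 1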
-- data Array a = #ifdef DEVBUILD Storable a => #endif Array { arrContents :: UNPACKIF !ArrayContents , arrStart :: {-# UNPACK #-} !(Ptr a) -- ^ first address , aEnd :: {-# UNPACK #-} !(Ptr a) -- ^ first unused address , aBound :: {-# UNPACK #-} !(Ptr a) -- ^ first address beyond allocated memory } ------------------------------------------------------------------------------- -- Construction ------------------------------------------------------------------------------- -- XXX Change the names to use "new" instead of "newArray". That way we can use -- the same names for managed file system objects as well. For unmanaged ones -- we can use open/create etc as usual. -- -- A new array is similar to "touch" creating a zero length file. An mmapped -- array would be similar to a sparse file with holes. TBD: support mmapped -- files and arrays. -- GHC always guarantees word-aligned memory, alignment is important only when -- we need more than that. See stg_newAlignedPinnedByteArrayzh and -- allocatePinned in GHC source. -- | @newArrayWith allocator alignment count@ allocates a new array of zero -- length and with a capacity to hold @count@ elements, using @allocator -- size alignment@ as the memory allocator function. -- -- Alignment must be greater than or equal to machine word size and a power of -- 2. -- -- /Pre-release/ {-# INLINE newArrayWith #-} newArrayWith :: forall m a. (MonadIO m, Storable a) => (Int -> Int -> m (ArrayContents, Ptr a)) -> Int -> Int -> m (Array a) newArrayWith alloc alignSize count = do let size = max (count * SIZE_OF(a)) 0 (contents, p) <- alloc size alignSize return $ Array { arrContents = contents , arrStart = p , aEnd = p , aBound = p `plusPtr` size } newAlignedArrayContents :: Int -> Int -> IO (ArrayContents, Ptr a) #ifdef USE_C_MALLOC newAlignedArrayContents size align = do (ForeignPtr addr contents) <- mallocForeignPtrAlignedBytes size align return (ArrayContents contents, Ptr addr) #else newAlignedArrayContents size _align | size < 0 = errorWithoutStackTrace "newAlignedArrayContents: size must be >= 0" newAlignedArrayContents (I# size) (I# align) = IO $ \s -> case newAlignedPinnedByteArray# size align s of (# s', mbarr# #) -> let p = Ptr (byteArrayContents# (unsafeCoerce# mbarr#)) #ifdef USE_FOREIGN_PTR c = ArrayContents (PlainPtr mbarr#) #else c = ArrayContents mbarr# #endif in (# s', (c, p) #) #endif {-# NOINLINE nilArrayContents #-} nilArrayContents :: ArrayContents nilArrayContents = fst $ unsafePerformIO $ newAlignedArrayContents 0 0 nil :: #ifdef DEVBUILD Storable a => #endif Array a nil = Array nilArrayContents nullPtr nullPtr nullPtr -- | @fromForeignPtrUnsafe foreignPtr end bound@ creates an 'Array' that starts -- at the memory pointed by the @foreignPtr@, @end@ is the first unused -- address, and @bound@ is the first address beyond the allocated memory. -- -- Unsafe: Make sure that foreignPtr <= end <= bound and (end - start) is an -- integral multiple of the element size. Only PlainPtr type ForeignPtr is -- supported. -- -- /Pre-release/ -- {-# INLINE fromForeignPtrUnsafe #-} fromForeignPtrUnsafe :: #ifdef DEVBUILD Storable a => #endif ForeignPtr a -> Ptr a -> Ptr a -> Array a fromForeignPtrUnsafe (ForeignPtr start _) _ _ | Ptr start == nullPtr = nil fromForeignPtrUnsafe fp@(ForeignPtr start contents) end bound = assert (unsafeForeignPtrToPtr fp <= end && end <= bound) (Array (fptrToArrayContents contents) (Ptr start) end bound) -- | Like 'newArrayWith' but using an allocator that allocates unmanaged pinned -- memory. 
The memory will never be freed by GHC. This could be useful in -- allocate-once global data structures. Use carefully as incorrect use can -- lead to memory leak. -- -- /Internal/ {-# INLINE newArrayAlignedUnmanaged #-} newArrayAlignedUnmanaged :: forall m a. (MonadIO m, Storable a) => Int -> Int -> m (Array a) #ifdef USE_FOREIGN_PTR newArrayAlignedUnmanaged = do newArrayWith mallocForeignPtrAlignedUnmanagedBytes where mallocForeignPtrAlignedUnmanagedBytes size align = do ForeignPtr addr contents <- liftIO $ Malloc.mallocForeignPtrAlignedUnmanagedBytes size align return (ArrayContents contents, Ptr addr) #else newArrayAlignedUnmanaged _align count = do let size = max (count * SIZE_OF(a)) 0 p <- liftIO $ mallocBytes size return $ Array { arrContents = nilArrayContents , arrStart = p , aEnd = p , aBound = p `plusPtr` size } #endif -- | Like 'newArrayWith' but using an allocator that aligns the memory to the -- alignment dictated by the 'Storable' instance of the type. -- -- /Internal/ {-# INLINE newArrayAligned #-} newArrayAligned :: (MonadIO m, Storable a) => Int -> Int -> m (Array a) newArrayAligned = newArrayWith (\s a -> liftIO $ newAlignedArrayContents s a) -- XXX can unaligned allocation be more efficient when alignment is not needed? -- -- | Allocates an empty array that can hold 'count' items. The memory of the -- array is uninitialized and the allocation is aligned as per the 'Storable' -- instance of the type. -- -- /Pre-release/ {-# INLINE newArray #-} newArray :: forall m a. (MonadIO m, Storable a) => Int -> m (Array a) newArray = newArrayAligned (alignment (undefined :: a)) -- | Allocate an Array of the given size and run an IO action passing the array -- start pointer. -- -- /Internal/ {-# INLINE withNewArrayUnsafe #-} withNewArrayUnsafe :: (MonadIO m, Storable a) => Int -> (Ptr a -> m ()) -> m (Array a) withNewArrayUnsafe count f = do arr <- newArray count asPtrUnsafe arr $ \p -> f p >> return arr ------------------------------------------------------------------------------- -- Random writes ------------------------------------------------------------------------------- -- | Write the given element to the given index of the array. Does not check if -- the index is out of bounds of the array. -- -- /Pre-release/ {-# INLINE putIndexUnsafe #-} putIndexUnsafe :: forall m a. (MonadIO m, Storable a) => Int -> a -> Array a -> m () putIndexUnsafe i x arr@(Array{..}) = asPtrUnsafe arr $ \ptr -> do let elemPtr = PTR_INDEX(ptr,i,a) assert (i >= 0 && PTR_VALID(elemPtr,aEnd,a)) (return ()) liftIO $ poke elemPtr x invalidIndex :: String -> Int -> a invalidIndex label i = error $ label ++ ": invalid array index " ++ show i {-# INLINE putIndexPtr #-} putIndexPtr :: forall m a. (MonadIO m, Storable a) => Ptr a -> Ptr a -> Int -> a -> m () putIndexPtr start end i x = do let elemPtr = PTR_INDEX(start,i,a) if i >= 0 && PTR_VALID(elemPtr,end,a) then liftIO $ poke elemPtr x else invalidIndex "putIndexPtr" i -- | /O(1)/ Write the given element at the given index in the array. -- Performs in-place mutation of the array. -- -- >>> putIndex arr ix val = Array.modifyIndex ix (const (val, ())) arr -- >>> f = Array.putIndices -- >>> putIndex ix val arr = Stream.fold (f arr) (Stream.fromPure (ix, val)) -- -- /Pre-release/ {-# INLINE putIndex #-} putIndex :: (MonadIO m, Storable a) => Int -> a -> Array a -> m () putIndex i x arr = asPtrUnsafe arr $ \p -> putIndexPtr p (aEnd arr) i x -- | Write an input stream of (index, value) pairs to an array. Throws an -- error if any index is out of bounds. 
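--
-- A minimal usage sketch (illustrative; relies on the doctest setup imports):
--
-- >>> arr <- Array.fromListN 3 [10, 20, 30 :: Int]
-- >>> Stream.fold (Array.putIndices arr) (Stream.fromList [(0, 100), (2, 300)])
-- >>> Array.toList arr
-- [100,20,300]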
-- -- /Pre-release/ {-# INLINE putIndices #-} putIndices :: forall m a. (MonadIO m, Storable a) => Array a -> Fold m (Int, a) () putIndices Array{..} = FL.mkFoldM step initial extract where initial = return $ FL.Partial () step () (i, x) = FL.Partial <$> liftIO (putIndexPtr arrStart aEnd i x) extract () = liftIO $ touch arrContents -- | Modify a given index of an array using a modifier function. -- -- /Pre-release/ modifyIndexUnsafe :: forall m a b. (MonadIO m, Storable a) => Int -> (a -> (a, b)) -> Array a -> m b modifyIndexUnsafe i f arr@(Array{..}) = do liftIO $ asPtrUnsafe arr $ \ptr -> do let elemPtr = PTR_INDEX(ptr,i,a) assert (i >= 0 && PTR_NEXT(elemPtr,a) <= aEnd) (return ()) r <- peek elemPtr let (x, res) = f r poke elemPtr x return res {-# INLINE modifyIndexPtr #-} modifyIndexPtr :: forall m a b. (MonadIO m, Storable a) => Int -> (a -> (a, b)) -> Ptr a -> Ptr a -> m b modifyIndexPtr i f start end = liftIO $ do let elemPtr = PTR_INDEX(start,i,a) if i >= 0 && PTR_VALID(elemPtr,end,a) then do r <- peek elemPtr let (x, res) = f r poke elemPtr x return res else invalidIndex "modifyIndex" i -- | Modify a given index of an array using a modifier function. -- -- /Pre-release/ modifyIndex :: forall m a b. (MonadIO m, Storable a) => Int -> (a -> (a, b)) -> Array a -> m b modifyIndex i f arr@(Array{..}) = do liftIO $ asPtrUnsafe arr $ \ptr -> do modifyIndexPtr i f ptr aEnd -- | Modify the array indices generated by the supplied stream. -- -- /Pre-release/ {-# INLINE modifyIndices #-} modifyIndices :: forall m a. (MonadIO m, Storable a) => (a -> a) -> Array a -> Fold m Int () modifyIndices f Array{..} = Fold step initial extract where initial = return $ FL.Partial () step () i = let f1 x = (f x, ()) in FL.Partial <$> liftIO (modifyIndexPtr i f1 arrStart aEnd) extract () = liftIO $ touch arrContents -- | Modify each element of an array using the supplied modifier function. -- -- /Pre-release/ modify :: forall m a. (MonadIO m, Storable a) => (a -> a) -> Array a -> m () modify f arr@Array{..} = liftIO $ asPtrUnsafe arr go where go ptr = when (PTR_VALID(ptr,aEnd,a)) $ do r <- peek ptr poke ptr (f r) go (PTR_NEXT(ptr,a)) {-# INLINE swapPtrs #-} swapPtrs :: Storable a => Ptr a -> Ptr a -> IO () swapPtrs ptr1 ptr2 = do r1 <- peek ptr1 r2 <- peek ptr2 poke ptr1 r2 poke ptr2 r1 -- | Swap the elements at two indices without validating the indices. -- -- /Unsafe/: This could result in memory corruption if indices are not valid. -- -- /Pre-release/ {-# INLINE unsafeSwapIndices #-} unsafeSwapIndices :: forall m a. (MonadIO m, Storable a) => Int -> Int -> Array a -> m () unsafeSwapIndices i1 i2 arr = liftIO $ do asPtrUnsafe arr $ \ptr -> do let ptr1 = PTR_INDEX(ptr,i1,a) ptr2 = PTR_INDEX(ptr,i2,a) swapPtrs ptr1 (ptr2 :: Ptr a) -- | Swap the elements at two indices. -- -- /Pre-release/ swapIndices :: forall m a. (MonadIO m, Storable a) => Int -> Int -> Array a -> m () swapIndices i1 i2 Array{..} = liftIO $ do let ptr1 = PTR_INDEX(arrStart,i1,a) ptr2 = PTR_INDEX(arrStart,i2,a) when (i1 < 0 || PTR_INVALID(ptr1,aEnd,a)) $ invalidIndex "swapIndices" i1 when (i2 < 0 || PTR_INVALID(ptr2,aEnd,a)) $ invalidIndex "swapIndices" i2 swapPtrs ptr1 (ptr2 :: Ptr a) ------------------------------------------------------------------------------- -- Rounding ------------------------------------------------------------------------------- -- XXX Should we use bitshifts in calculations or it gets optimized by the -- compiler/processor itself? -- -- | The page or block size used by the GHC allocator. 
Allocator allocates at -- least a block and then allocates smaller allocations from within a block. blockSize :: Int blockSize = 4 * 1024 -- | Allocations larger than 'largeObjectThreshold' are in multiples of block -- size and are always pinned. The space beyond the end of a large object up to -- the end of the block is unused. largeObjectThreshold :: Int largeObjectThreshold = (blockSize * 8) `div` 10 -- XXX Should be done only when we are using the GHC allocator. -- | Round up an array larger than 'largeObjectThreshold' to use the whole -- block. {-# INLINE roundUpLargeArray #-} roundUpLargeArray :: Int -> Int roundUpLargeArray size = if size >= largeObjectThreshold then assert (blockSize /= 0 && ((blockSize .&. (blockSize - 1)) == 0)) ((size + blockSize - 1) .&. negate blockSize) else size {-# INLINE isPower2 #-} isPower2 :: Int -> Bool isPower2 n = n .&. (n - 1) == 0 {-# INLINE roundUpToPower2 #-} roundUpToPower2 :: Int -> Int roundUpToPower2 n = #if WORD_SIZE_IN_BITS == 64 1 + z6 #else 1 + z5 #endif where z0 = n - 1 z1 = z0 .|. z0 `shiftR` 1 z2 = z1 .|. z1 `shiftR` 2 z3 = z2 .|. z2 `shiftR` 4 z4 = z3 .|. z3 `shiftR` 8 z5 = z4 .|. z4 `shiftR` 16 z6 = z5 .|. z5 `shiftR` 32 -- | @allocBytesToBytes elem allocatedBytes@ returns the array size in bytes -- such that the real allocation is less than or equal to @allocatedBytes@, -- unless @allocatedBytes@ is less than the size of one array element in which -- case it returns one element's size. -- {-# INLINE allocBytesToBytes #-} allocBytesToBytes :: forall a. Storable a => a -> Int -> Int allocBytesToBytes _ n = max (arrayPayloadSize n) (SIZE_OF(a)) -- | Given a 'Storable' type (unused first arg) and real allocation size -- (including overhead), return how many elements of that type will completely -- fit in it, returns at least 1. -- {-# INLINE allocBytesToElemCount #-} allocBytesToElemCount :: Storable a => a -> Int -> Int allocBytesToElemCount x bytes = let n = bytesToElemCount x (allocBytesToBytes x bytes) in assert (n >= 1) n -- | The default chunk size by which the array creation routines increase the -- size of the array when the array is grown linearly. arrayChunkBytes :: Int arrayChunkBytes = 1024 ------------------------------------------------------------------------------- -- Resizing ------------------------------------------------------------------------------- -- | Round the second argument down to multiples of the first argument. {-# INLINE roundDownTo #-} roundDownTo :: Int -> Int -> Int roundDownTo elemSize size = size - (size `mod` elemSize) -- XXX See if resizing can be implemented by reading the old array as a stream -- and then using writeN to the new array. 
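-- An illustrative sanity check of the sizing helpers above (relies on the
-- doctest setup imports; the expected values follow directly from the
-- constants defined in this module):
--
-- >>> Array.blockSize
-- 4096
-- >>> Array.roundUpToPower2 1000
-- 1024
-- >>> Array.roundUpToPower2 1024
-- 1024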
-- {-# NOINLINE reallocAligned #-} reallocAligned :: Int -> Int -> Int -> Array a -> IO (Array a) reallocAligned elemSize alignSize newCapacity Array{..} = do assert (aEnd <= aBound) (return ()) -- Allocate new array let newCapMax = roundUpLargeArray newCapacity (contents, pNew) <- newAlignedArrayContents newCapMax alignSize -- Copy old data let oldStart = arrStart oldSize = aEnd `minusPtr` oldStart newCap = roundDownTo elemSize newCapMax newLen = min oldSize newCap assert (oldSize `mod` elemSize == 0) (return ()) assert (newLen >= 0) (return ()) assert (newLen `mod` elemSize == 0) (return ()) memcpy (castPtr pNew) (castPtr oldStart) newLen touch arrContents return $ Array { arrStart = pNew , arrContents = contents , aEnd = pNew `plusPtr` newLen , aBound = pNew `plusPtr` newCap } -- | @realloc newCapacity array@ reallocates the array to the specified -- capacity in bytes. -- -- If the new size is less than the original array the array gets truncated. -- If the new size is not a multiple of array element size then it is rounded -- down to multiples of array size. If the new size is more than -- 'largeObjectThreshold' then it is rounded up to the block size (4K). -- {-# INLINABLE realloc #-} realloc :: forall m a. (MonadIO m, Storable a) => Int -> Array a -> m (Array a) realloc n arr = liftIO $ reallocAligned (SIZE_OF(a)) (alignment (undefined :: a)) n arr -- | @reallocWith label capSizer minIncrement array@. The label is used -- in error messages and the capSizer is used to determine the capacity of the -- new array in bytes given the current byte length of the array. reallocWith :: forall m a. (MonadIO m , Storable a) => String -> (Int -> Int) -> Int -> Array a -> m (Array a) reallocWith label capSizer minIncr arr = do let oldSize = aEnd arr `minusPtr` arrStart arr newCap = capSizer oldSize newSize = oldSize + minIncr safeCap = max newCap newSize assert (newCap >= newSize || error (badSize newSize)) (return ()) realloc safeCap arr where badSize newSize = concat [ label , ": new array size is less than required size " , show newSize , ". Please check the sizing function passed." ] -- | @resize newCapacity array@ changes the total capacity of the array so that -- it is enough to hold the specified number of elements. Nothing is done if -- the specified capacity is less than the length of the array. -- -- If the capacity is more than 'largeObjectThreshold' then it is rounded up to -- the block size (4K). -- -- /Pre-release/ {-# INLINE resize #-} resize :: forall m a. (MonadIO m, Storable a) => Int -> Array a -> m (Array a) resize n arr@Array{..} = do let req = SIZE_OF(a) * n len = aEnd `minusPtr` arrStart if req < len then return arr else realloc req arr -- | Like 'resize' but if the capacity is more than 'largeObjectThreshold' then -- it is rounded up to the closest power of 2. -- -- /Pre-release/ {-# INLINE resizeExp #-} resizeExp :: forall m a. (MonadIO m, Storable a) => Int -> Array a -> m (Array a) resizeExp n arr@Array{..} = do let req = roundUpLargeArray (SIZE_OF(a) * n) req1 = if req > largeObjectThreshold then roundUpToPower2 req else req len = aEnd `minusPtr` arrStart if req1 < len then return arr else realloc req1 arr -- | Resize the allocated memory to drop any reserved free space at the end of -- the array and reallocate it to reduce wastage. -- -- Up to 25% wastage is allowed to avoid reallocations. If the capacity is -- more than 'largeObjectThreshold' then free space up to the 'blockSize' is -- retained. -- -- /Pre-release/ {-# INLINE rightSize #-} rightSize :: forall m a. 
(MonadIO m, Storable a) => Array a -> m (Array a) rightSize arr@Array{..} = do assert (aEnd <= aBound) (return ()) let start = arrStart len = aEnd `minusPtr` start capacity = aBound `minusPtr` start target = roundUpLargeArray len waste = aBound `minusPtr` aEnd assert (target >= len) (return ()) assert (len `mod` SIZE_OF(a) == 0) (return ()) -- We trade off some wastage (25%) to avoid reallocations and copying. if target < capacity && len < 3 * waste then realloc target arr else return arr ------------------------------------------------------------------------------- -- Snoc ------------------------------------------------------------------------------- -- XXX We can possibly use a smallMutableByteArray to hold the start, end, -- bound pointers. Using fully mutable handle will ensure that we do not have -- multiple references to the same array of different lengths lying around and -- potentially misused. In that case "snoc" need not return a new array (snoc -- :: Array a -> a -> m ()), it will just modify the old reference. The array -- length will be mutable. This means the length function would also be -- monadic. Mutable arrays would behave more like files that grow in that -- case. -- | Snoc using a 'Ptr'. Low level reusable function. -- -- /Internal/ {-# INLINE snocNewEnd #-} snocNewEnd :: (MonadIO m, Storable a) => Ptr a -> Array a -> a -> m (Array a) snocNewEnd newEnd arr@Array{..} x = liftIO $ do assert (newEnd <= aBound) (return ()) poke aEnd x touch arrContents return $ arr {aEnd = newEnd} -- | Really really unsafe, appends the element into the first array, may -- cause silent data corruption or if you are lucky a segfault if the first -- array does not have enough space to append the element. -- -- /Internal/ {-# INLINE snocUnsafe #-} snocUnsafe :: forall m a. (MonadIO m, Storable a) => Array a -> a -> m (Array a) snocUnsafe arr@Array{..} = snocNewEnd (PTR_NEXT(aEnd,a)) arr -- | Like 'snoc' but does not reallocate when pre-allocated array capacity -- becomes full. -- -- /Internal/ {-# INLINE snocMay #-} snocMay :: forall m a. (MonadIO m, Storable a) => Array a -> a -> m (Maybe (Array a)) snocMay arr@Array{..} x = liftIO $ do let newEnd = PTR_NEXT(aEnd,a) if newEnd <= aBound then Just <$> snocNewEnd newEnd arr x else return Nothing -- NOINLINE to move it out of the way and not pollute the instruction cache. {-# NOINLINE snocWithRealloc #-} snocWithRealloc :: forall m a. (MonadIO m, Storable a) => (Int -> Int) -> Array a -> a -> m (Array a) snocWithRealloc sizer arr x = do arr1 <- liftIO $ reallocWith "snocWith" sizer (SIZE_OF(a)) arr snocUnsafe arr1 x -- | @snocWith sizer arr elem@ mutates @arr@ to append @elem@. The length of -- the array increases by 1. -- -- If there is no reserved space available in @arr@ it is reallocated to a size -- in bytes determined by the @sizer oldSizeBytes@ function, where -- @oldSizeBytes@ is the original size of the array in bytes. -- -- If the new array size is more than 'largeObjectThreshold' we automatically -- round it up to 'blockSize'. -- -- Note that the returned array may be a mutated version of the original array. -- -- /Pre-release/ {-# INLINE snocWith #-} snocWith :: forall m a. (MonadIO m, Storable a) => (Int -> Int) -> Array a -> a -> m (Array a) snocWith allocSize arr x = liftIO $ do let newEnd = PTR_NEXT(aEnd arr,a) if newEnd <= aBound arr then snocNewEnd newEnd arr x else snocWithRealloc allocSize arr x -- | The array is mutated to append an additional element to it. 
If there -- is no reserved space available in the array then it is reallocated to grow -- it by 'arrayChunkBytes' rounded up to 'blockSize' when the size becomes more -- than 'largeObjectThreshold'. -- -- Note that the returned array may be a mutated version of the original array. -- -- Performs O(n^2) copies to grow but is thrifty on memory. -- -- /Pre-release/ {-# INLINE snocLinear #-} snocLinear :: forall m a. (MonadIO m, Storable a) => Array a -> a -> m (Array a) snocLinear = snocWith (+ allocBytesToBytes (undefined :: a) arrayChunkBytes) -- | The array is mutated to append an additional element to it. If there is no -- reserved space available in the array then it is reallocated to double the -- original size. -- -- This is useful to reduce allocations when appending unknown number of -- elements. -- -- Note that the returned array may be a mutated version of the original array. -- -- >>> snoc = Array.snocWith (* 2) -- -- Performs O(n * log n) copies to grow, but is liberal with memory allocation. -- -- /Pre-release/ {-# INLINE snoc #-} snoc :: forall m a. (MonadIO m, Storable a) => Array a -> a -> m (Array a) snoc = snocWith f where f oldSize = if isPower2 oldSize then oldSize * 2 else roundUpToPower2 oldSize * 2 ------------------------------------------------------------------------------- -- Random reads ------------------------------------------------------------------------------- -- | Return the element at the specified index without checking the bounds. -- -- Unsafe because it does not check the bounds of the array. {-# INLINE_NORMAL getIndexUnsafe #-} getIndexUnsafe :: forall m a. (MonadIO m, Storable a) => Int -> Array a -> m a getIndexUnsafe i arr@(Array {..}) = asPtrUnsafe arr $ \ptr -> do let elemPtr = PTR_INDEX(ptr,i,a) assert (i >= 0 && PTR_VALID(elemPtr,aEnd,a)) (return ()) liftIO $ peek elemPtr {-# INLINE getIndexPtr #-} getIndexPtr :: forall m a. (MonadIO m, Storable a) => Ptr a -> Ptr a -> Int -> m a getIndexPtr start end i = do let elemPtr = PTR_INDEX(start,i,a) if i >= 0 && PTR_VALID(elemPtr,end,a) then liftIO $ peek elemPtr else invalidIndex "getIndexPtr" i -- | /O(1)/ Lookup the element at the given index. Index starts from 0. -- {-# INLINE getIndex #-} getIndex :: (MonadIO m, Storable a) => Int -> Array a -> m a getIndex i arr = asPtrUnsafe arr $ \p -> getIndexPtr p (aEnd arr) i {-# INLINE getIndexPtrRev #-} getIndexPtrRev :: forall m a. (MonadIO m, Storable a) => Ptr a -> Ptr a -> Int -> m a getIndexPtrRev start end i = do let elemPtr = PTR_RINDEX(end,i,a) if i >= 0 && elemPtr >= start then liftIO $ peek elemPtr else invalidIndex "getIndexPtrRev" i -- | /O(1)/ Lookup the element at the given index from the end of the array. -- Index starts from 0. -- -- Slightly faster than computing the forward index and using getIndex. -- {-# INLINE getIndexRev #-} getIndexRev :: (MonadIO m, Storable a) => Int -> Array a -> m a getIndexRev i arr = asPtrUnsafe arr $ \p -> getIndexPtrRev p (aEnd arr) i data GetIndicesState contents start end st = GetIndicesState contents start end st -- | Given an unfold that generates array indices, read the elements on those -- indices from the supplied Array. An error is thrown if an index is out of -- bounds. -- -- /Pre-release/ {-# INLINE getIndicesD #-} getIndicesD :: (Monad m, Storable a) => (forall b. 
IO b -> m b) -> D.Stream m Int -> Unfold m (Array a) a getIndicesD liftio (D.Stream stepi sti) = Unfold step inject where inject (Array contents start (Ptr end) _) = return $ GetIndicesState contents start (Ptr end) sti {-# INLINE_LATE step #-} step (GetIndicesState contents start end st) = do r <- stepi defState st case r of D.Yield i s -> do x <- liftio $ getIndexPtr start end i return $ D.Yield x (GetIndicesState contents start end s) D.Skip s -> return $ D.Skip (GetIndicesState contents start end s) D.Stop -> do liftio $ touch contents return D.Stop {-# INLINE getIndices #-} getIndices :: (MonadIO m, Storable a) => SerialT m Int -> Unfold m (Array a) a getIndices (SerialT stream) = getIndicesD liftIO $ D.fromStreamK stream ------------------------------------------------------------------------------- -- Subarrays ------------------------------------------------------------------------------- -- XXX We can also get immutable slices. -- | /O(1)/ Slice an array in constant time. -- -- Unsafe: The bounds of the slice are not checked. -- -- /Unsafe/ -- -- /Pre-release/ {-# INLINE getSliceUnsafe #-} getSliceUnsafe :: forall a. Storable a => Int -- ^ from index -> Int -- ^ length of the slice -> Array a -> Array a getSliceUnsafe index len (Array contents start e _) = let fp1 = PTR_INDEX(start,index,a) end = fp1 `plusPtr` (len * SIZE_OF(a)) in assert (index >= 0 && len >= 0 && end <= e) (Array contents fp1 end end) -- | /O(1)/ Slice an array in constant time. Throws an error if the slice -- extends out of the array bounds. -- -- /Pre-release/ {-# INLINE getSlice #-} getSlice :: forall a. Storable a => Int -- ^ from index -> Int -- ^ length of the slice -> Array a -> Array a getSlice index len (Array contents start e _) = let fp1 = PTR_INDEX(start,index,a) end = fp1 `plusPtr` (len * SIZE_OF(a)) in if index >= 0 && len >= 0 && end <= e then Array contents fp1 end end else error $ "getSlice: invalid slice, index " ++ show index ++ " length " ++ show len ------------------------------------------------------------------------------- -- In-place mutation algorithms ------------------------------------------------------------------------------- -- XXX consider the bulk update/accumulation/permutation APIs from vector. -- | You may not need to reverse an array because you can consume it in reverse -- using 'readRev'. To reverse large arrays you can read in reverse and write -- to another array. However, in-place reverse can be useful to take adavantage -- of cache locality and when you do not want to allocate additional memory. -- -- /Pre-release/ {-# INLINE reverse #-} reverse :: forall m a. (MonadIO m, Storable a) => Array a -> m () reverse Array{..} = liftIO $ do let l = arrStart h = PTR_PREV(aEnd,a) in swap l h where swap l h = do when (l < h) $ do swapPtrs l h swap (PTR_NEXT(l,a)) (PTR_PREV(h,a)) -- | Generate the next permutation of the sequence, returns False if this is -- the last permutation. -- -- /Unimplemented/ {-# INLINE permute #-} permute :: Array a -> m Bool permute = undefined -- | Partition an array into two halves using a partitioning predicate. The -- first half retains values where the predicate is 'False' and the second half -- retains values where the predicate is 'True'. -- -- /Pre-release/ {-# INLINE partitionBy #-} partitionBy :: forall m a. 
(MonadIO m, Storable a) => (a -> Bool) -> Array a -> m (Array a, Array a) partitionBy f arr@Array{..} = liftIO $ do if arrStart >= aEnd then return (arr, arr) else do ptr <- go arrStart (PTR_PREV(aEnd,a)) let pl = Array arrContents arrStart ptr ptr pr = Array arrContents ptr aEnd aEnd return (pl, pr) where -- Invariant low < high on entry, and on return as well moveHigh low high = do h <- peek high if f h then -- Correctly classified, continue the loop let high1 = PTR_PREV(high,a) in if low == high1 then return Nothing else moveHigh low high1 else return (Just (high, h)) -- incorrectly classified -- Keep a low pointer starting at the start of the array (first partition) -- and a high pointer starting at the end of the array (second partition). -- Keep incrementing the low ptr and decrementing the high ptr until both -- are wrongly classified, at that point swap the two and continue until -- the two pointer cross each other. -- -- Invariants when entering this loop: -- low <= high -- Both low and high are valid locations within the array go low high = do l <- peek low if f l then -- low is wrongly classified if low == high then return low else do -- low < high r <- moveHigh low high case r of Nothing -> return low Just (high1, h) -> do -- low < high1 poke low h poke high1 l let low1 = PTR_NEXT(low,a) high2 = PTR_PREV(high1,a) if low1 <= high2 then go low1 high2 else return low1 -- low1 > high2 else do -- low is correctly classified let low1 = PTR_NEXT(low,a) if low == high then return low1 else go low1 high -- | Shuffle corresponding elements from two arrays using a shuffle function. -- If the shuffle function returns 'False' then do nothing otherwise swap the -- elements. This can be used in a bottom up fold to shuffle or reorder the -- elements. -- -- /Unimplemented/ {-# INLINE shuffleBy #-} shuffleBy :: (a -> a -> m Bool) -> Array a -> Array a -> m () shuffleBy = undefined -- XXX we can also make the folds partial by stopping at a certain level. -- -- | @divideBy level partition array@ performs a top down hierarchical -- recursive partitioning fold of items in the container using the given -- function as the partition function. Level indicates the level in the tree -- where the fold would stop. -- -- This performs a quick sort if the partition function is -- 'partitionBy (< pivot)'. -- -- /Unimplemented/ {-# INLINABLE divideBy #-} divideBy :: Int -> (Array a -> m (Array a, Array a)) -> Array a -> m () divideBy = undefined -- | @mergeBy level merge array@ performs a pairwise bottom up fold recursively -- merging the pairs using the supplied merge function. Level indicates the -- level in the tree where the fold would stop. -- -- This performs a random shuffle if the merge function is random. If we -- stop at level 0 and repeatedly apply the function then we can do a bubble -- sort. -- -- /Unimplemented/ mergeBy :: Int -> (Array a -> Array a -> m ()) -> Array a -> m () mergeBy = undefined ------------------------------------------------------------------------------- -- Size ------------------------------------------------------------------------------- -- | /O(1)/ Get the byte length of the array. -- -- @since 0.7.0 {-# INLINE byteLength #-} byteLength :: Array a -> Int byteLength Array{..} = let len = aEnd `minusPtr` arrStart in assert (len >= 0) len -- Note: try to avoid the use of length in performance sensitive internal -- routines as it involves a costly 'div' operation. Instead use the end ptr -- int he array to check the bounds etc. -- -- | /O(1)/ Get the length of the array i.e. 
the number of elements in the -- array. -- -- Note that 'byteLength' is less expensive than this operation, as 'length' -- involves a costly division operation. -- -- @since 0.7.0 {-# INLINE length #-} length :: forall a. Storable a => Array a -> Int length arr = let elemSize = SIZE_OF(a) blen = byteLength arr in assert (blen `mod` elemSize == 0) (blen `div` elemSize) -- | Get the total capacity of an array. An array may have space reserved -- beyond the current used length of the array. -- -- /Pre-release/ {-# INLINE byteCapacity #-} byteCapacity :: Array a -> Int byteCapacity Array{..} = let len = aBound `minusPtr` arrStart in assert (len >= 0) len -- | The remaining capacity in the array for appending more elements without -- reallocation. -- -- /Pre-release/ {-# INLINE bytesFree #-} bytesFree :: Array a -> Int bytesFree Array{..} = let n = aBound `minusPtr` aEnd in assert (n >= 0) n ------------------------------------------------------------------------------- -- Streams of arrays - Creation ------------------------------------------------------------------------------- data GroupState s contents start end bound = GroupStart s | GroupBuffer s contents start end bound | GroupYield contents start end bound (GroupState s contents start end bound) | GroupFinish -- | @arraysOf n stream@ groups the input stream into a stream of -- arrays of size n. -- -- @arraysOf n = StreamD.foldMany (Array.writeN n)@ -- -- /Pre-release/ {-# INLINE_NORMAL arraysOf #-} arraysOf :: forall m a. (MonadIO m, Storable a) => Int -> D.Stream m a -> D.Stream m (Array a) -- XXX the idiomatic implementation leads to large regression in the D.reverse' -- benchmark. It seems it has difficulty producing optimized code when -- converting to StreamK. Investigate GHC optimizations. -- arraysOf n = D.foldMany (writeN n) arraysOf n (D.Stream step state) = D.Stream step' (GroupStart state) where {-# INLINE_LATE step' #-} step' _ (GroupStart st) = do when (n <= 0) $ -- XXX we can pass the module string from the higher level API error $ "Streamly.Internal.Data.Array.Foreign.Mut.Type.arraysOf: " ++ "the size of arrays [" ++ show n ++ "] must be a natural number" Array contents start end bound <- liftIO $ newArray n return $ D.Skip (GroupBuffer st contents start end bound) step' gst (GroupBuffer st contents start end bound) = do r <- step (adaptState gst) st case r of D.Yield x s -> do liftIO $ poke end x >> touch contents let end1 = PTR_NEXT(end,a) return $ if end1 >= bound then D.Skip (GroupYield contents start end1 bound (GroupStart s)) else D.Skip (GroupBuffer s contents start end1 bound) D.Skip s -> return $ D.Skip (GroupBuffer s contents start end bound) D.Stop -> return $ D.Skip (GroupYield contents start end bound GroupFinish) step' _ (GroupYield contents start end bound next) = return $ D.Yield (Array contents start end bound) next step' _ GroupFinish = return D.Stop -- XXX buffer to a list instead? -- | Buffer the stream into arrays in memory. {-# INLINE arrayStreamKFromStreamD #-} arrayStreamKFromStreamD :: forall m a. (MonadIO m, Storable a) => D.Stream m a -> m (K.Stream m (Array a)) arrayStreamKFromStreamD = let n = allocBytesToElemCount (undefined :: a) defaultChunkSize in D.foldr K.cons K.nil . 
arraysOf n ------------------------------------------------------------------------------- -- Streams of arrays - Flattening ------------------------------------------------------------------------------- data FlattenState s contents a = OuterLoop s | InnerLoop s contents !(Ptr a) !(Ptr a) -- | Use the "read" unfold instead. -- -- @flattenArrays = unfoldMany read@ -- -- We can try this if there are any fusion issues in the unfold. -- {-# INLINE_NORMAL flattenArrays #-} flattenArrays :: forall m a. (MonadIO m, Storable a) => D.Stream m (Array a) -> D.Stream m a flattenArrays (D.Stream step state) = D.Stream step' (OuterLoop state) where {-# INLINE_LATE step' #-} step' gst (OuterLoop st) = do r <- step (adaptState gst) st return $ case r of D.Yield Array{..} s -> D.Skip (InnerLoop s arrContents arrStart aEnd) D.Skip s -> D.Skip (OuterLoop s) D.Stop -> D.Stop step' _ (InnerLoop st _ p end) | assert (p <= end) (p == end) = return $ D.Skip $ OuterLoop st step' _ (InnerLoop st contents p end) = do x <- liftIO $ do r <- peek p touch contents return r return $ D.Yield x (InnerLoop st contents (PTR_NEXT(p,a)) end) -- | Use the "readRev" unfold instead. -- -- @flattenArrays = unfoldMany readRev@ -- -- We can try this if there are any fusion issues in the unfold. -- {-# INLINE_NORMAL flattenArraysRev #-} flattenArraysRev :: forall m a. (MonadIO m, Storable a) => D.Stream m (Array a) -> D.Stream m a flattenArraysRev (D.Stream step state) = D.Stream step' (OuterLoop state) where {-# INLINE_LATE step' #-} step' gst (OuterLoop st) = do r <- step (adaptState gst) st return $ case r of D.Yield Array{..} s -> let p = PTR_PREV(aEnd,a) in D.Skip (InnerLoop s arrContents p arrStart) D.Skip s -> D.Skip (OuterLoop s) D.Stop -> D.Stop step' _ (InnerLoop st _ p start) | p < start = return $ D.Skip $ OuterLoop st step' _ (InnerLoop st contents p start) = do x <- liftIO $ do r <- peek p touch contents return r let cur = PTR_PREV(p,a) return $ D.Yield x (InnerLoop st contents cur start) ------------------------------------------------------------------------------- -- Unfolds ------------------------------------------------------------------------------- data ReadUState a = ReadUState UNPACKIF !ArrayContents -- contents !(Ptr a) -- end address !(Ptr a) -- current address toReadUState :: Array a -> ReadUState a toReadUState (Array contents start end _) = ReadUState contents end start -- | Resumable unfold of an array. -- {-# INLINE_NORMAL producer #-} producer :: forall m a. (MonadIO m, Storable a) => Producer m (Array a) a producer = Producer step (return . toReadUState) extract where {-# INLINE_LATE step #-} step (ReadUState contents end cur) | assert (cur <= end) (cur == end) = do liftIO $ touch contents return D.Stop step (ReadUState contents end cur) = do !x <- liftIO $ peek cur return $ D.Yield x (ReadUState contents end (PTR_NEXT(cur,a))) extract (ReadUState contents end cur) = return $ Array contents cur end end -- | Unfold an array into a stream. -- -- @since 0.7.0 {-# INLINE_NORMAL read #-} read :: forall m a. (MonadIO m, Storable a) => Unfold m (Array a) a read = Producer.simplify producer -- | Unfold an array into a stream in reverse order. -- -- /Pre-release/ {-# INLINE_NORMAL readRev #-} readRev :: forall m a. 
(MonadIO m, Storable a) => Unfold m (Array a) a readRev = Unfold step inject where inject (Array contents start end _) = let p = PTR_PREV(end,a) in return $ ReadUState contents start p {-# INLINE_LATE step #-} step (ReadUState contents start p) | p < start = do liftIO $ touch contents return D.Stop step (ReadUState contents start p) = do x <- liftIO $ peek p return $ D.Yield x (ReadUState contents start (PTR_PREV(p,a))) ------------------------------------------------------------------------------- -- to Lists and streams ------------------------------------------------------------------------------- {- -- Use foldr/build fusion to fuse with list consumers -- This can be useful when using the IsList instance {-# INLINE_LATE toListFB #-} toListFB :: forall a b. Storable a => (a -> b -> b) -> b -> Array a -> b toListFB c n Array{..} = go arrStart where go p | assert (p <= aEnd) (p == aEnd) = n go p = -- unsafeInlineIO allows us to run this in Identity monad for pure -- toList/foldr case which makes them much faster due to not -- accumulating the list and fusing better with the pure consumers. -- -- This should be safe as the array contents are guaranteed to be -- evaluated/written to before we peek at them. -- XXX let !x = unsafeInlineIO $ do r <- peek p touch arrContents return r in c x (go (PTR_NEXT(p,a))) -} -- XXX Monadic foldr/build fusion? -- Reference: https://www.researchgate.net/publication/220676509_Monadic_augment_and_generalised_short_cut_fusion -- | Convert an 'Array' into a list. -- -- @since 0.7.0 {-# INLINE toList #-} toList :: forall m a. (MonadIO m, Storable a) => Array a -> m [a] toList Array{..} = liftIO $ go arrStart where go p | assert (p <= aEnd) (p == aEnd) = return [] go p = do x <- peek p touch arrContents (:) x <$> go (PTR_NEXT(p,a)) -- | Use the 'read' unfold instead. -- -- @toStreamD = D.unfold read@ -- -- We can try this if the unfold has any performance issues. {-# INLINE_NORMAL toStreamD #-} toStreamD :: forall m a. (MonadIO m, Storable a) => Array a -> D.Stream m a toStreamD Array{..} = D.Stream step arrStart where {-# INLINE_LATE step #-} step _ p | assert (p <= aEnd) (p == aEnd) = return D.Stop step _ p = liftIO $ do r <- peek p touch arrContents return $ D.Yield r (PTR_NEXT(p,a)) {-# INLINE toStreamK #-} toStreamK :: forall m a. (MonadIO m, Storable a) => Array a -> K.Stream m a toStreamK Array{..} = go arrStart where go p | assert (p <= aEnd) (p == aEnd) = K.nil | otherwise = let elemM = do r <- peek p touch arrContents return r in liftIO elemM `K.consM` go (PTR_NEXT(p,a)) -- | Use the 'readRev' unfold instead. -- -- @toStreamDRev = D.unfold readRev@ -- -- We can try this if the unfold has any perf issues. {-# INLINE_NORMAL toStreamDRev #-} toStreamDRev :: forall m a. (MonadIO m, Storable a) => Array a -> D.Stream m a toStreamDRev Array{..} = let p = PTR_PREV(aEnd,a) in D.Stream step p where {-# INLINE_LATE step #-} step _ p | p < arrStart = return D.Stop step _ p = liftIO $ do r <- peek p touch arrContents return $ D.Yield r (PTR_PREV(p,a)) {-# INLINE toStreamKRev #-} toStreamKRev :: forall m a. 
(MonadIO m, Storable a) => Array a -> K.Stream m a toStreamKRev Array {..} = let p = PTR_PREV(aEnd,a) in go p where go p | p < arrStart = K.nil | otherwise = let elemM = do r <- peek p touch arrContents return r in liftIO elemM `K.consM` go (PTR_PREV(p,a)) ------------------------------------------------------------------------------- -- Folding ------------------------------------------------------------------------------- -- XXX Need something like "Array m a" enforcing monadic action to avoid the -- possibility of such APIs. -- -- | Strict left fold of an array. {-# INLINE_NORMAL foldl' #-} foldl' :: (MonadIO m, Storable a) => (b -> a -> b) -> b -> Array a -> m b foldl' f z arr = D.foldl' f z $ toStreamD arr -- | Right fold of an array. {-# INLINE_NORMAL foldr #-} foldr :: (MonadIO m, Storable a) => (a -> b -> b) -> b -> Array a -> m b foldr f z arr = D.foldr f z $ toStreamD arr ------------------------------------------------------------------------------- -- Folds ------------------------------------------------------------------------------- data ArrayUnsafe a = ArrayUnsafe UNPACKIF !ArrayContents -- contents {-# UNPACK #-} !(Ptr a) -- start address {-# UNPACK #-} !(Ptr a) -- first unused address toArrayUnsafe :: Array a -> ArrayUnsafe a toArrayUnsafe (Array contents start end _) = ArrayUnsafe contents start end fromArrayUnsafe :: #ifdef DEVBUILD Storable a => #endif ArrayUnsafe a -> Array a fromArrayUnsafe (ArrayUnsafe contents start end) = Array contents start end end -- Note: Arrays may be allocated with a specific alignment at the beginning of -- the array. If you need to maintain that alignment on reallocations then you -- can resize the array manually before append, using an aligned resize -- operation. -- XXX Keep the bound intact to not lose any free space? Perf impact? -- | Append up to @n@ input items to the supplied array. -- -- Unsafe: Do not drive the fold beyond @n@ elements, it will lead to memory -- corruption or segfault. -- -- Any free space left in the array after appending @n@ elements is lost. -- -- /Internal/ {-# INLINE_NORMAL appendNUnsafe #-} appendNUnsafe :: forall m a. (MonadIO m, Storable a) => m (Array a) -> Int -> Fold m a (Array a) appendNUnsafe action n = fmap fromArrayUnsafe $ FL.foldlM' step initial where initial = do assert (n >= 0) (return ()) arr@(Array _ _ end bound) <- action let free = bound `minusPtr` end needed = n * SIZE_OF(a) -- XXX We can also reallocate if the array has too much free space, -- otherwise we lose that space. arr1 <- if free < needed then noinline reallocWith "appendNUnsafeWith" (+ needed) needed arr else return arr return $ toArrayUnsafe arr1 step (ArrayUnsafe contents start end) x = do liftIO $ poke end x >> touch contents return $ ArrayUnsafe contents start (PTR_NEXT(end,a)) -- | Append @n@ elements to an existing array. Any free space left in the array -- after appending @n@ elements is lost. -- -- >>> appendN initial n = Fold.take n (Array.appendNUnsafe initial n) -- -- /Pre-release/ {-# INLINE_NORMAL appendN #-} appendN :: forall m a. (MonadIO m, Storable a) => m (Array a) -> Int -> Fold m a (Array a) appendN initial n = FL.take n (appendNUnsafe initial n) -- | @appendWith realloc action@ mutates the array generated by @action@ to -- append the input stream. If there is no reserved space available in the -- array it is reallocated to a size in bytes determined by @realloc oldSize@, -- where @oldSize@ is the current size of the array in bytes. -- -- Note that the returned array may be a mutated version of original array. 
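--
-- An illustrative usage sketch (relies on the doctest setup imports): grow the
-- array by at least 64 bytes, doubling it once it is big enough, whenever it
-- runs out of space:
--
-- >>> grow oldBytes = oldBytes + max oldBytes 64
-- >>> arr <- Stream.fold (Array.appendWith grow (Array.newArray 4)) (Stream.fromList [1..100 :: Int])
-- >>> Array.length arr
-- 100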
-- -- >>> appendWith sizer = Fold.foldlM' (Array.snocWith sizer) -- -- /Pre-release/ {-# INLINE appendWith #-} appendWith :: forall m a. (MonadIO m, Storable a) => (Int -> Int) -> m (Array a) -> Fold m a (Array a) appendWith sizer = FL.foldlM' (snocWith sizer) -- | @append action@ mutates the array generated by @action@ to append the -- input stream. If there is no reserved space available in the array it is -- reallocated to double the size. -- -- Note that the returned array may be a mutated version of original array. -- -- >>> append = Array.appendWith (* 2) -- -- /Pre-release/ {-# INLINE append #-} append :: forall m a. (MonadIO m, Storable a) => m (Array a) -> Fold m a (Array a) append = appendWith (* 2) -- XXX We can carry bound as well in the state to make sure we do not lose the -- remaining capacity. Need to check perf impact. -- -- | Like 'writeNUnsafe' but takes a new array allocator @alloc size@ function -- as argument. -- -- >>> writeNWithUnsafe alloc n = Array.appendNUnsafe (alloc n) n -- -- /Pre-release/ {-# INLINE_NORMAL writeNWithUnsafe #-} writeNWithUnsafe :: forall m a. (MonadIO m, Storable a) => (Int -> m (Array a)) -> Int -> Fold m a (Array a) writeNWithUnsafe alloc n = Fold step initial (return . fromArrayUnsafe) where initial = FL.Partial . toArrayUnsafe <$> alloc (max n 0) step (ArrayUnsafe contents start end) x = do liftIO $ poke end x >> touch contents return $ FL.Partial $ ArrayUnsafe contents start (PTR_NEXT(end,a)) -- | Like 'writeN' but does not check the array bounds when writing. The fold -- driver must not call the step function more than 'n' times otherwise it will -- corrupt the memory and crash. This function exists mainly because any -- conditional in the step function blocks fusion causing 10x performance -- slowdown. -- -- >>> writeNUnsafe = Array.writeNWithUnsafe Array.newArray -- -- @since 0.7.0 {-# INLINE_NORMAL writeNUnsafe #-} writeNUnsafe :: forall m a. (MonadIO m, Storable a) => Int -> Fold m a (Array a) writeNUnsafe = writeNWithUnsafe newArray -- | @writeNWith alloc n@ folds a maximum of @n@ elements into an array -- allocated using the @alloc@ function. -- -- >>> writeNWith alloc n = Fold.take n (Array.writeNWithUnsafe alloc n) -- >>> writeNWith alloc n = Array.appendN (alloc n) n -- {-# INLINE_NORMAL writeNWith #-} writeNWith :: forall m a. (MonadIO m, Storable a) => (Int -> m (Array a)) -> Int -> Fold m a (Array a) writeNWith alloc n = FL.take n (writeNWithUnsafe alloc n) -- | @writeN n@ folds a maximum of @n@ elements from the input stream to an -- 'Array'. -- -- >>> writeN = Array.writeNWith Array.newArray -- >>> writeN n = Fold.take n (Array.writeNUnsafe n) -- >>> writeN n = Array.appendN (Array.newArray n) n -- -- @since 0.7.0 {-# INLINE_NORMAL writeN #-} writeN :: forall m a. (MonadIO m, Storable a) => Int -> Fold m a (Array a) writeN = writeNWith newArray -- | @writeNAligned align n@ folds a maximum of @n@ elements from the input -- stream to an 'Array' aligned to the given size. -- -- >>> writeNAligned align = Array.writeNWith (Array.newArrayAligned align) -- >>> writeNAligned align n = Array.appendN (Array.newArrayAligned align n) n -- -- /Pre-release/ -- {-# INLINE_NORMAL writeNAligned #-} writeNAligned :: forall m a. 
(MonadIO m, Storable a) => Int -> Int -> Fold m a (Array a) writeNAligned align = writeNWith (newArrayAligned align) -- | @writeNAlignedUnmanaged align n@ folds a maximum of @n@ elements from the -- input stream to an 'Array' whose starting address is aligned to @align@ -- bytes and is allocated using unmanaged memory (never freed). This could be -- useful to allocate memory that we need to allocate only once in the lifetime -- of the program. -- -- >>> f = Array.newArrayAlignedUnmanaged -- >>> writeNAlignedUnmanaged a = Array.writeNWith (f a) -- >>> writeNAlignedUnmanaged a n = Array.appendN (f a n) n -- -- /Pre-release/ -- {-# INLINE_NORMAL writeNAlignedUnmanaged #-} writeNAlignedUnmanaged :: forall m a. (MonadIO m, Storable a) => Int -> Int -> Fold m a (Array a) writeNAlignedUnmanaged align = writeNWith (newArrayAlignedUnmanaged align) -- XXX Buffer to a list instead? -- -- | Buffer a stream into a stream of arrays. -- -- >>> writeChunks n = Fold.many (Array.writeN n) Fold.toStreamK -- -- Breaking an array into an array stream can be useful to consume a large -- array sequentially such that the memory of the array is released incrementally. -- -- See also: 'arrayStreamKFromStreamD'. -- -- /Unimplemented/ -- {-# INLINE_NORMAL writeChunks #-} writeChunks :: (MonadIO m, Storable a) => Int -> Fold m a (K.Stream n (Array a)) writeChunks n = FL.many (writeN n) FL.toStreamK -- XXX Compare writeWith with fromStreamD which uses an array of streams -- implementation. We can write this using writeChunks above if that is faster. -- If writeWith is faster then we should use that to implement -- fromStreamD. -- -- XXX The realloc based implementation needs to make one extra copy if we use -- shrinkToFit. On the other hand, the stream of arrays implementation may -- buffer the array chunk pointers in memory but it does not have to shrink as -- we know the exact size in the end. However, memory copying does not seem to -- be as expensive as the allocations. Therefore, we need to reduce the number -- of allocations instead. Also, the size of allocations matters; right sizing -- an allocation even at the cost of copying seems to help. Should be measured -- on a big stream with heavy calls to toArray to see the effect. -- -- XXX check if GHC's memory allocator is efficient enough. We can try the C -- malloc to compare against. -- | @writeWith minCount@ folds the whole input to a single array. The array -- starts at a size big enough to hold minCount elements; the size is doubled -- every time the array needs to be grown. -- -- /Caution! Do not use this on infinite streams./ -- -- >>> f n = Array.appendWith (* 2) (Array.newArray n) -- >>> writeWith n = Fold.rmapM Array.rightSize (f n) -- >>> writeWith n = Fold.rmapM Array.fromArrayStreamK (Array.writeChunks n) -- -- /Pre-release/ {-# INLINE_NORMAL writeWith #-} writeWith :: forall m a.
-- XXX Compare writeWith with fromStreamD which uses an array of streams
-- implementation. We can write this using writeChunks above if that is
-- faster. If writeWith is faster then we should use that to implement
-- fromStreamD.
--
-- XXX The realloc based implementation needs to make one extra copy if we use
-- shrinkToFit. On the other hand, the stream of arrays implementation may
-- buffer the array chunk pointers in memory but it does not have to shrink as
-- we know the exact size in the end. However, memory copying does not seem to
-- be as expensive as the allocations. Therefore, we need to reduce the number
-- of allocations instead. Also, the size of allocations matters; right sizing
-- an allocation even at the cost of copying seems to help. Should be measured
-- on a big stream with heavy calls to toArray to see the effect.
--
-- XXX check if GHC's memory allocator is efficient enough. We can try the C
-- malloc to compare against.

-- | @writeWith minCount@ folds the whole input to a single array. The array
-- starts at a size big enough to hold @minCount@ elements; the size is
-- doubled every time the array needs to be grown.
--
-- /Caution! Do not use this on infinite streams./
--
-- >>> f n = Array.appendWith (* 2) (Array.newArray n)
-- >>> writeWith n = Fold.rmapM Array.rightSize (f n)
-- >>> writeWith n = Fold.rmapM Array.fromArrayStreamK (Array.writeChunks n)
--
-- /Pre-release/
{-# INLINE_NORMAL writeWith #-}
writeWith :: forall m a. (MonadIO m, Storable a)
    => Int -> Fold m a (Array a)
-- writeWith n = FL.rmapM rightSize $ appendWith (* 2) (newArray n)
writeWith elemCount = FL.rmapM extract $ FL.foldlM' step initial

    where

    insertElem (Array contents start end bound) x = do
        liftIO $ poke end x
        return $ Array contents start (PTR_NEXT(end,a)) bound

    initial = do
        when (elemCount < 0) $ error "writeWith: elemCount is negative"
        liftIO $ newArrayAligned (alignment (undefined :: a)) elemCount

    step arr@(Array _ start end bound) x
        | PTR_NEXT(end,a) > bound = do
            let oldSize = end `minusPtr` start
                newSize = max (oldSize * 2) 1
            arr1 <- liftIO $ reallocAligned
                        (SIZE_OF(a)) (alignment (undefined :: a)) newSize arr
            insertElem arr1 x
    step arr x = insertElem arr x

    extract = liftIO . rightSize

-- | Fold the whole input to a single array.
--
-- Same as 'writeWith' using an initial array size of 'arrayChunkBytes' bytes
-- rounded up to the element size.
--
-- /Caution! Do not use this on infinite streams./
--
-- @since 0.7.0
{-# INLINE write #-}
write :: forall m a. (MonadIO m, Storable a) => Fold m a (Array a)
write = writeWith (allocBytesToElemCount (undefined :: a) arrayChunkBytes)

-------------------------------------------------------------------------------
-- construct from streams, known size
-------------------------------------------------------------------------------

-- | Use the 'writeN' fold instead.
--
-- >>> fromStreamDN n = StreamD.fold (Array.writeN n)
--
{-# INLINE_NORMAL fromStreamDN #-}
fromStreamDN :: forall m a. (MonadIO m, Storable a)
    => Int -> D.Stream m a -> m (Array a)
-- fromStreamDN n = D.fold (writeN n)
fromStreamDN limit str = do
    arr <- liftIO $ newArray limit
    end <- D.foldlM' fwrite (return $ aEnd arr) $ D.take limit str
    return $ arr {aEnd = end}

    where

    fwrite ptr x = do
        liftIO $ poke ptr x
        return $ PTR_NEXT(ptr,a)

-- | Create an 'Array' from the first N elements of a list. The array is
-- allocated to size N; if the list terminates before N elements then the
-- array may hold fewer than N elements.
--
-- @since 0.7.0
{-# INLINABLE fromListN #-}
fromListN :: (MonadIO m, Storable a) => Int -> [a] -> m (Array a)
fromListN n xs = fromStreamDN n $ D.fromList xs

-------------------------------------------------------------------------------
-- convert stream to a single array
-------------------------------------------------------------------------------

{-# INLINE arrayStreamKLength #-}
arrayStreamKLength :: (Monad m, Storable a) => K.Stream m (Array a) -> m Int
arrayStreamKLength as = K.foldl' (+) 0 (K.map length as)

-- | Convert an array stream to an array. Note that this requires peak memory
-- that is double the size of the array stream.
--
{-# INLINE fromArrayStreamK #-}
fromArrayStreamK :: (Storable a, MonadIO m) =>
    K.Stream m (Array a) -> m (Array a)
fromArrayStreamK as = do
    len <- arrayStreamKLength as
    fromStreamDN len $ D.unfoldMany read $ D.fromStreamK as
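-- An illustrative sketch (not a checked doctest): 'write' grows the array as
-- needed, so it works when the stream length is not known up front (but must
-- not be used on infinite streams). It assumes the $setup imports; the helper
-- name and values are arbitrary.
--
-- @
-- writeExample :: IO Int
-- writeExample = do
--     arr <- Stream.fold Array.write (Stream.fromList [1..100 :: Int])
--     return (Array.length arr)  -- expected to yield 100
-- @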
-- CAUTION: a very large number (millions) of arrays can degrade performance
-- due to GC overhead because we need to buffer the arrays before we flatten
-- all the arrays.
--
-- XXX Compare if this is faster or "fold write".
--
-- | We could take the approach of doubling the memory allocation on each
-- overflow. This would result in more or less the same amount of copying as
-- in the chunking approach. However, if we have to shrink in the end then it
-- may result in an extra copy of the entire data.
--
-- >>> fromStreamD = StreamD.fold Array.write
--
{-# INLINE fromStreamD #-}
fromStreamD :: (MonadIO m, Storable a) => D.Stream m a -> m (Array a)
fromStreamD m = arrayStreamKFromStreamD m >>= fromArrayStreamK

-- | Create an 'Array' from a list. The list must be of finite size.
--
-- @since 0.7.0
{-# INLINE fromList #-}
fromList :: (MonadIO m, Storable a) => [a] -> m (Array a)
fromList xs = fromStreamD $ D.fromList xs

-------------------------------------------------------------------------------
-- Combining
-------------------------------------------------------------------------------

-- | Copy two arrays into a newly allocated array.
{-# INLINE spliceCopy #-}
spliceCopy :: (MonadIO m, Storable a) => Array a -> Array a -> m (Array a)
spliceCopy arr1 arr2 = do
    let src1 = arrStart arr1
        src2 = arrStart arr2
        len1 = aEnd arr1 `minusPtr` src1
        len2 = aEnd arr2 `minusPtr` src2
    arr <- liftIO $ newArray (len1 + len2)
    let dst = arrStart arr
    -- XXX Should we use copyMutableByteArray# instead? Is there an overhead
    -- to ccall?
    liftIO $ do
        memcpy (castPtr dst) (castPtr src1) len1
        touch (arrContents arr1)
        memcpy (castPtr (dst `plusPtr` len1)) (castPtr src2) len2
        touch (arrContents arr2)
    return arr { aEnd = dst `plusPtr` (len1 + len2) }

-- | Really really unsafe, appends the second array into the first array. If
-- the first array does not have enough space it may cause silent data
-- corruption or, if you are lucky, a segfault.
{-# INLINE spliceUnsafe #-}
spliceUnsafe :: MonadIO m => Array a -> (Array a, Int) -> m (Array a)
spliceUnsafe dst (src, srcLen) =
    liftIO $ do
        let psrc = arrStart src
        let pdst = aEnd dst
        assert (pdst `plusPtr` srcLen <= aBound dst) (return ())
        memcpy (castPtr pdst) (castPtr psrc) srcLen
        touch (arrContents src)
        touch (arrContents dst)
        return $ dst {aEnd = pdst `plusPtr` srcLen}

-- | @spliceWith sizer dst src@ mutates @dst@ to append @src@. If there is no
-- reserved space available in @dst@ it is reallocated to a size determined by
-- the @sizer dstBytes srcBytes@ function, where @dstBytes@ is the size of the
-- first array and @srcBytes@ is the size of the second array, in bytes.
--
-- Note that the returned array may be a mutated version of the first array.
--
-- /Pre-release/
{-# INLINE spliceWith #-}
spliceWith :: forall m a. (MonadIO m, Storable a) =>
    (Int -> Int -> Int) -> Array a -> Array a -> m (Array a)
spliceWith sizer dst@(Array _ start end bound) src = do
{-
    let f = appendWith (`sizer` byteLength src) (return dst)
     in D.fold f (toStreamD src)
-}
    assert (end <= bound) (return ())
    let srcLen = aEnd src `minusPtr` arrStart src
    dst1 <-
        if end `plusPtr` srcLen >= bound
        then do
            let oldSize = end `minusPtr` start
                newSize = sizer oldSize srcLen
            when (newSize < oldSize + srcLen)
                $ error
                    $ "splice: newSize is less than the total size "
                    ++ "of arrays being appended. Please check the "
                    ++ "newSize function passed."
            liftIO $ realloc newSize dst
        else return dst
    spliceUnsafe dst1 (src, srcLen)

-- | The first array is mutated to append the second array. If there is no
-- reserved space available in the first array a new allocation of exactly
-- the required size is done.
--
-- Note that the returned array may be a mutated version of the first array.
--
-- >>> splice = Array.spliceWith (+)
--
-- /Pre-release/
{-# INLINE splice #-}
splice :: (MonadIO m, Storable a) => Array a -> Array a -> m (Array a)
splice = spliceWith (+)
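-- An illustrative sketch (not a checked doctest): append one array to another
-- with 'splice'. The result may be a mutated version of the first array, so
-- the original reference should not be reused afterwards. It assumes the
-- $setup imports; the helper name and values are arbitrary.
--
-- @
-- spliceExample :: IO [Int]
-- spliceExample = do
--     a1 <- Array.fromList [1, 2, 3 :: Int]
--     a2 <- Array.fromList [4, 5, 6]
--     a3 <- Array.splice a1 a2
--     Array.toList a3  -- expected to yield [1,2,3,4,5,6]
-- @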
-- | Like 'append' but the growth of the array is exponential. Whenever a new
-- allocation is required the previous array size is at least doubled.
--
-- This is useful to reduce allocations when folding many arrays together.
--
-- Note that the returned array may be a mutated version of the first array.
--
-- >>> spliceExp = Array.spliceWith (\l1 l2 -> max (l1 * 2) (l1 + l2))
--
-- /Pre-release/
{-# INLINE spliceExp #-}
spliceExp :: (MonadIO m, Storable a) => Array a -> Array a -> m (Array a)
spliceExp = spliceWith (\l1 l2 -> max (l1 * 2) (l1 + l2))

-------------------------------------------------------------------------------
-- Splitting
-------------------------------------------------------------------------------

-- | Drops the separator byte.
{-# INLINE breakOn #-}
breakOn :: MonadIO m
    => Word8 -> Array Word8 -> m (Array Word8, Maybe (Array Word8))
breakOn sep arr@Array{..} = liftIO $ do
    let p = arrStart
    loc <- c_memchr p sep (fromIntegral $ aEnd `minusPtr` p)
    return $
        if loc == nullPtr
        then (arr, Nothing)
        else
            ( Array
                { arrContents = arrContents
                , arrStart = arrStart
                , aEnd = loc
                , aBound = loc
                }
            , Just $ Array
                { arrContents = arrContents
                , arrStart = arrStart `plusPtr` (loc `minusPtr` p + 1)
                , aEnd = aEnd
                , aBound = aBound
                }
            )

-- | Create two slices of an array without copying the original array. The
-- specified index @i@ is the first index of the second slice.
--
-- @since 0.7.0
splitAt :: forall a. Storable a => Int -> Array a -> (Array a, Array a)
splitAt i arr@Array{..} =
    let maxIndex = length arr - 1
    in  if i < 0
        then error "splitAt: negative array index"
        else if i > maxIndex
             then error $ "splitAt: specified array index " ++ show i
                        ++ " is beyond the maximum index " ++ show maxIndex
             else
                 let off = i * SIZE_OF(a)
                     p = arrStart `plusPtr` off
                  in ( Array
                         { arrContents = arrContents
                         , arrStart = arrStart
                         , aEnd = p
                         , aBound = p
                         }
                     , Array
                         { arrContents = arrContents
                         , arrStart = arrStart `plusPtr` off
                         , aEnd = aEnd
                         , aBound = aBound
                         }
                     )

-------------------------------------------------------------------------------
-- Casting
-------------------------------------------------------------------------------

-- | Cast an array having elements of type @a@ into an array having elements
-- of type @b@. The array size must be a multiple of the size of type @b@,
-- otherwise accessing the last element of the array may result in a crash or
-- a random value.
--
-- /Pre-release/
--
castUnsafe ::
#ifdef DEVBUILD
    Storable b =>
#endif
    Array a -> Array b
castUnsafe (Array contents start end bound) =
    Array contents (castPtr start) (castPtr end) (castPtr bound)

-- | Cast an @Array a@ into an @Array Word8@.
--
-- /Pre-release/
--
asBytes :: Array a -> Array Word8
asBytes = castUnsafe

-- | Cast an array having elements of type @a@ into an array having elements
-- of type @b@. The length of the array should be a multiple of the size of
-- the target element, otherwise 'Nothing' is returned.
--
-- /Pre-release/
--
cast :: forall a b. Storable b => Array a -> Maybe (Array b)
cast arr =
    let len = byteLength arr
        r = len `mod` SIZE_OF(b)
     in if r /= 0
        then Nothing
        else Just $ castUnsafe arr

-- | Use an @Array a@ as a @Ptr a@.
--
-- /Unsafe/
--
-- /Pre-release/
--
asPtrUnsafe :: MonadIO m => Array a -> (Ptr a -> m b) -> m b
asPtrUnsafe Array{..} f = do
    r <- f arrStart
    liftIO $ touch arrContents
    return r
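-- An illustrative sketch (not a checked doctest): reinterpret an array of
-- 'Word64' as bytes with 'asBytes' and cast it back with 'cast'. The byte
-- length of a 'Word64' array is always a multiple of 8, so the reverse cast
-- is expected to succeed. It assumes the $setup imports plus "Data.Word";
-- the helper name and values are arbitrary.
--
-- @
-- castExample :: IO Bool
-- castExample = do
--     arr <- Array.fromList [1, 2 :: Word64]
--     let bytes = Array.asBytes arr
--     case Array.cast bytes :: Maybe (Array.Array Word64) of
--         Just _  -> return True   -- 16 bytes is a multiple of 8
--         Nothing -> return False
-- @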
-------------------------------------------------------------------------------
-- Equality
-------------------------------------------------------------------------------

-- | Check whether the contents of two arrays are equal.
--
-- /Pre-release/
{-# INLINE cmp #-}
cmp :: MonadIO m => Array a -> Array a -> m Bool
cmp arr1 arr2 =
    liftIO $ do
        let ptr1 = arrStart arr1
        let ptr2 = arrStart arr2
        let len1 = aEnd arr1 `minusPtr` ptr1
        let len2 = aEnd arr2 `minusPtr` ptr2

        if len1 == len2
        then
            if ptr1 == ptr2
            then return True
            else do
                r <- memcmp (castPtr ptr1) (castPtr ptr2) len1
                touch (arrContents arr1)
                touch (arrContents arr2)
                return r
        else return False

-------------------------------------------------------------------------------
-- NFData
-------------------------------------------------------------------------------

-- This is a Storable array; it cannot hold unevaluated data, so rnf is just
-- a no-op.
instance NFData (Array a) where
    {-# INLINE rnf #-}
    rnf Array {} = ()
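-- An illustrative sketch (not a checked doctest): 'cmp' compares the array
-- contents byte by byte, so arrays built independently but holding the same
-- elements are expected to compare equal. It assumes the $setup imports; the
-- helper name and values are arbitrary.
--
-- @
-- cmpExample :: IO Bool
-- cmpExample = do
--     a1 <- Array.fromList [1, 2, 3 :: Int]
--     a2 <- Array.fromList [1, 2, 3 :: Int]
--     Array.cmp a1 a2  -- expected to yield True
-- @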