{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ForeignFunctionInterface #-}
module Synthesizer.LLVM.Causal.Process (
   C(simple, replicateControlled),
   T,
   amplify,
   amplifyStereo,
   apply,
   applyFst,
   applySnd,
   applyConst,
   applyConstFst,
   applyConstSnd,
   (CausalClass.$<), (CausalClass.$>), (CausalClass.$*),
   ($<#), ($>#), ($*#),
   feedFst,
   feedSnd,
   feedConstFst,
   feedConstSnd,
   first,
   envelope,
   envelopeStereo,
   fromModifier,
   fromSignal,
   toSignal,
   loopConst,
   loopZero,
   delay1Zero,
   feedbackControlledZero,
   map,
   mapAccum,
   zipWith,
   mapProc,
   zipProcWith,
   mix,
   takeWhile,
   pipeline,
   stereoFromVector,
   vectorize,
   replaceChannel,
   arrayElement,
   element,
   osciCoreSync,
   osciCore,
   osci,
   shapeModOsci,
   skip,
   foldChunks,
   foldChunksPartial,
   frequencyModulation,
   interpolateConstant,
   quantizeLift,
   applyStorable,
   applyStorableChunky,
   runStorableChunky,
   ) where

import Synthesizer.LLVM.Causal.ProcessPrivate

import qualified Synthesizer.LLVM.Simple.SignalPrivate as Sig
import qualified Synthesizer.LLVM.Simple.Value as Value
import qualified Synthesizer.LLVM.Fold as Fold
import qualified Synthesizer.LLVM.Frame.Stereo as Stereo
import qualified Synthesizer.LLVM.Frame as Frame
import qualified Synthesizer.LLVM.Execution as Exec
import qualified Synthesizer.LLVM.ForeignPtr as ForeignPtr

import qualified Synthesizer.Plain.Modifier as Modifier
import qualified Synthesizer.Causal.Class as CausalClass

import qualified Data.StorableVector.Lazy as SVL
import qualified Data.StorableVector as SV
import qualified Data.StorableVector.Base as SVB

import qualified LLVM.Extra.Multi.Vector as MultiVector
import qualified LLVM.Extra.Multi.Value as MultiValue
import qualified LLVM.Extra.Control as C
import qualified LLVM.Extra.Class as Class
import qualified LLVM.Extra.Arithmetic as A
import qualified LLVM.Extra.ScalarOrVector as SoV
import qualified LLVM.Extra.MaybeContinuation as MaybeCont
import qualified LLVM.Extra.Maybe as Maybe
import qualified LLVM.Extra.Memory as Memory
import LLVM.Extra.Class (Undefined, MakeValueTuple, ValueTuple, )

import qualified LLVM.Core as LLVM
import LLVM.Util.Loop (Phi, )
import LLVM.Core
          (CodeGenFunction, ret, Value, valueOf,
           IsConst, IsFirstClass, IsArithmetic, IsPrimitive)

import qualified Type.Data.Num.Decimal as TypeNum
import Type.Base.Proxy (Proxy, )
import Type.Data.Num.Decimal (D2, (:<:), )

import qualified Control.Arrow    as Arr
import Control.Monad.Trans.State (runState, )
import Control.Arrow (arr, (<<<), (>>>), (&&&), )
import Control.Monad (liftM2, )
import Control.Applicative (liftA3, (<$>), )

import qualified Data.List as List
import Data.Tuple.HT (swap, )
import Data.Word (Word32, )

import qualified Foreign.Marshal.Utils as AllocUtil
import Foreign.Storable (Storable, )
import Foreign.ForeignPtr (ForeignPtr, withForeignPtr, )
import Foreign.Ptr (Ptr, )
import Control.Exception (bracket, )
import qualified System.Unsafe as Unsafe

import Prelude hiding (and, map, zip, zipWith, init, takeWhile, )



fromModifier ::
   (C process) =>
   (Value.Flatten ah, Value.Registers ah ~ al,
    Value.Flatten bh, Value.Registers bh ~ bl,
    Value.Flatten ch, Value.Registers ch ~ cl,
    Value.Flatten sh, Value.Registers sh ~ sl,
    Memory.C sl) =>
   Modifier.Simple sh ch ah bh -> process (cl,al) bl
fromModifier (Modifier.Simple initial step) =
   mapAccum
      (\(c,a) s ->
         Value.flatten $
         runState
            (step (Value.unfold c) (Value.unfold a))
            (Value.unfold s))
      (Value.flatten initial)


apply :: T a b -> Sig.T a -> Sig.T b
apply = CausalClass.apply

feedFst :: Sig.T a -> T b (a,b)
feedFst = CausalClass.feedFst

feedSnd :: Sig.T a -> T b (b,a)
feedSnd = CausalClass.feedSnd

feedConstFst ::
   (MakeValueTuple a, ValueTuple a ~ al) =>
   a -> T b (al,b)
feedConstFst = CausalClass.feedConstFst . Class.valueTupleOf

feedConstSnd ::
   (MakeValueTuple a, ValueTuple a ~ al) =>
   a -> T b (b,al)
feedConstSnd = CausalClass.feedConstSnd . Class.valueTupleOf


applyFst :: T (a,b) c -> Sig.T a -> T b c
applyFst = CausalClass.applyFst

applySnd :: T (a,b) c -> Sig.T b -> T a c
applySnd = CausalClass.applySnd

applyConst ::
   (MakeValueTuple a, ValueTuple a ~ al) =>
   T al b -> a -> Sig.T b
applyConst proc =
   CausalClass.applyConst proc . Class.valueTupleOf

applyConstFst ::
   (MakeValueTuple a, ValueTuple a ~ al) =>
   T (al,b) c -> a -> T b c
applyConstFst proc =
   CausalClass.applyConstFst proc . Class.valueTupleOf

applyConstSnd ::
   (MakeValueTuple b, ValueTuple b ~ bl) =>
   T (a,bl) c -> b -> T a c
applyConstSnd proc =
   CausalClass.applyConstSnd proc . Class.valueTupleOf


infixl 0 $<#, $>#, $*#

{- |
provide constant input in a comfortable way
-}
($*#) ::
   (C process, CausalClass.SignalOf process ~ signal,
    Storable ah, MakeValueTuple ah, ValueTuple ah ~ a,
    Memory.C a) =>
   process a b -> ah -> signal b
proc $*# x = CausalClass.applyConst proc $ Class.valueTupleOf x

($<#) ::
   (C process,
    Storable ah, MakeValueTuple ah, ValueTuple ah ~ a, Memory.C a) =>
   process (a,b) c -> ah -> process b c
proc $<# x = CausalClass.applyConstFst proc $ Class.valueTupleOf x

($>#) ::
   (C process,
    Storable bh, MakeValueTuple bh, ValueTuple bh ~ b, Memory.C b) =>
   process (a,b) c -> bh -> process a c
proc $># x = CausalClass.applyConstSnd proc $ Class.valueTupleOf x



{- |
You may also use '(+)'.
-}
mix ::
   (C process, A.Additive a) =>
   process (a, a) a
mix = zipWith Frame.mix


{- |
You may also use '(*)'.
-}
envelope ::
   (C process, A.PseudoRing a) =>
   process (a, a) a
envelope = zipWith Frame.amplifyMono

envelopeStereo ::
   (C process, A.PseudoRing a) =>
   process (a, Stereo.T a) (Stereo.T a)
envelopeStereo = zipWith Frame.amplifyStereo

amplify ::
   (C process, IsArithmetic a, IsConst a) =>
   a -> process (Value a) (Value a)
amplify x =
   map (Frame.amplifyMono (valueOf x))

amplifyStereo ::
   (C process, IsArithmetic a, IsConst a) =>
   a -> process (Stereo.T (Value a)) (Stereo.T (Value a))
amplifyStereo x =
   map (Frame.amplifyStereo (valueOf x))



loopConst ::
   (C process, Memory.C c) =>
   c -> process (a,c) (b,c) -> process a b
loopConst init =
   alter
      (\(Core next start stop) ->
          Core
             (loopNext next)
             (fmap ((,) init) . start)
             (stop . snd))

{- |
Like 'Synthesizer.LLVM.CausalParameterized.loop'
but uses zero as initial value
and it does not need a zero as Haskell value.
-}
loopZero ::
   (C process, A.Additive c, Memory.C c) =>
   process (a,c) (b,c) -> process a b
loopZero = loopConst A.zero

delay1Zero ::
   (C process, A.Additive a, Memory.C a) =>
   process a a
delay1Zero = loopZero (arr swap)


{- |
This allows to compute a chain of equal processes efficiently,
if all of these processes can be bundled in one vectorial process.
Applications are an allpass cascade or an FM operator cascade.

The function expects that the vectorial input process
works like parallel scalar processes.
The different pipeline stages may be controlled by different parameters,
but the structure of all pipeline stages must be equal.
Our function feeds the input of the pipelined process
to the zeroth element of the Vector.
The result of processing the i-th element (the i-th channel, so to speak)
is fed to the (i+1)-th element.
The (n-1)-th element of the vectorial process is emitted
as output of the pipelined process.

The pipeline necessarily introduces a delay of (n-1) values.
For simplification we extend this to n values delay.
If you need to combine the resulting signal from the pipeline
with another signal in a 'zip'-like way,
you may delay that signal with @pipeline id@.
The first input values in later stages of the pipeline
are initialized with zero.
If this is not appropriate for your application,
then we may add a more sensible initialization.
-}
pipeline ::
   (C process,
    TypeNum.Positive n, MultiVector.C x,
    v ~ MultiVector.T n x,
    a ~ MultiValue.T x,
    Class.Zero v, Memory.C v) =>
   process v v -> process a a
pipeline vectorProcess =
   loopConst MultiVector.zero $
      map (uncurry MultiVector.shiftUp)
      >>>
      Arr.second vectorProcess


feedbackControlledZero ::
   (C process, A.Additive c, Memory.C c) =>
   process ((ctrl,a),c) b -> process (ctrl,b) c -> process (ctrl,a) b
feedbackControlledZero forth back =
   loopZero (feedbackControlledAux forth back)


{-
In order to let this work we have to give the disable-mmx option somewhere,
but where?
-}
stereoFromVector ::
   (C process, IsPrimitive a, IsPrimitive b) =>
   process (Value (LLVM.Vector D2 a)) (Value (LLVM.Vector D2 b)) ->
   process (Stereo.T (Value a)) (Stereo.T (Value b))
stereoFromVector proc =
   map Frame.stereoFromVector <<<
   proc <<<
   map Frame.vectorFromStereo


{-
insert and extract instructions will be in opposite order,
no matter whether we use foldr or foldl
and independent from the order of proc and channel in replaceChannel.
However, LLVM neglects the order anyway.
-}
vectorize ::
   (C process,
    TypeNum.Positive n,
    MultiVector.C x, MultiValue.T x ~ a, MultiVector.T n x ~ va,
    MultiVector.C y, MultiValue.T y ~ b, MultiVector.T n y ~ vb) =>
   process a b -> process va vb
vectorize proc =
   withSize $ \n ->
      foldl
         (\acc i -> replaceChannel i proc acc)
         (arr (const $ Class.undefTuple)) $
      List.take (TypeNum.integralFromSingleton n) [0 ..]

withSize ::
   (TypeNum.Positive n, MultiVector.T n a ~ v) =>
   (TypeNum.Singleton n -> f v) ->
   f v
withSize f = f TypeNum.singleton

{- |
Given a vector process, replace the i-th output by output
that is generated by a scalar process from the i-th input.
-}
replaceChannel ::
   (C process,
    TypeNum.Positive n,
    MultiVector.C x, MultiValue.T x ~ a, MultiVector.T n x ~ va,
    MultiVector.C y, MultiValue.T y ~ b, MultiVector.T n y ~ vb) =>
   Int -> process a b -> process va vb -> process va vb
replaceChannel i channel proc =
   let li = valueOf $ fromIntegral i
   in  zipWith (MultiVector.insert li) <<<
          (channel <<< map (MultiVector.extract li)) &&&
          proc

{- |
Read the i-th element from each array.
-}
arrayElement ::
   (C process, IsFirstClass a,
    TypeNum.Natural index, TypeNum.Natural dim,
    index :<: dim) =>
   Proxy index -> process (Value (LLVM.Array dim a)) (Value a)
arrayElement i =
   map (\array -> LLVM.extractvalue array i)

{- |
Read the i-th element from an aggregate type.
-}
element ::
   (C process, IsFirstClass a, LLVM.GetValue agg index,
    LLVM.ValueType agg index ~ a) =>
   index -> process (Value agg) (Value a)
element i =
   map (\array -> LLVM.extractvalue array i)



{- |
Compute the phases from phase distortions and frequencies.

It's like integrate but with wrap-around performed by @fraction@.
For FM synthesis we need also negative phase distortions,
thus we use 'A.addToPhase' which supports that.
-}
osciCore, _osciCore, osciCoreSync ::
   (C process, Memory.C t, A.Fraction t) =>
   process (t, t) (t)
_osciCore =
   zipWith A.addToPhase <<<
   Arr.second
      (mapAccum
         (\a s -> do
            b <- A.incPhase a s
            return (s,b))
         (return A.zero))

{-
This could be implemented using a generalized frequencyModulation,
however, osciCoreSync allows for negative phase differences.
-}
osciCoreSync =
   zipWith A.addToPhase <<<
   Arr.second
      (mapAccum
         (\a s -> do
            b <- A.incPhase a s
            return (b,b))
         (return A.zero))

osciCore =
   zipWith A.addToPhase <<<
   Arr.second (loopZero (arr snd &&& zipWith A.incPhase))

osci ::
   (C process, Memory.C t, A.Fraction t) =>
   (forall r. t -> CodeGenFunction r y) ->
   process (t, t) y
osci wave =
   map wave <<< osciCore

shapeModOsci ::
   (C process, Memory.C t, A.Fraction t) =>
   (forall r. c -> t -> CodeGenFunction r y) ->
   process (c, (t, t)) y
shapeModOsci wave =
   zipWith wave <<< Arr.second osciCore


{- |
Feeds a signal into a causal process while holding or skipping signal elements
according to the process input.
The skip happens after a value is passed from the fed signal.

@skip x $* 0@ repeats the first signal value in the output.
@skip x $* 1@ feeds the signal to the output as is.
@skip x $* 2@ feeds the signal to the output with double speed.
-}
skip ::
   (C process, CausalClass.SignalOf process ~ signal,
    Undefined a, Phi a, Memory.C a) =>
   signal a -> process (Value Word32) a
skip =
   alterSignal
      (\(Sig.Core next start stop) -> Core
         (\context n1 (yState0,n0) -> do
            yState1@(y,_) <-
               MaybeCont.fromMaybe $ fmap snd $
               MaybeCont.fixedLengthLoop n0 yState0 $
               next context . snd
            return (y, (yState1,n1)))
         (fmap (\s -> ((Class.undefTuple, s), A.one)) . start)
         (\((_y,state),_k) -> stop state))

{- |
The input of the process is a sequence of chunk sizes.
The signal is chopped into chunks of these sizes
and each chunk is folded using
the given initial value and the accumulation function.
A trailing incomplete chunk will be ignored.
-}
foldChunks ::
   (C process, CausalClass.SignalOf process ~ signal, Undefined b, Phi b) =>
   Fold.T a b -> signal a -> process (Value Word32) b
foldChunks (Fold.Cons accum initial) =
   alterSignal
      (\(Sig.Core next start stop) -> Core
         (\context n state ->
            MaybeCont.fromMaybe $ fmap snd $
            MaybeCont.fixedLengthLoop n (initial,state) $ \(b0,state0) -> do
               (a,state1) <- next context state0
               b1 <- MaybeCont.lift $ accum b0 a
               return (b1,state1))
         start
         stop)

{- |
Like 'foldChunks' but an incomplete chunk at the end
is treated like a complete one.
-}
foldChunksPartial ::
   (C process, CausalClass.SignalOf process ~ signal,
    Undefined a, Phi a, Undefined b, Phi b) =>
   Fold.T a b -> signal a -> process (Value Word32) b
foldChunksPartial (Fold.Cons accum initial) =
   alterSignal
      (\(Sig.Core next start stop) -> Core
         (\context n runState0 -> do
            ((i,b), runState1) <-
               MaybeCont.lift $
               C.whileLoopShared ((n, initial), runState0) $
                     \((i0,b0), (run,s0)) ->
                  (A.and run =<< A.cmp LLVM.CmpGT i0 A.zero,
                   do mas1 <- MaybeCont.toMaybe $ next context s0
                      Maybe.run mas1
                        (return ((i0,b0), (valueOf False, s0)))
                        (\(a,s1) -> do
                           b1 <- accum b0 a
                           i1 <- A.dec i0
                           return ((i1,b1), (valueOf True, s1))))
            MaybeCont.guard =<< MaybeCont.lift (A.cmp LLVM.CmpLT i n)
            return (b, runState1))
         (fmap ((,) (valueOf True)) . start)
         (stop . snd))

{-
It is quite similar to quantizeLift but the control is the reciprocal.
This is especially a problem since we need the fractional part for interpolation.
-}
frequencyModulation ::
   (C process, CausalClass.SignalOf process ~ signal,
    SoV.IntegerConstant a, LLVM.IsFloating a,
    LLVM.CmpRet a, LLVM.CmpResult a ~ Bool,
    Memory.FirstClass a, Memory.Stored a ~ am, LLVM.IsSized am,
    Undefined nodes, Phi nodes, Memory.C nodes) =>
   (forall r. Value a -> nodes -> CodeGenFunction r v) ->
   signal nodes -> process (Value a) v
frequencyModulation ip =
   alterSignal (\(Sig.Core next start stop) -> Core
      (\context k yState0 -> do
         ((nodes2,state2), ss2) <-
            MaybeCont.fromBool $
            C.whileLoop
               (valueOf True, yState0)
               (\(cont0, (_, ss0)) ->
                  LLVM.and cont0 =<< A.fcmp LLVM.FPOGE ss0 A.one)
               (\(_,((_,state0), ss0)) ->
                  MaybeCont.toBool $ liftM2 (,)
                     (next context state0)
                     (MaybeCont.lift $ A.sub ss0 A.one))

         MaybeCont.lift $ do
            y <- ip ss2 nodes2
            ss3 <- A.add ss2 k
            return (y, ((nodes2, state2), ss3)))
      (fmap (\sa -> ((Class.undefTuple, sa), A.one)) . start)
      (\((_y01,state),_ss) -> stop state))


{- |
Stretch signal in time by a time-varying factor.
-}
interpolateConstant ::
   (C process, CausalClass.SignalOf process ~ signal,
    Memory.C a,
    Memory.FirstClass b, Memory.Stored b ~ bm, LLVM.IsSized bm,
    SoV.IntegerConstant b,
    LLVM.IsFloating b, LLVM.CmpRet b, LLVM.CmpResult b ~ Bool) =>
   signal a -> process (Value b) a
interpolateConstant xs =
   quantizeLift (CausalClass.fromSignal xs) $># ()


quantizeLift ::
   (C process, Memory.C b,
    SoV.IntegerConstant c, LLVM.IsFloating c,
    LLVM.CmpRet c, LLVM.CmpResult c ~ Bool,
    Memory.FirstClass c, Memory.Stored c ~ cm, LLVM.IsSized cm) =>
   process a b ->
   process (Value c, a) b
quantizeLift = alter (\(Core next start stop) -> Core
   (\context (k, a0) yState0 -> do
      (yState1, frac1) <-
         MaybeCont.fromBool $
         C.whileLoop
            (LLVM.valueOf True, yState0)
            (\(cont1, (_, frac0)) ->
               LLVM.and cont1 =<< A.fcmp LLVM.FPOLE frac0 A.zero)
            (\(_,((_,state01), frac0)) ->
               MaybeCont.toBool $ liftM2 (,)
                  (next context a0 state01)
                  (MaybeCont.lift $ A.add frac0 k))

      frac2 <- MaybeCont.lift $ A.sub frac1 A.one
      return (fst yState1, (yState1, frac2)))
{- using this initialization code we would not need undefined values
   (do sa <- start
       (a,_) <- next sa
       return (sa, a, A.zero))
-}
   (\p -> do
      s <- start p
      return ((Class.undefTuple, s), A.zero))
   (\((_, s), _) -> stop s))



foreign import ccall safe "dynamic" derefFillPtr ::
   Exec.Importer
      (Ptr paramStruct -> Word32 -> Ptr aStruct -> Ptr bStruct -> IO Word32)


compile ::
   (Memory.C aValue, Memory.Struct aValue ~ aStruct,
    Memory.C bValue, Memory.Struct bValue ~ bStruct,
    Memory.C param, Memory.Struct param ~ paramStruct,
    Phi state, Undefined state) =>
   (forall r z. (Phi z) =>
    param -> local -> aValue -> state -> MaybeCont.T r z (bValue, state)) ->
   (forall r. CodeGenFunction r local) ->
   (forall r. param -> CodeGenFunction r state) ->
   IO (Ptr paramStruct -> Word32 -> Ptr aStruct -> Ptr bStruct -> IO Word32)
compile next alloca start =
   Exec.compileModule $
      Exec.createFunction derefFillPtr "fillprocessblock" $
         \ paramPtr size alPtr blPtr -> do
            param <- Memory.load paramPtr
            s <- start param
            local <- alloca
            (pos,_) <- MaybeCont.arrayLoop2 size alPtr blPtr s $
                  \ aPtri bPtri s0 -> do
               a <- MaybeCont.lift $ Memory.load aPtri
               (b,s1) <- next param local a s0
               MaybeCont.lift $ Memory.store b bPtri
               return s1
            ret pos


applyStorable ::
   (Storable a, MakeValueTuple a, ValueTuple a ~ valueA, Memory.C valueA,
    Storable b, MakeValueTuple b, ValueTuple b ~ valueB, Memory.C valueB) =>
   T valueA valueB -> SV.Vector a -> SV.Vector b
applyStorable proc = Unsafe.performIO $ runStorable proc

runStorable ::
   (Storable a, MakeValueTuple a, ValueTuple a ~ valueA, Memory.C valueA,
    Storable b, MakeValueTuple b, ValueTuple b ~ valueB, Memory.C valueB) =>
   T valueA valueB -> IO (SV.Vector a -> SV.Vector b)
runStorable proc = (Unsafe.performIO .) <$> runStorableIO proc

runStorableIO ::
   (Storable a, MakeValueTuple a, ValueTuple a ~ valueA, Memory.C valueA,
    Storable b, MakeValueTuple b, ValueTuple b ~ valueB, Memory.C valueB) =>
   T valueA valueB -> IO (SV.Vector a -> IO (SV.Vector b))
runStorableIO (Cons next alloca start createIOContext deleteIOContext) = do
   fill <- compile next alloca start
   return $ \as ->
      bracket createIOContext (deleteIOContext . fst) $ \ (_ioContext, params) ->
         SVB.withStartPtr as $ \ aPtr len ->
         SVB.createAndTrim len $ \ bPtr ->
         AllocUtil.with params $ \paramPtr ->
            fmap (fromIntegral :: Word32 -> Int) $
            fill
               (Memory.castTuplePtr paramPtr)
               (fromIntegral len)
               (Memory.castTuplePtr aPtr)
               (Memory.castTuplePtr bPtr)


foreign import ccall safe "dynamic" derefStartPtr ::
   Exec.Importer (Ptr b -> IO (Ptr a))

foreign import ccall safe "dynamic" derefStopPtr ::
   Exec.Importer (Ptr a -> IO ())

foreign import ccall safe "dynamic" derefChunkPtr ::
   Exec.Importer
      (Ptr paramStruct -> Ptr stateStruct -> Word32 ->
       Ptr aStruct -> Ptr bStruct -> IO Word32)


compileChunky ::
   (Memory.C aValue, Memory.Struct aValue ~ aStruct,
    Memory.C bValue, Memory.Struct bValue ~ bStruct,
    Memory.C param, Memory.Struct param ~ paramStruct,
    Memory.C state, Memory.Struct state ~ stateStruct) =>
   (forall r z. (Phi z) =>
    param -> local -> aValue -> state -> MaybeCont.T r z (bValue, state)) ->
   (forall r. CodeGenFunction r local) ->
   (forall r.
    param -> CodeGenFunction r state) ->
   IO (Ptr paramStruct -> IO (Ptr stateStruct),
       Exec.Finalizer stateStruct,
       Ptr paramStruct -> Ptr stateStruct ->
       Word32 -> Ptr aStruct -> Ptr bStruct -> IO Word32)
compileChunky next alloca start =
   Exec.compileModule $
      liftA3 (,,)
         (Exec.createFunction derefStartPtr "startprocess" $
          \paramPtr -> do
             pptr <- LLVM.malloc
             param <- Memory.load paramPtr
             flip Memory.store pptr =<< start param
             ret pptr)
         (Exec.createFinalizer derefStopPtr "stopprocess" $
          \ pptr -> LLVM.free pptr >> ret ())
         (Exec.createFunction derefChunkPtr "fillprocess" $
          \paramPtr sptr loopLen aPtr bPtr -> do
             sInit <- Memory.load sptr
             param <- Memory.load paramPtr
             local <- alloca
             (pos,sExit) <- MaybeCont.arrayLoop2 loopLen aPtr bPtr sInit $
                   \ aPtri bPtri s0 -> do
                a <- MaybeCont.lift $ Memory.load aPtri
                (b,s1) <- next param local a s0
                MaybeCont.lift $ Memory.store b bPtri
                return s1
             Memory.store (Maybe.fromJust sExit) sptr
             ret pos)


traverseChunks ::
   (ValueTuple a ~ aValue, Memory.C aValue, Memory.Struct aValue ~ aStruct,
    ValueTuple b ~ bValue, Memory.C bValue, Memory.Struct bValue ~ bStruct,
    ValueTuple parameters ~ paramValue,
    Memory.C paramValue, Memory.Struct paramValue ~ paramStruct,
    Storable a, MakeValueTuple a,
    Storable b, MakeValueTuple b,
    Storable parameters, MakeValueTuple parameters) =>
   (Ptr paramStruct -> Ptr stateStruct ->
    Word32 -> Ptr aStruct -> Ptr bStruct -> IO Word32) ->
   ForeignPtr parameters ->
   ForeignPtr stateStruct ->
   SVL.Vector a -> IO [SVB.Vector b]
traverseChunks fill paramFPtr statePtr =
   let go xt =
          Unsafe.interleaveIO $
          case xt of
             [] -> return []
             x:xs -> SVB.withStartPtr x $ \aPtr size -> do
                v <-
                   ForeignPtr.with paramFPtr $ \paramPtr ->
                   withForeignPtr statePtr $ \sptr ->
                   SVB.createAndTrim size $
                      fmap (fromIntegral :: Word32 -> Int) .
                      fill paramPtr sptr (fromIntegral size)
                         (Memory.castTuplePtr aPtr) .
                      Memory.castTuplePtr
                (if SV.length v > 0
                   then fmap (v:)
                   else id) $
                   (if SV.length v < size
                      then return []
                      else go xs)
   in  go . SVL.chunks


runStorableChunky ::
   (Storable a, MakeValueTuple a, ValueTuple a ~ valueA, Memory.C valueA,
    Storable b, MakeValueTuple b, ValueTuple b ~ valueB, Memory.C valueB) =>
   T valueA valueB -> IO (SVL.Vector a -> SVL.Vector b)
runStorableChunky (Cons next alloca start createIOContext deleteIOContext) = do
   (startFunc, stopFunc, fill) <- compileChunky next alloca start
   return $ \sig -> SVL.fromChunks $ Unsafe.performIO $ do
      (ioContext, params) <- createIOContext
      paramPtr <- ForeignPtr.new (deleteIOContext ioContext) params
      statePtr <-
         ForeignPtr.newInit stopFunc (ForeignPtr.with paramPtr startFunc)
      traverseChunks fill paramPtr statePtr sig


applyStorableChunky ::
   (Storable a, MakeValueTuple a, ValueTuple a ~ valueA, Memory.C valueA,
    Storable b, MakeValueTuple b, ValueTuple b ~ valueB, Memory.C valueB) =>
   T valueA valueB -> SVL.Vector a -> SVL.Vector b
applyStorableChunky = Unsafe.performIO . runStorableChunky