{-# OPTIONS_GHC -funbox-strict-fields -ddump-simpl -ddump-asm -O #-} {-# LANGUAGE ExistentialQuantification #-} {- -dverbose-core2core -ddump-simpl-stats -} -- I use the dump options only in the main module and not in Cabal -- in order to get only code for the main module and not all modules {- This module demonstrates the following: mainMonolithic1Generator performs the same computation as mainMonolithic1Compose but the former is more than two times slower than latter. This is serious since in more complex signal processing programs this factor seems to multiply. I assume that the problem is that 'mixGen' is not inlined. Instead GHC seems to have decided to specialise mixGen. In contrast to mainMonolithic1Compose, mainMonolithic1Generator uses a data type with existential quantification. But this alone is not the problem, since mainMonolithic0 and mainMonolithic0Generator run with the same speed. The program can be compiled using > ghc -package storablevector-0.2.5 -O speedtest/SpeedTestChorus.hs Exporting only 'main' causes warnings about unused functions, but it also reduces the core output to a third. -} module Main (main) where import qualified Data.StorableVector.Lazy.Builder as Builder import qualified Data.StorableVector.ST.Strict as SVSTS import qualified Data.StorableVector.ST.Lazy as SVSTL import qualified Data.StorableVector as SV import qualified Data.StorableVector.Lazy as SVL -- import qualified Data.StorableVector.Private as SVP import qualified Control.Monad.ST.Strict as StrictST import Control.Monad.ST.Lazy (ST, runST, strictToLazyST, ) import Foreign.Storable (Storable, ) import GHC.Float (float2Int, int2Float, double2Int, int2Double, ) import qualified Sound.Frame.Stereo as Stereo -- import qualified Data.Strictness.HT as Strict import Control.Monad (guard, zipWithM, ) import Data.Monoid (mempty, mappend, ) {- GHC-6.10.4: I started with Storable instance for pairs from storable-tuple, that was implemented using the storable-record framework at this time. I got run-time around 5 seconds. When I used inlining then the computation time increased to 8s! Then I switch to sample-frame:Sound.Frame.Stereo computation time dropped to 1.4 seconds. At this time I already switched back from the storable-record based implementation to a custom one of the Storable Stereo instance. With this implementation inlining doesn't change the run-time. But then I noted that the generated file contained only one saw wave tone. This problem disappeared by not using -O2 option, but only -O. Monolithic and chunky require about 2.6 seconds, whereas monolithicStrict needs 3.8 seconds. After inlining monolithicStrict needs 1.8 seconds. -} type Phase = (Float, Float, Float) {-# INLINE saw #-} saw :: Num a => a -> a saw t = 1-2*t {-# INLINE sawChorus #-} sawChorus :: Phase -> Float sawChorus (pl0,pl1,pl2) = 0.3 * (saw pl0 + saw pl1 + saw pl2) {- Much faster than @snd . properFraction@ but fails for large numbers. -} class (Num a, Ord a) => Fraction a where fraction :: a -> a instance Fraction Float where {-# INLINE fraction #-} fraction x = x - int2Float (float2Int x) instance Fraction Double where {-# INLINE fraction #-} fraction x = x - int2Double (double2Int x) {- fraction = Strict.arguments1 $ \x -> let y = x - int2Float (float2Int x) in y -} {- in if y<0 then y+1 else y -} {- if x==0 then 0 else x - int2Float (float2Int x) -} -- rnf x `seq` x - int2Float (float2Int x) {-# INLINE generator0Freq #-} generator0Freq :: Fraction a => a -> a -> Maybe (a, a) generator0Freq freq = \p -> Just (saw p, fraction (p+freq)) {-# INLINE generator0 #-} generator0 :: Float -> Maybe (Float, Float) generator0 = generator0Freq 0.01 {-# INLINE tone0 #-} tone0 :: Float -> Float -> SVL.Vector Float tone0 freq phase = SVL.unfoldr SVL.defaultChunkSize (generator0Freq freq) phase {- Here we let storablevector functions check whether we reached the end of the vector. However, 'SVSTS.maybeWrite' also checks whether the index is non-negative, which is unnecessary in our case. -} {-# INLINE runLoopSTStrictSafe #-} runLoopSTStrictSafe :: (Storable a) => Int -> (s -> Maybe (a, s)) -> s -> SV.Vector a runLoopSTStrictSafe n f s = SVSTS.runSTVector (do v <- SVSTS.new_ n let go i s0 = case f s0 of Nothing -> return v Just (a,s1) -> SVSTS.maybeWrite v i a >>= \cont -> if cont then go (succ i) s1 else return v go 0 s) {-# INLINE runLoopSTStrict #-} runLoopSTStrict :: (Storable a) => Int -> (s -> Maybe (a, s)) -> s -> SV.Vector a runLoopSTStrict n f s = SVSTS.runSTVector (do v <- SVSTS.new_ n let go i s0 = case guard (i> f s0 of Nothing -> return v Just (a,s1) -> SVSTS.unsafeWrite v i a >> go (succ i) s1 go 0 s) {-# INLINE runLoopSTLazy #-} runLoopSTLazy :: (Storable a) => Int -> (s -> Maybe (a, s)) -> s -> SV.Vector a runLoopSTLazy n f s = SVSTL.runSTVector (do v <- SVSTL.new_ n let go s0 i = case guard (i> f s0 of Nothing -> return v Just (a,s1) -> strictToLazyST (SVSTS.unsafeWrite v i a >> return (succ i)) >>= go s1 {- Strict pattern matching on () is necessary in order to avoid a memory leak. Working in ST.Lazy is still three times slower than ST.Strict -} -- SVSTL.unsafeWrite v i a >>= \() -> go s1 (succ i) -- SVSTL.unsafeWrite v i a >> go s1 (succ i) go s 0) {-# INLINE mixST #-} mixST :: (Storable a, Num a) => SVSTS.Vector s a -> (st -> Maybe (a, st)) -> st -> StrictST.ST s Int mixST v f s = let go i s0 = if i < SVSTS.length v then case f s0 of Nothing -> return i Just (a,s1) -> SVSTS.unsafeModify v i (a+) >> go (succ i) s1 else return i in go 0 s {-# INLINE mixSTGuard #-} mixSTGuard :: (Storable a, Num a) => SVSTS.Vector s a -> (st -> Maybe (a, st)) -> st -> StrictST.ST s Int mixSTGuard v f s = let go i s0 = case guard (i < SVSTS.length v) >> f s0 of Nothing -> return i Just (a,s1) -> SVSTS.unsafeModify v i (a+) >> go (succ i) s1 in go 0 s {- It seems that mixSTVectorFoldr is essentially slower than mixSTVectorIndex. The former one should be faster, since 'foldr' uses direct pointer into the source vector. -} {-# INLINE mixSTVectorIndex #-} mixSTVectorIndex :: (Storable a, Num a) => SVSTS.Vector s a -> SV.Vector a -> StrictST.ST s Int mixSTVectorIndex dst src = let end = min (SVSTS.length dst) (SV.length src) go i = if i >= end then return i else SVSTS.unsafeModify dst i (SV.index src i +) >> go (succ i) in go 0 {-# INLINE mixSTVectorFoldr #-} mixSTVectorFoldr :: (Storable a, Num a) => SVSTS.Vector s a -> SV.Vector a -> StrictST.ST s Int mixSTVectorFoldr dst src = SV.foldr (\x go i -> if i >= SVSTS.length dst then return i else SVSTS.unsafeModify dst i (x +) >> go (succ i)) return src 0 {-# INLINE runBuilder #-} runBuilder :: (Storable a) => SVL.ChunkSize -> (s -> Maybe (a, s)) -> s -> SVL.Vector a runBuilder chunkSize f s = Builder.toLazyStorableVector chunkSize (let go s0 = case f s0 of Nothing -> mempty Just (a,s1) -> mappend (Builder.put a) (go s1) in go s) infixl 6 `mix`, `mixGen`, `mixVec` {- | Build a generator from two other generators by handling their state in parallel and mix their results. -} {-# INLINE mix #-} mix :: (Num y) => (s -> Maybe (y, s)) -> (t -> Maybe (y, t)) -> ((s,t) -> Maybe (y, (s,t))) mix f g (s0,t0) = do (a,s1) <- f s0 (b,t1) <- g t0 return ((a+b), (s1,t1)) {- | This is like a list without storage. It is like stream-fusion:Data.Stream but without Skip constructor. -} data Generator a = forall s. Generator (s -> Maybe (a, s)) s {-# INLINE runGeneratorMonolithic #-} runGeneratorMonolithic :: Storable a => Int -> Generator a -> SV.Vector a runGeneratorMonolithic size (Generator f s) = fst $ SV.unfoldrN size f s {- SPECIALISE INLINE generator0Gen :: Float -> Float -> Generator Float -} {-# INLINE generator0Gen #-} generator0Gen :: Fraction a => a -> a -> Generator a generator0Gen freq phase = Generator (\p -> Just (saw p, fraction (p+freq))) phase {- SPECIALISE INLINE mixGen :: Generator Float -> Generator Float -> Generator Float -} {-# INLINE mixGen #-} mixGen :: (Num y) => Generator y -> Generator y -> Generator y mixGen (Generator f s) (Generator g t) = Generator (\(s0,t0) -> do (a,s1) <- f s0 (b,t1) <- g t0 return ((a+b), (s1,t1))) (s,t) {-# INLINE incPhase #-} incPhase :: Phase -> Phase -> Phase incPhase (d0,d1,d2) (p0,p1,p2) = (fraction (p0+d0), fraction (p1+d1), fraction (p2+d2)) {-# INLINE generator1 #-} generator1 :: Phase -> Maybe (Float, Phase) generator1 = \p -> Just (sawChorus p, incPhase dl p) {-# SPECIALISE mixVec :: SVL.Vector Float -> SVL.Vector Float -> SVL.Vector Float #-} {- disabled INLINE mixVec -} mixVec :: (Num y, Storable y) => SVL.Vector y -> SVL.Vector y -> SVL.Vector y mixVec xs0 ys0 = let recourse xt@(x:_) yt@(y:_) = let z = SV.zipWith (+) x y n = SV.length z in z : recourse (SVL.chunks $ SVL.drop n $ SVL.fromChunks xt) (SVL.chunks $ SVL.drop n $ SVL.fromChunks yt) recourse xs [] = xs recourse [] ys = ys in SVL.fromChunks $ recourse (SVL.chunks xs0) (SVL.chunks ys0) {-# INLINE generator2 #-} generator2 :: (Phase, Phase) -> Maybe (Stereo.T Float, (Phase, Phase)) generator2 = \(pl, pr) -> Just (Stereo.cons (sawChorus pl) (sawChorus pr), (incPhase dl pl, incPhase dr pr)) {-# INLINE dl #-} {-# INLINE dr #-} dl, dr :: Phase (dl,dr) = ((0.01008, 0.01003, 0.00990), (0.00992, 0.00997, 0.01010)) {-# INLINE initPhase2 #-} initPhase2 :: (Phase, Phase) initPhase2 = ((0,0.7,0.1), (0.3,0.4,0.6)) size :: Int size = 10000000 mainSumFoldl :: IO () mainSumFoldl = print $ SV.foldl (\s x -> s+x+13) 23 (SV.replicate size (42::Int)) {- stack overflow -} mainSumFoldl' :: IO () mainSumFoldl' = print $ SV.foldl' (\s x -> s+x+13) 23 (SV.replicate size (42::Int)) {- GHC-6.12.1: real 0m0.171s user 0m0.112s sys 0m0.056s -} mainSumFoldr :: IO () mainSumFoldr = print $ SV.foldr (\x go s -> go $! s+x+13) id (SV.replicate size (42::Int)) $! 23 {- GHC-6.12.1: real 0m0.503s user 0m0.464s sys 0m0.036s -} mainMonolithic0 :: IO () mainMonolithic0 = SV.writeFile "speed.f32" (fst $ SV.unfoldrN size generator0 0) {- GHC-6.10.4: real 0m0.423s user 0m0.256s sys 0m0.152s GHC-6.12.1: real 0m0.392s user 0m0.252s sys 0m0.140s -} mainMonolithic0Generator :: IO () mainMonolithic0Generator = SV.writeFile "speed.f32" (runGeneratorMonolithic size (generator0Gen (0.01::Float) 0)) {- GHC-6.12.1: real 0m0.413s user 0m0.240s sys 0m0.172s -} mainMonolithic0STStrict :: IO () mainMonolithic0STStrict = SV.writeFile "speed.f32" (runLoopSTStrict size (generator0Freq (0.01::Float)) 0) {- GHC-6.10.4: real 0m0.430s user 0m0.288s sys 0m0.132s GHC-6.12.1: real 0m0.447s user 0m0.276s sys 0m0.168s -} mainMonolithic0STLazy :: IO () mainMonolithic0STLazy = SV.writeFile "speed.f32" (runLoopSTLazy size (generator0Freq (0.01::Float)) 0) {- GHC-6.10.4: real 0m0.886s user 0m0.752s sys 0m0.128s GHC-6.12.1: real 0m0.763s user 0m0.620s sys 0m0.144s -} mainMonolithic0STMix :: IO () mainMonolithic0STMix = SV.writeFile "speed.f32" $ StrictST.runST (do v <- SVSTS.new size 0 l <- mixSTGuard v (generator0Freq (0.01::Float)) 0 fmap (SV.take l) (SVSTS.unsafeFreeze v)) {- GHC-6.10.4: real 0m0.505s user 0m0.344s sys 0m0.156s GHC-6.12.1: real 0m0.475s user 0m0.344s sys 0m0.128s -} mainMonolithic1 :: IO () mainMonolithic1 = SV.writeFile "speed.f32" (fst $ SV.unfoldrN size generator1 (fst initPhase2)) {- GHC-6.12.1: real 0m0.973s user 0m0.824s sys 0m0.140s -} mainMonolithic1Composed :: IO () mainMonolithic1Composed = SV.writeFile "speed.f32" (fst $ SV.unfoldrN size (let (f0,f1,f2) = dl in generator0Freq f0 `mix` generator0Freq f1 `mix` generator0Freq f2) (let (p0,p1,p2) = fst initPhase2 in ((p0,p1),p2))) {- GHC-6.10.4: real 0m0.974s user 0m0.812s sys 0m0.160s GHC-6.12.1: real 0m0.940s user 0m0.800s sys 0m0.132s -} mainMonolithic1Generator :: IO () mainMonolithic1Generator = SV.writeFile "speed.f32" (runGeneratorMonolithic size (let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 in generator0Gen f0 p0 `mixGen` generator0Gen f1 p1 `mixGen` generator0Gen f2 p2)) {- GHC-6.10.4: real 0m2.244s user 0m2.084s sys 0m0.152s GHC-6.12.1: real 0m2.256s user 0m2.084s sys 0m0.172s -} mainMonolithic1GeneratorFold :: IO () mainMonolithic1GeneratorFold = SV.writeFile "speed.f32" (runGeneratorMonolithic size (let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 in foldl1 mixGen $ map (uncurry generator0Gen) $ [(f0,p0), (f1,p1), (f2,p2)])) {- GHC-6.10.4: real 0m3.006s user 0m2.816s sys 0m0.180s GHC-6.12.1: real 0m3.050s user 0m2.884s sys 0m0.160s -} mainMonolithic1STMix :: IO () mainMonolithic1STMix = SV.writeFile "speed.f32" $ StrictST.runST (do v <- SVSTS.new size 0 let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 l0 <- mixSTGuard v (generator0Freq f0) p0 l1 <- mixSTGuard v (generator0Freq f1) p1 l2 <- mixSTGuard v (generator0Freq f2) p2 fmap (SV.take (l0 `min` l1 `min` l2)) (SVSTS.unsafeFreeze v)) {- GHC-6.10.4: real 0m1.895s user 0m1.684s sys 0m0.180s GHC-6.12.1: real 0m1.932s user 0m1.764s sys 0m0.168s -} mainMonolithic1STMixZip :: IO () mainMonolithic1STMixZip = SV.writeFile "speed.f32" $ StrictST.runST (do v <- SVSTS.new size 0 let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 ls <- zipWithM (mixSTGuard v . generator0Freq) [f0,f1,f2] [p0,p1,p2] fmap (SV.take (minimum ls)) (SVSTS.unsafeFreeze v)) {- GHC-6.10.4: real 0m1.391s user 0m1.232s sys 0m0.160s GHC-6.12.1: real 0m1.560s user 0m1.404s sys 0m0.136s -} mainMonolithic1STMixVector :: IO () mainMonolithic1STMixVector = SV.writeFile "speed.f32" $ StrictST.runST (do v <- SVSTS.new size 0 let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 osci f p = fst $ SV.unfoldrN size (generator0Freq f) p l0 <- mixSTVectorIndex v (osci f0 p0) l1 <- mixSTVectorIndex v (osci f1 p1) l2 <- mixSTVectorIndex v (osci f2 p2) fmap (SV.take (l0 `min` l1 `min` l2)) (SVSTS.unsafeFreeze v)) {- GHC-6.12.1: real 0m1.751s user 0m1.544s sys 0m0.208s -} mainMonolithic1STMixVectorZipFoldr :: IO () mainMonolithic1STMixVectorZipFoldr = SV.writeFile "speed.f32" $ StrictST.runST (do v <- SVSTS.new size 0 let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 vs = zipWith (\f p -> fst $ SV.unfoldrN size (generator0Freq f) p) [f0,f1,f2] [p0,p1,p2] ls <- mapM (mixSTVectorFoldr v) vs fmap (SV.take (minimum ls)) (SVSTS.unsafeFreeze v)) {- GHC-6.12.1: real 0m3.046s user 0m2.828s sys 0m0.216s -} mainMonolithic1STMixVectorZipIndex :: IO () mainMonolithic1STMixVectorZipIndex = SV.writeFile "speed.f32" $ StrictST.runST (do v <- SVSTS.new size 0 let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 vs = zipWith (\f p -> fst $ SV.unfoldrN size (generator0Freq f) p) [f0,f1,f2] [p0,p1,p2] ls <- mapM (mixSTVectorIndex v) vs fmap (SV.take (minimum ls)) (SVSTS.unsafeFreeze v)) {- GHC-6.12.1: real 0m1.782s user 0m1.532s sys 0m0.220s -} mainMonolithic2 :: IO () mainMonolithic2 = SV.writeFile "speed.f32" (fst $ SV.unfoldrN size generator2 initPhase2) {- GHC-6.12.1: real 0m1.852s user 0m1.588s sys 0m0.252s -} {- mainMonolithicStrict2 :: IO () mainMonolithicStrict2 = SV.writeFile "speed.f32" (fst $ SVP.unfoldrStrictN size generator2 initPhase2) mainMonolithicTransition2 :: IO () mainMonolithicTransition2 = SV.writeFile "speed.f32" (fst $ SVP.unfoldrTransitionN size (\(pl,pr) -> (incPhase dl pl, incPhase dr pr)) (\(pl,pr) -> Just (Stereo.cons (sawChorus pl) (sawChorus pr))) initPhase2) -} mainChunky0 :: IO () mainChunky0 = SVL.writeFile "speed.f32" (SVL.take size $ SVL.unfoldr SVL.defaultChunkSize generator0 0) {- GHC-6.10.4: real 0m0.428s user 0m0.292s sys 0m0.132s GHC-6.12.1: real 0m0.424s user 0m0.252s sys 0m0.168s -} mainChunky0Builder :: IO () mainChunky0Builder = SVL.writeFile "speed.f32" (SVL.take size $ runBuilder SVL.defaultChunkSize generator0 0) {- GHC-6.10.4: real 0m1.107s user 0m0.968s sys 0m0.140s GHC-6.12.1: real 0m1.079s user 0m0.936s sys 0m0.136s -} mainChunky1 :: IO () mainChunky1 = SVL.writeFile "speed.f32" (SVL.take size $ SVL.unfoldr SVL.defaultChunkSize generator1 (fst initPhase2)) {- GHC-6.10.4: real 0m0.938s user 0m0.812s sys 0m0.116s GHC-6.12.1: real 0m0.945s user 0m0.788s sys 0m0.152s -} mainChunky1MixFlat :: IO () mainChunky1MixFlat = SVL.writeFile "speed.f32" (let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 in SVL.take size $ tone0 f0 p0 `mixVec` tone0 f1 p1 `mixVec` tone0 f2 p2) {- GHC-6.10.4: real 0m3.932s user 0m2.112s sys 0m0.156s GHC-6.12.1: real 0m2.264s user 0m2.144s sys 0m0.116s -} mainChunky1MixFold :: IO () mainChunky1MixFold = SVL.writeFile "speed.f32" (let (f0,f1,f2) = dl (p0,p1,p2) = fst initPhase2 in SVL.take size $ foldl1 mixVec $ map (uncurry tone0) $ [(f0,p0), (f1,p1), (f2,p2)]) {- GHC-6.10.4: real 0m1.611s user 0m1.476s sys 0m0.108s GHC-6.12.1: real 0m1.555s user 0m1.416s sys 0m0.136s -} mainChunky2 :: IO () mainChunky2 = SVL.writeFile "speed.f32" (SVL.take size $ SVL.unfoldr SVL.defaultChunkSize generator2 initPhase2) {- GHC-6.10.4: real 0m2.220s user 0m1.400s sys 0m0.192s GHC-6.12.1: real 0m1.877s user 0m1.652s sys 0m0.216s -} main :: IO () main = mainSumFoldl'