{-# LANGUAGE CPP #-} {-# LANGUAGE EmptyDataDecls #-} {-# LANGUAGE ForeignFunctionInterface #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeOperators #-} {-# LANGUAGE ViewPatterns #-} -- | -- Module : Data.Array.Accelerate.Math.FFT -- Copyright : [2012..2013] Manuel M T Chakravarty, Gabriele Keller, Trevor L. McDonell, Robert Clifton-Everest -- License : BSD3 -- -- Maintainer : Manuel M T Chakravarty -- Stability : experimental -- Portability : non-portable (GHC extensions) -- -- Computation of a Discrete Fourier Transform using the Cooley-Tuckey -- algorithm. The time complexity is O(n log n) in the size of the input. -- -- This uses a naïve divide-and-conquer algorithm whose absolute performance is -- appalling. -- module Data.Array.Accelerate.Math.FFT ( Mode(..), fft1D, fft1D', fft2D, fft2D', fft3D, fft3D', fft ) where import Prelude as P import Data.Array.Accelerate as A import Data.Array.Accelerate.Array.Sugar ( showShape ) import Data.Array.Accelerate.Math.Complex #ifdef ACCELERATE_CUDA_BACKEND import Data.Array.Accelerate.CUDA.Foreign import Data.Array.Accelerate.Array.Sugar as S ( shapeToList, shape, EltRepr ) import Data.Array.Accelerate.Type import Data.Functor import Foreign.CUDA.FFT import qualified Foreign.CUDA.Driver as CUDA hiding (free) #endif import Data.Bits data Mode = Forward | Reverse | Inverse deriving (Eq, Show) isPow2 :: Int -> Bool isPow2 x = x .&. (x-1) == 0 signOfMode :: Num a => Mode -> a signOfMode m = case m of Forward -> -1 Reverse -> 1 Inverse -> 1 -- Vector Transform -- ---------------- -- -- Discrete Fourier Transform of a vector. Array dimensions must be powers of -- two else error. -- fft1D :: (Elt e, IsFloating e) => Mode -> Vector (Complex e) -> Acc (Vector (Complex e)) fft1D mode vec = let Z :. len = arrayShape vec in fft1D' mode len (use vec) fft1D' :: forall e. (Elt e, IsFloating e) => Mode -> Int -> Acc (Vector (Complex e)) -> Acc (Vector (Complex e)) fft1D' mode len vec = let sign = signOfMode mode :: e scale = P.fromIntegral len #ifdef ACCELERATE_CUDA_BACKEND sh = (Z:.len) vec' = cudaFFT mode sh fft' vec #else vec' = fft' vec #endif fft' a = fft sign Z len a in if P.not (isPow2 len) then error $ unlines [ "Data.Array.Accelerate.FFT: fft1D" , " Array dimensions must be powers of two, but are: " ++ showShape (Z:.len) ] else case mode of Inverse -> A.map (/scale) vec' _ -> vec' -- Matrix Transform -- ---------------- -- -- Discrete Fourier Transform of a matrix. Array dimensions must be powers of -- two else error. -- fft2D :: (Elt e, IsFloating e) => Mode -> Array DIM2 (Complex e) -> Acc (Array DIM2 (Complex e)) fft2D mode arr = let Z :. height :. width = arrayShape arr in fft2D' mode width height (use arr) fft2D' :: forall e. (Elt e, IsFloating e) => Mode -> Int -- ^ width -> Int -- ^ height -> Acc (Array DIM2 (Complex e)) -> Acc (Array DIM2 (Complex e)) fft2D' mode width height arr = let sign = signOfMode mode :: e scale = P.fromIntegral (width * height) #ifdef ACCELERATE_CUDA_BACKEND sh = (Z:.width:.height) arr' = cudaFFT mode sh fft' arr #else arr' = fft' arr #endif fft' a = A.transpose . fft sign (Z:.width) height >-> A.transpose . fft sign (Z:.height) width $ a in if P.not (isPow2 width && isPow2 height) then error $ unlines [ "Data.Array.Accelerate.FFT: fft2D" , " Array dimensions must be powers of two, but are: " ++ showShape (Z:.height:.width) ] else case mode of Inverse -> A.map (/scale) arr' _ -> arr' -- Cube Transform -- -------------- -- -- Discrete Fourier Transform of a 3D array. Array dimensions must be power of -- two else error. -- fft3D :: (Elt e, IsFloating e) => Mode -> Array DIM3 (Complex e) -> Acc (Array DIM3 (Complex e)) fft3D mode arr = let Z :. depth :. height :. width = arrayShape arr in fft3D' mode width height depth (use arr) fft3D' :: forall e. (Elt e, IsFloating e) => Mode -> Int -- ^ width -> Int -- ^ height -> Int -- ^ depth -> Acc (Array DIM3 (Complex e)) -> Acc (Array DIM3 (Complex e)) fft3D' mode width height depth arr = let sign = signOfMode mode :: e scale = P.fromIntegral (width * height) #ifdef ACCELERATE_CUDA_BACKEND sh = (Z:.width:.height:.depth) arr' = cudaFFT mode sh fft' arr #else arr' = fft' arr #endif fft' a = rotate3D . fft sign (Z:.width :.depth) height >-> rotate3D . fft sign (Z:.height:.width) depth >-> rotate3D . fft sign (Z:.depth :.height) width $ a in if P.not (isPow2 width && isPow2 height && isPow2 depth) then error $ unlines [ "Data.Array.Accelerate.FFT: fft3D" , " Array dimensions must be powers of two, but are: " ++ showShape (Z:.depth:.height:.width) ] else case mode of Inverse -> A.map (/scale) arr' _ -> arr' rotate3D :: Elt e => Acc (Array DIM3 e) -> Acc (Array DIM3 e) rotate3D arr = backpermute (swap (A.shape arr)) swap arr where swap :: Exp DIM3 -> Exp DIM3 swap ix = let Z :. m :. k :. l = unlift ix :: Z :. Exp Int :. Exp Int :. Exp Int in lift $ Z :. k :. l :. m -- Rank-generalised Cooley-Tuckey DFT -- -- We require the innermost dimension be passed as a Haskell value because we -- can't do divide-and-conquer recursion directly in the meta-language. -- fft :: forall sh e. (Slice sh, Shape sh, IsFloating e, Elt e) => e -> sh -> Int -> Acc (Array (sh:.Int) (Complex e)) -> Acc (Array (sh:.Int) (Complex e)) fft sign sh sz arr = go sz 0 1 where go :: Int -> Int -> Int -> Acc (Array (sh:.Int) (Complex e)) go len offset stride | len == 2 = A.generate (constant (sh :. len)) swivel | otherwise = combine (go (len `div` 2) offset (stride * 2)) (go (len `div` 2) (offset + stride) (stride * 2)) where len' = the (unit (constant len)) offset' = the (unit (constant offset)) stride' = the (unit (constant stride)) swivel ix = let sh' :. sz' = unlift ix :: Exp sh :. Exp Int in sz' ==* 0 ? ( (arr ! lift (sh' :. offset')) + (arr ! lift (sh' :. offset' + stride')) {- ==* 1-} , (arr ! lift (sh' :. offset')) - (arr ! lift (sh' :. offset' + stride')) ) combine evens odds = let odds' = A.generate (A.shape odds) (\ix -> twiddle len' (indexHead ix) * odds!ix) in append (A.zipWith (+) evens odds') (A.zipWith (-) evens odds') twiddle n' i' = let n = A.fromIntegral n' i = A.fromIntegral i' k = 2*pi*i/n in lift ( cos k, A.constant sign * sin k ) #ifdef ACCELERATE_CUDA_BACKEND -- FFT using the CUFFT library to enable high performance for the CUDA backend of -- Accelerate. The implementation works on all arrays of rank less than or equal -- to 3. The result is un-normalised. -- cudaFFT :: forall e sh. (Shape sh, Elt e, IsFloating e) => Mode -> sh -> (Acc (Array sh (Complex e)) -> Acc (Array sh (Complex e))) -> Acc (Array sh (Complex e)) -> Acc (Array sh (Complex e)) cudaFFT mode sh p arr = deinterleave sh (foreignAcc ff pure (interleave arr)) where ff = cudaAcc foreignFFT -- Unfortunately the pure version of the function needs to be wrapped in -- interleave and deinterleave to match how the foreign version works. -- -- RCE: Do the interleaving and deinterleaving in foreignFFT -- -- TLM: The interleaving might get fused into other parts of the -- computation and thus be okay. We should really support multi types -- such as float2 instead. -- pure = interleave . p . deinterleave sh sign = signOfMode mode :: Int foreignFFT :: Array DIM1 e -> CIO (Array DIM1 e) foreignFFT arr' = do -- Create the plan -- TODO: Cache this. -- hndl <- liftIO $ case shapeToList sh of [width] -> plan1D width types 1 [height, width] -> plan2D height width types [depth, height, width] -> plan3D depth height width types _ -> error "Accelerate-fft cannot use CUFFT for arrays of dimensions higher than 3" output <- allocateArray (S.shape arr') iptr <- floatingDevicePtr arr' optr <- floatingDevicePtr output --Execute liftIO $ execute hndl iptr optr liftIO $ destroy hndl return output types = case (floatingType :: FloatingType e) of TypeFloat{} -> C2C TypeDouble{} -> Z2Z TypeCFloat{} -> C2C TypeCDouble{} -> Z2Z execute :: Handle -> CUDA.DevicePtr e -> CUDA.DevicePtr e -> IO () execute hndl iptr optr = case (floatingType :: FloatingType e) of TypeFloat{} -> execC2C hndl iptr optr sign TypeDouble{} -> execZ2Z hndl iptr optr sign TypeCFloat{} -> execC2C hndl (CUDA.castDevPtr iptr) (CUDA.castDevPtr optr) sign TypeCDouble{} -> execZ2Z hndl (CUDA.castDevPtr iptr) (CUDA.castDevPtr optr) sign floatingDevicePtr :: Vector e -> CIO (CUDA.DevicePtr e) floatingDevicePtr v = case (floatingType :: FloatingType e) of TypeFloat{} -> singleDevicePtr v TypeDouble{} -> singleDevicePtr v TypeCFloat{} -> CUDA.castDevPtr <$> singleDevicePtr v TypeCDouble{} -> CUDA.castDevPtr <$> singleDevicePtr v singleDevicePtr :: DevicePtrs (EltRepr e) ~ ((),CUDA.DevicePtr b) => Vector e -> CIO (CUDA.DevicePtr b) singleDevicePtr v = P.snd <$> devicePtrsOfArray v #endif -- Append two arrays. Doesn't do proper bounds checking or intersection... -- append :: forall sh e. (Slice sh, Shape sh, Elt e) => Acc (Array (sh:.Int) e) -> Acc (Array (sh:.Int) e) -> Acc (Array (sh:.Int) e) append xs ys = let sh :. n = unlift (A.shape xs) :: Exp sh :. Exp Int _ :. m = unlift (A.shape ys) :: Exp sh :. Exp Int in generate (lift (sh :. n+m)) (\ix -> let sz :. i = unlift ix :: Exp sh :. Exp Int in i <* n ? (xs ! lift (sz:.i), ys ! lift (sz:.i-n) )) #ifdef ACCELERATE_CUDA_BACKEND {-# RULES "interleave/deinterleave" forall sh x. deinterleave sh (interleave x) = x; "deinterleave/interleave" forall sh x. interleave (deinterleave sh x) = x #-} -- Interleave the real and imaginary components in a complex array and produce a -- flattened vector. This allows us to mimic the float2 structure used by CUFFT -- to store complex numbers. -- interleave :: (Shape sh, Elt e) => Acc (Array sh (Complex e)) -> Acc (Vector e) interleave arr = generate sh swizzle where sh = index1 (2 * A.size arr) swizzle ix = let i = indexHead ix v = arr A.!! (i `div` 2) in i `mod` 2 ==* 0 ? (real v, imag v) -- Deinterleave a vector into a complex array. Assumes the array is even in length. -- deinterleave :: (Shape sh, Elt e) => sh -> Acc (Vector e) -> Acc (Array sh (Complex e)) deinterleave (constant -> sh) arr = generate sh (\ix -> let i = toIndex sh ix * 2 in lift (arr A.!! i, arr A.!! (i+1))) #endif