{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Data.Array.Accelerate.LLVM.Native.Execute (
executeAcc, executeAfun1,
) where
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Array.Sugar
import Data.Array.Accelerate.Analysis.Match
import Data.Array.Accelerate.LLVM.Analysis.Match
import Data.Array.Accelerate.LLVM.Execute
import Data.Array.Accelerate.LLVM.State
import Data.Array.Accelerate.LLVM.Native.Array.Data
import Data.Array.Accelerate.LLVM.Native.Compile
import Data.Array.Accelerate.LLVM.Native.Execute.Async
import Data.Array.Accelerate.LLVM.Native.Execute.Environment
import Data.Array.Accelerate.LLVM.Native.Execute.Marshal
import Data.Array.Accelerate.LLVM.Native.Target
import qualified Data.Array.Accelerate.LLVM.Native.Debug as Debug
import Data.Range.Range ( Range(..) )
import Control.Parallel.Meta ( Executable(..) )
import Data.Array.Accelerate.LLVM.Native.Execute.LBS
import Data.Word ( Word8 )
import Control.Monad.State ( gets )
import Control.Monad.Trans ( liftIO )
import Prelude hiding ( map, sum, scanl, scanr, init )
import qualified Prelude as P
import Foreign.C
import Foreign.LibFFI ( Arg )
import Foreign.Ptr
instance Execute Native where
map = simpleOp
generate = simpleOp
transform = simpleOp
backpermute = simpleOp
fold = foldOp
fold1 = fold1Op
foldSeg = foldSegOp
fold1Seg = foldSegOp
scanl = scanOp
scanl1 = scan1Op
scanl' = scan'Op
scanr = scanOp
scanr1 = scan1Op
scanr' = scan'Op
permute = permuteOp
stencil1 = stencil1Op
stencil2 = stencil2Op
simpleOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh
-> LLVM Native (Array sh e)
simpleOp NativeR{..} gamma aenv () sh = do
Native{..} <- gets llvmTarget
liftIO $ do
out <- allocateArray sh
executeMain executableR $ \f ->
executeOp defaultLargePPT fillP f gamma aenv (IE 0 (size sh)) out
return out
simpleNamed
:: (Shape sh, Elt e)
=> String
-> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh
-> LLVM Native (Array sh e)
simpleNamed fun NativeR{..} gamma aenv () sh = do
Native{..} <- gets llvmTarget
liftIO $ do
out <- allocateArray sh
execute executableR fun $ \f ->
executeOp defaultLargePPT fillP f gamma aenv (IE 0 (size sh)) out
return out
fold1Op
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> LLVM Native (Array sh e)
fold1Op kernel gamma aenv stream sh@(sx :. sz)
= $boundsCheck "fold1" "empty array" (sz > 0)
$ case size sh of
0 -> liftIO $ allocateArray sx
_ -> foldCore kernel gamma aenv stream sh
foldOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> LLVM Native (Array sh e)
foldOp kernel gamma aenv stream sh@(sx :. _) =
case size sh of
0 -> simpleNamed "generate" kernel gamma aenv stream (listToShape (P.map (max 1) (shapeToList sx)))
_ -> foldCore kernel gamma aenv stream sh
foldCore
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> LLVM Native (Array sh e)
foldCore kernel gamma aenv stream sh
| Just Refl <- matchShapeType sh (undefined::DIM1)
= foldAllOp kernel gamma aenv stream sh
| otherwise
= foldDimOp kernel gamma aenv stream sh
foldAllOp
:: forall aenv e. Elt e
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> DIM1
-> LLVM Native (Scalar e)
foldAllOp NativeR{..} gamma aenv () (Z :. sz) = do
Native{..} <- gets llvmTarget
let
ncpu = gangSize
stride = defaultLargePPT `min` ((sz + ncpu - 1) `quot` ncpu)
steps = (sz + stride - 1) `quot` stride
if ncpu == 1 || sz <= defaultLargePPT
then liftIO $ do
out <- allocateArray Z
execute executableR "foldAllS" $ \f ->
executeOp 1 fillS f gamma aenv (IE 0 sz) out
return out
else liftIO $ do
out <- allocateArray Z
tmp <- allocateArray (Z :. steps) :: IO (Vector e)
execute executableR "foldAllP1" $ \f1 -> do
execute executableR "foldAllP2" $ \f2 -> do
executeOp 1 fillP f1 gamma aenv (IE 0 steps) (sz, stride, tmp)
executeOp 1 fillS f2 gamma aenv (IE 0 steps) (tmp, out)
return out
foldDimOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> LLVM Native (Array sh e)
foldDimOp NativeR{..} gamma aenv () (sh :. sz) = do
Native{..} <- gets llvmTarget
let ppt = defaultSmallPPT `max` (defaultLargePPT `quot` (max 1 sz))
liftIO $ do
out <- allocateArray sh
executeMain executableR $ \f ->
executeOp ppt fillP f gamma aenv (IE 0 (size sh)) (sz, out)
return out
foldSegOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> (Z :. Int)
-> LLVM Native (Array (sh :. Int) e)
foldSegOp NativeR{..} gamma aenv () (sh :. _) (Z :. ss) = do
Native{..} <- gets llvmTarget
let
ncpu = gangSize
kernel | ncpu == 1 = "foldSegS"
| otherwise = "foldSegP"
n | ncpu == 1 = ss
| otherwise = ss - 1
ppt = n
liftIO $ do
out <- allocateArray (sh :. n)
execute executableR kernel $ \f ->
executeOp ppt fillP f gamma aenv (IE 0 (size (sh :. n))) out
return out
scanOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh :. Int
-> LLVM Native (Array (sh:.Int) e)
scanOp kernel gamma aenv stream (sz :. n) =
case n of
0 -> simpleNamed "generate" kernel gamma aenv stream (sz :. 1)
_ -> scanCore kernel gamma aenv stream sz n (n+1)
scan1Op
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh :. Int
-> LLVM Native (Array (sh:.Int) e)
scan1Op kernel gamma aenv stream (sz :. n)
= $boundsCheck "scan1" "empty array" (n > 0)
$ scanCore kernel gamma aenv stream sz n n
scanCore
:: forall aenv sh e. (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh
-> Int
-> Int
-> LLVM Native (Array (sh:.Int) e)
scanCore NativeR{..} gamma aenv () sz n m = do
Native{..} <- gets llvmTarget
let
ncpu = gangSize
stride = defaultLargePPT `min` ((n + ncpu - 1) `quot` ncpu)
steps = (n + stride - 1) `quot` stride
steps' = steps - 1
if ncpu == 1 || rank sz > 0 || n <= 2 * defaultLargePPT
then liftIO $ do
out <- allocateArray (sz :. m)
execute executableR "scanS" $ \f ->
executeOp 1 fillP f gamma aenv (IE 0 (size sz)) out
return out
else liftIO $ do
out <- allocateArray (sz :. m)
tmp <- allocateArray (Z :. steps) :: IO (Vector e)
execute executableR "scanP1" $ \f1 -> do
execute executableR "scanP2" $ \f2 -> do
execute executableR "scanP3" $ \f3 -> do
executeOp 1 fillP f1 gamma aenv (IE 0 steps) (stride, steps', out, tmp)
executeOp 1 fillS f2 gamma aenv (IE 0 steps) tmp
executeOp 1 fillP f3 gamma aenv (IE 0 steps') (stride, out, tmp)
return out
scan'Op
:: forall aenv sh e. (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh :. Int
-> LLVM Native (Array (sh:.Int) e, Array sh e)
scan'Op native gamma aenv stream sh@(sz :. n) =
case n of
0 -> do
out <- liftIO $ allocateArray (sz :. 0)
sum <- simpleNamed "generate" native gamma aenv stream sz
return (out, sum)
_ -> scan'Core native gamma aenv stream sh
scan'Core
:: forall aenv sh e. (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh :. Int
-> LLVM Native (Array (sh:.Int) e, Array sh e)
scan'Core NativeR{..} gamma aenv () sh@(sz :. n) = do
Native{..} <- gets llvmTarget
let
ncpu = gangSize
stride = defaultLargePPT `min` ((n + ncpu - 1) `quot` ncpu)
steps = (n + stride - 1) `quot` stride
steps' = steps - 1
if ncpu == 1 || rank sz > 0 || n <= 2 * defaultLargePPT
then liftIO $ do
out <- allocateArray sh
sum <- allocateArray sz
execute executableR "scanS" $ \f ->
executeOp 1 fillP f gamma aenv (IE 0 (size sz)) (out,sum)
return (out,sum)
else liftIO $ do
tmp <- allocateArray (Z :. steps) :: IO (Vector e)
out <- allocateArray sh
sum <- allocateArray sz
execute executableR "scanP1" $ \f1 -> do
execute executableR "scanP2" $ \f2 -> do
execute executableR "scanP3" $ \f3 -> do
executeOp 1 fillP f1 gamma aenv (IE 0 steps) (stride, steps', out, tmp)
executeOp 1 fillS f2 gamma aenv (IE 0 steps) (sum, tmp)
executeOp 1 fillP f3 gamma aenv (IE 0 steps') (stride, out, tmp)
return (out,sum)
permuteOp
:: (Shape sh, Shape sh', Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> Bool
-> sh
-> Array sh' e
-> LLVM Native (Array sh' e)
permuteOp NativeR{..} gamma aenv () inplace shIn dfs = do
Native{..} <- gets llvmTarget
out <- if inplace
then return dfs
else cloneArray dfs
let
ncpu = gangSize
n = size shIn
m = size (shape out)
if ncpu == 1 || n <= defaultLargePPT
then liftIO $ do
execute executableR "permuteS" $ \f ->
executeOp 1 fillS f gamma aenv (IE 0 n) out
else liftIO $ do
symbols <- nm executableR
if "permuteP_rmw" `elem` symbols
then do
execute executableR "permuteP_rmw" $ \f ->
executeOp defaultLargePPT fillP f gamma aenv (IE 0 n) out
else do
barrier@(Array _ adb) <- allocateArray (Z :. m) :: IO (Vector Word8)
memset (ptrsOfArrayData adb) 0 m
execute executableR "permuteP_mutex" $ \f ->
executeOp defaultLargePPT fillP f gamma aenv (IE 0 n) (out, barrier)
return out
stencil1Op
:: (Shape sh, Elt b)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> Array sh a
-> LLVM Native (Array sh b)
stencil1Op kernel gamma aenv stream arr =
simpleOp kernel gamma aenv stream (shape arr)
stencil2Op
:: (Shape sh, Elt c)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> Array sh a
-> Array sh b
-> LLVM Native (Array sh c)
stencil2Op kernel gamma aenv stream arr brr =
simpleOp kernel gamma aenv stream (shape arr `intersect` shape brr)
executeOp
:: Marshalable args
=> Int
-> Executable
-> (String, [Arg] -> IO ())
-> Gamma aenv
-> Aval aenv
-> Range
-> args
-> IO ()
executeOp ppt exe (name, f) gamma aenv r args =
runExecutable exe name ppt r $ \start end _tid ->
monitorProcTime $
f =<< marshal (undefined::Native) () (start, end, args, (gamma, aenv))
memset :: Ptr Word8 -> Word8 -> Int -> IO ()
memset p w s = c_memset p (fromIntegral w) (fromIntegral s) >> return ()
foreign import ccall unsafe "string.h memset" c_memset
:: Ptr Word8 -> CInt -> CSize -> IO (Ptr Word8)
monitorProcTime :: IO a -> IO a
monitorProcTime = Debug.withProcessor Debug.Native