{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Data.Array.Accelerate.LLVM.Native.Execute (
executeAcc, executeAfun,
executeOpenAcc
) where
import Data.Array.Accelerate.Analysis.Match
import Data.Array.Accelerate.Array.Sugar
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.LLVM.Analysis.Match
import Data.Array.Accelerate.LLVM.Execute
import Data.Array.Accelerate.LLVM.State
import Data.Array.Accelerate.LLVM.Native.Array.Data
import Data.Array.Accelerate.LLVM.Native.Link
import Data.Array.Accelerate.LLVM.Native.Execute.Async
import Data.Array.Accelerate.LLVM.Native.Execute.Environment
import Data.Array.Accelerate.LLVM.Native.Execute.Marshal
import Data.Array.Accelerate.LLVM.Native.Target
import qualified Data.Array.Accelerate.LLVM.Native.Debug as Debug
import Data.Range.Range ( Range(..) )
import Control.Parallel.Meta ( Executable(..) )
import Data.Array.Accelerate.LLVM.Native.Execute.LBS
import Control.Monad.State ( gets )
import Control.Monad.Trans ( liftIO )
import Data.ByteString.Short ( ShortByteString )
import Data.List ( find )
import Data.Maybe ( fromMaybe )
import Data.Word ( Word8 )
import Prelude hiding ( map, sum, scanl, scanr, init )
import qualified Data.ByteString.Short.Char8 as S8
import qualified Prelude as P
import Foreign.C
import Foreign.LibFFI
import Foreign.Ptr
instance Execute Native where
map = simpleOp
generate = simpleOp
transform = simpleOp
backpermute = simpleOp
fold = foldOp
fold1 = fold1Op
foldSeg = foldSegOp
fold1Seg = foldSegOp
scanl = scanOp
scanl1 = scan1Op
scanl' = scan'Op
scanr = scanOp
scanr1 = scan1Op
scanr' = scan'Op
permute = permuteOp
stencil1 = stencil1Op
stencil2 = stencil2Op
simpleOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh
-> LLVM Native (Array sh e)
simpleOp exe gamma aenv () sh = withExecutable exe $ \nativeExecutable -> do
let fun = case functionTable nativeExecutable of
f:_ -> f
_ -> $internalError "simpleOp" "no functions found"
Native{..} <- gets llvmTarget
liftIO $ do
out <- allocateArray sh
executeOp defaultLargePPT fillP fun gamma aenv (IE 0 (size sh)) out
return out
simpleNamed
:: (Shape sh, Elt e)
=> ShortByteString
-> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh
-> LLVM Native (Array sh e)
simpleNamed name exe gamma aenv () sh = withExecutable exe $ \nativeExecutable -> do
Native{..} <- gets llvmTarget
liftIO $ do
out <- allocateArray sh
executeOp defaultLargePPT fillP (nativeExecutable !# name) gamma aenv (IE 0 (size sh)) out
return out
fold1Op
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> LLVM Native (Array sh e)
fold1Op kernel gamma aenv stream sh@(sx :. sz)
= $boundsCheck "fold1" "empty array" (sz > 0)
$ case size sh of
0 -> liftIO $ allocateArray sx
_ -> foldCore kernel gamma aenv stream sh
foldOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> LLVM Native (Array sh e)
foldOp kernel gamma aenv stream sh@(sx :. _) =
case size sh of
0 -> simpleNamed "generate" kernel gamma aenv stream (listToShape (P.map (max 1) (shapeToList sx)))
_ -> foldCore kernel gamma aenv stream sh
foldCore
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> LLVM Native (Array sh e)
foldCore kernel gamma aenv stream sh
| Just Refl <- matchShapeType sh (undefined::DIM1)
= foldAllOp kernel gamma aenv stream sh
| otherwise
= foldDimOp kernel gamma aenv stream sh
foldAllOp
:: forall aenv e. Elt e
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> DIM1
-> LLVM Native (Scalar e)
foldAllOp exe gamma aenv () (Z :. sz) = withExecutable exe $ \nativeExecutable -> do
Native{..} <- gets llvmTarget
let
ncpu = gangSize
stride = defaultLargePPT `min` ((sz + ncpu - 1) `quot` ncpu)
steps = (sz + stride - 1) `quot` stride
if ncpu == 1 || sz <= defaultLargePPT
then liftIO $ do
out <- allocateArray Z
executeOp 1 fillS (nativeExecutable !# "foldAllS") gamma aenv (IE 0 sz) out
return out
else liftIO $ do
out <- allocateArray Z
tmp <- allocateArray (Z :. steps) :: IO (Vector e)
executeOp 1 fillP (nativeExecutable !# "foldAllP1") gamma aenv (IE 0 steps) (sz, stride, tmp)
executeOp 1 fillS (nativeExecutable !# "foldAllP2") gamma aenv (IE 0 steps) (tmp, out)
return out
foldDimOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> LLVM Native (Array sh e)
foldDimOp exe gamma aenv () (sh :. sz) = withExecutable exe $ \nativeExecutable -> do
Native{..} <- gets llvmTarget
let ppt = defaultSmallPPT `max` (defaultLargePPT `quot` (max 1 sz))
liftIO $ do
out <- allocateArray sh
executeOp ppt fillP (nativeExecutable !# "fold") gamma aenv (IE 0 (size sh)) (sz, out)
return out
foldSegOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> (sh :. Int)
-> (Z :. Int)
-> LLVM Native (Array (sh :. Int) e)
foldSegOp exe gamma aenv () (sh :. _) (Z :. ss) = withExecutable exe $ \nativeExecutable -> do
Native{..} <- gets llvmTarget
let
kernel | segmentOffset = "foldSegP"
| otherwise = "foldSegS"
n | segmentOffset = ss - 1
| otherwise = ss
ppt | rank sh == 0 = defaultLargePPT
| otherwise = n
liftIO $ do
out <- allocateArray (sh :. n)
executeOp ppt fillP (nativeExecutable !# kernel) gamma aenv (IE 0 (size (sh :. n))) out
return out
scanOp
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh :. Int
-> LLVM Native (Array (sh:.Int) e)
scanOp kernel gamma aenv stream (sz :. n) =
case n of
0 -> simpleNamed "generate" kernel gamma aenv stream (sz :. 1)
_ -> scanCore kernel gamma aenv stream sz n (n+1)
scan1Op
:: (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh :. Int
-> LLVM Native (Array (sh:.Int) e)
scan1Op kernel gamma aenv stream (sz :. n)
= $boundsCheck "scan1" "empty array" (n > 0)
$ scanCore kernel gamma aenv stream sz n n
scanCore
:: forall aenv sh e. (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh
-> Int
-> Int
-> LLVM Native (Array (sh:.Int) e)
scanCore exe gamma aenv () sz n m = withExecutable exe $ \nativeExecutable -> do
Native{..} <- gets llvmTarget
let
ncpu = gangSize
stride = defaultLargePPT `min` ((n + ncpu - 1) `quot` ncpu)
steps = (n + stride - 1) `quot` stride
steps' = steps - 1
if ncpu == 1 || rank sz > 0 || n <= 2 * defaultLargePPT
then liftIO $ do
out <- allocateArray (sz :. m)
executeOp 1 fillP (nativeExecutable !# "scanS") gamma aenv (IE 0 (size sz)) out
return out
else liftIO $ do
out <- allocateArray (sz :. m)
tmp <- allocateArray (Z :. steps) :: IO (Vector e)
executeOp 1 fillP (nativeExecutable !# "scanP1") gamma aenv (IE 0 steps) (stride, steps', out, tmp)
executeOp 1 fillS (nativeExecutable !# "scanP2") gamma aenv (IE 0 steps) tmp
executeOp 1 fillP (nativeExecutable !# "scanP3") gamma aenv (IE 0 steps') (stride, out, tmp)
return out
scan'Op
:: forall aenv sh e. (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh :. Int
-> LLVM Native (Array (sh:.Int) e, Array sh e)
scan'Op native gamma aenv stream sh@(sz :. n) =
case n of
0 -> do
out <- liftIO $ allocateArray (sz :. 0)
sum <- simpleNamed "generate" native gamma aenv stream sz
return (out, sum)
_ -> scan'Core native gamma aenv stream sh
scan'Core
:: forall aenv sh e. (Shape sh, Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> sh :. Int
-> LLVM Native (Array (sh:.Int) e, Array sh e)
scan'Core exe gamma aenv () sh@(sz :. n) = withExecutable exe $ \nativeExecutable -> do
Native{..} <- gets llvmTarget
let
ncpu = gangSize
stride = defaultLargePPT `min` ((n + ncpu - 1) `quot` ncpu)
steps = (n + stride - 1) `quot` stride
steps' = steps - 1
if ncpu == 1 || rank sz > 0 || n <= 2 * defaultLargePPT
then liftIO $ do
out <- allocateArray sh
sum <- allocateArray sz
executeOp 1 fillP (nativeExecutable !# "scanS") gamma aenv (IE 0 (size sz)) (out,sum)
return (out,sum)
else liftIO $ do
tmp <- allocateArray (Z :. steps) :: IO (Vector e)
out <- allocateArray sh
sum <- allocateArray sz
executeOp 1 fillP (nativeExecutable !# "scanP1") gamma aenv (IE 0 steps) (stride, steps', out, tmp)
executeOp 1 fillS (nativeExecutable !# "scanP2") gamma aenv (IE 0 steps) (sum, tmp)
executeOp 1 fillP (nativeExecutable !# "scanP3") gamma aenv (IE 0 steps') (stride, out, tmp)
return (out,sum)
permuteOp
:: (Shape sh, Shape sh', Elt e)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> Bool
-> sh
-> Array sh' e
-> LLVM Native (Array sh' e)
permuteOp exe gamma aenv () inplace shIn dfs = withExecutable exe $ \nativeExecutable -> do
Native{..} <- gets llvmTarget
out <- if inplace
then return dfs
else cloneArray dfs
let
ncpu = gangSize
n = size shIn
m = size (shape out)
if ncpu == 1 || n <= defaultLargePPT
then liftIO $ do
executeOp 1 fillS (nativeExecutable !# "permuteS") gamma aenv (IE 0 n) out
else liftIO $ do
case lookupFunction "permuteP_rmw" nativeExecutable of
Just f -> executeOp defaultLargePPT fillP f gamma aenv (IE 0 n) out
Nothing -> do
barrier@(Array _ adb) <- allocateArray (Z :. m) :: IO (Vector Word8)
memset (ptrsOfArrayData adb) 0 m
executeOp defaultLargePPT fillP (nativeExecutable !# "permuteP_mutex") gamma aenv (IE 0 n) (out, barrier)
return out
stencil1Op
:: (Shape sh, Elt b)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> Array sh a
-> LLVM Native (Array sh b)
stencil1Op kernel gamma aenv stream arr =
simpleOp kernel gamma aenv stream (shape arr)
stencil2Op
:: (Shape sh, Elt c)
=> ExecutableR Native
-> Gamma aenv
-> Aval aenv
-> Stream
-> Array sh a
-> Array sh b
-> LLVM Native (Array sh c)
stencil2Op kernel gamma aenv stream arr brr =
simpleOp kernel gamma aenv stream (shape arr `intersect` shape brr)
(!#) :: FunctionTable -> ShortByteString -> Function
(!#) exe name
= fromMaybe ($internalError "lookupFunction" ("function not found: " ++ S8.unpack name))
$ lookupFunction name exe
lookupFunction :: ShortByteString -> FunctionTable -> Maybe Function
lookupFunction name nativeExecutable = do
find (\(n,_) -> n == name) (functionTable nativeExecutable)
executeOp
:: Marshalable args
=> Int
-> Executable
-> Function
-> Gamma aenv
-> Aval aenv
-> Range
-> args
-> IO ()
executeOp ppt exe (name, f) gamma aenv r args =
runExecutable exe name ppt r $ \start end _tid ->
monitorProcTime $
callFFI f retVoid =<< marshal (undefined::Native) () (start, end, args, (gamma, aenv))
memset :: Ptr Word8 -> Word8 -> Int -> IO ()
memset p w s = c_memset p (fromIntegral w) (fromIntegral s) >> return ()
foreign import ccall unsafe "string.h memset" c_memset
:: Ptr Word8 -> CInt -> CSize -> IO (Ptr Word8)
monitorProcTime :: IO a -> IO a
monitorProcTime = Debug.withProcessor Debug.Native