{-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE PackageImports #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {- Joel Svensson 2013..2015 -} module Obsidian.CodeGen.CompileIM where import Language.C.Quote.CUDA hiding (Block) import qualified "language-c-quote" Language.C.Syntax as C import Obsidian.Exp (IExp(..),IBinOp(..),IUnOp(..)) import Obsidian.Types as T import Obsidian.DimSpec import Obsidian.CodeGen.Program import Data.Word {- Notes: * Generate only CUDA * TODOs - Currently inserts some pointless code The values of tid,wid etc are updated before certain code blocks and reset to a standard value after, often to just be updated again to something else. It may result in prettier code if all code blocks that use a variable sets it to what it should be rather than expecting "standard". *** probably has no effect on performance *** -} --------------------------------------------------------------------------- -- Config --------------------------------------------------------------------------- data Config = Config { configThreadsPerBlock :: Word32, configSharedMem :: Word32} --------------------------------------------------------------------------- -- compileExp (maybe a bad name) --------------------------------------------------------------------------- compileExp :: IExp -> C.Exp compileExp (IVar name t) = [cexp| $id:name |] -- TODO: Fix all this! -- compileExp (IBlockIdx X) = [cexp| $id:("bid")|] -- [cexp| $id:("blockIdx.x") |] -- compileExp (IBlockIdx Y) = [cexp| $id:("blockIdx.y") |] -- compileExp (IBlockIdx Z) = [cexp| $id:("blockIdx.z") |] -- compileExp (IThreadIdx X) = [cexp| $id:("threadIdx.x") |] -- compileExp (IThreadIdx Y) = [cexp| $id:("threadIdx.y") |] -- compileExp (IThreadIdx Z) = [cexp| $id:("threadIdx.z") |] -- compileExp (IBlockDim X) = [cexp| $id:("blockDim.x") |] -- compileExp (IBlockDim Y) = [cexp| $id:("blockDim.y") |] -- compileExp (IBlockDim Z) = [cexp| $id:("blockDim.z") |] -- compileExp (IGridDim X) = [cexp| $id:("GridDim.x") |] -- compileExp (IGridDim Y) = [cexp| $id:("GridDim.y") |] -- compileExp (IGridDim Z) = [cexp| $id:("GridDim.z") |] compileExp (IBool True) = [cexp|1|] compileExp (IBool False) = [cexp|0|] compileExp (IInt8 n) = [cexp| $int:(toInteger n) |] compileExp (IInt16 n) = [cexp| $int:(toInteger n) |] compileExp (IInt32 n) = [cexp| $int:(toInteger n) |] compileExp (IInt64 n) = [cexp| $lint:(toInteger n) |] compileExp (IWord8 n) = [cexp| $uint:(toInteger n) |] compileExp (IWord16 n) = [cexp| $uint:(toInteger n) |] compileExp (IWord32 n) = [cexp| $uint:(toInteger n) |] compileExp (IWord64 n) = [cexp| $ulint:(toInteger n) |] compileExp (IFloat n) = [cexp| $float:(toRational n) |] compileExp (IDouble n) = [cexp| $double:(toRational n) |] -- Implementing these may be a bit awkward -- given there are no vector literals in cuda. compileExp (IFloat2 n m) = error "IFloat2 unhandled" compileExp (IFloat3 n m l) = error "IFloat3 unhandled" compileExp (IFloat4 n m l k) = error "IFloat4 unhandled" compileExp (IDouble2 n m) = error "IDouble2 unhandled" compileExp (IInt8_2 n m) = error "FIXME" compileExp (IInt8_3 n m k) = error "FIXME" compileExp (IInt8_4 n m k l) = error "FIXME" compileExp (IInt16_2 n m ) = error "FIXME" compileExp (IInt16_3 n m k) = error "FIXME" compileExp (IInt16_4 n m k l) = error "FIXME" compileExp (IInt32_2 n m) = error "FIXME" compileExp (IInt32_3 n m k) = error "FIXME" compileExp (IInt32_4 n m k l) = error "FIXME" compileExp (IInt64_2 n m) = error "FIXME" compileExp (IInt64_3 n m k) = error "FIXME" compileExp (IInt64_4 n m k l) = error "FIXME" compileExp (IWord8_2 n m) = error "FIXME" compileExp (IWord8_3 n m k) = error "FIXME" compileExp (IWord8_4 n m k l) = error "FIXME" compileExp (IWord16_2 n m ) = error "FIXME" compileExp (IWord16_3 n m k) = error "FIXME" compileExp (IWord16_4 n m k l) = error "FIXME" compileExp (IWord32_2 n m) = error "FIXME" compileExp (IWord32_3 n m k) = error "FIXME" compileExp (IWord32_4 n m k l) = error "FIXME" compileExp (IWord64_2 n m) = error "FIXME" compileExp (IWord64_3 n m k) = error "FIXME" compileExp (IWord64_4 n m k l) = error "FIXME" compileExp (IIndex (i1,[e]) t) = [cexp| $(compileExp i1)[$(compileExp e)] |] compileExp a@(IIndex (_,_) _) = error $ "compileExp: Malformed index expression " ++ show a compileExp (ICond e1 e2 e3 t) = [cexp| $(compileExp e1) ? $(compileExp e2) : $(compileExp e3) |] compileExp (IBinOp op e1 e2 t) = go op where x = compileExp e1 y = compileExp e2 go IAdd = [cexp| $x + $y |] go ISub = [cexp| $x - $y |] go IMul = [cexp| $x * $y |] go IDiv = [cexp| $x / $y |] go IMod = [cexp| $x % $y |] go IEq = [cexp| $x == $y |] go INotEq = [cexp| $x != $y |] go ILt = [cexp| $x < $y |] go IGt = [cexp| $x > $y |] go IGEq = [cexp| $x >= $y |] go ILEq = [cexp| $x <= $y |] go IAnd = [cexp| $x && $y |] go IOr = [cexp| $x || $y |] go IPow = case t of Float -> [cexp|powf($x,$y) |] Double -> [cexp|pow($x,$y) |] _ -> error $ "IPow applied at wrong type" go IBitwiseAnd = [cexp| $x & $y |] go IBitwiseOr = [cexp| $x | $y |] go IBitwiseXor = [cexp| $x ^ $y |] go IShiftL = [cexp| $x << $y |] go IShiftR = [cexp| $x >> $y |] compileExp (IUnOp op e t) = go op where x = compileExp e go IBitwiseNeg = [cexp| ~$x|] go INot = [cexp| !$x|] go IGetX = [cexp| $x.x|] go IGetY = [cexp| $x.y|] go IGetZ = [cexp| $x.z|] go IGetW = [cexp| $x.w|] compileExp (IFunCall name es t) = [cexp| $fc |] where es' = map compileExp es fc = [cexp| $id:(name)($args:(es')) |] compileExp (ICast e t) = [cexp| ($ty:(compileType t)) $e' |] where e' = compileExp e compileType :: T.Type -> C.Type compileType (Int8) = [cty| typename int8_t |] compileType (Int16) = [cty| typename int16_t |] compileType (Int32) = [cty| typename int32_t |] compileType (Int64) = [cty| typename int64_t |] compileType (Word8) = [cty| typename uint8_t |] compileType (Word16) = [cty| typename uint16_t |] compileType (Word32) = [cty| typename uint32_t |] compileType (Word64) = [cty| typename uint64_t |] compileType (Float) = [cty| float |] compileType (Double) = [cty| double |] compileType (Vec2 Float) = [cty| float4|] compileType (Vec3 Float) = [cty| float3|] compileType (Vec4 Float) = [cty| float2|] compileType (Vec2 Double) = [cty| double2|] -- How does this interplay with my use of uint8_t etc. Here it is char! compileType (Vec2 Int8) = [cty| char2|] compileType (Vec3 Int8) = [cty| char3|] compileType (Vec4 Int8) = [cty| char4|] compileType (Vec2 Int16) = [cty| short2|] compileType (Vec3 Int16) = [cty| short3|] compileType (Vec4 Int16) = [cty| short4|] compileType (Vec2 Int32) = [cty| int2|] compileType (Vec3 Int32) = [cty| int3|] compileType (Vec4 Int32) = [cty| int4|] compileType (Vec2 Word8) = [cty| uchar2|] compileType (Vec3 Word8) = [cty| uchar3|] compileType (Vec4 Word8) = [cty| uchar4|] compileType (Vec2 Word16) = [cty| ushort2|] compileType (Vec3 Word16) = [cty| ushort3|] compileType (Vec4 Word16) = [cty| ushort4|] compileType (Vec2 Word32) = [cty| uint2|] compileType (Vec3 Word32) = [cty| uint3|] compileType (Vec4 Word32) = [cty| uint4|] compileType (Shared t) = [cty| __shared__ $ty:(compileType t) |] compileType (Pointer t) = [cty| $ty:(compileType t)* |] compileType (Volatile t) = [cty| volatile $ty:(compileType t)|] compileType t = error $ "compileType: Not implemented " ++ show t {- *** THIS IS SOLVED ! *** Solve the volatile issue. When operating in a warp without syncs Add Volatile to pointers and only there! Need a way to convey this information all the way from the force functions down to the code generator. Also, Change code generation to declare names of pointers into shared memory at different types (that are used in the program) __shared__ uint8_t sbase[X] uint32_t *sbaseU32 = sbase; float *sbaseF = sbase; This changes how offsets are computed within code. May need names for each intermediate array used in the program __shared__ uint8_t sbase[X] uint32_t *arr1 = sbase; uint32_t *arr2 = (sbase + 16U); float *arr3 = sbase; float *arr4 = (sbase + 4U); -} --------------------------------------------------------------------------- -- Statement t to Stm --------------------------------------------------------------------------- compileStm :: Config -> Statement t -> [C.Stm] compileStm c (SAssign name [] e) = [[cstm| $(compileExp name) = $(compileExp e);|]] compileStm c (SAssign name [ix] e) = [[cstm| $(compileExp name)[$(compileExp ix)] = $(compileExp e); |]] compileStm c (SAtomicOp name ix atop) = case atop of AtInc -> [[cstm| atomicInc(&$(compileExp name)[$(compileExp ix)],0xFFFFFFFF); |]] AtAdd e -> [[cstm| atomicAdd(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] AtSub e -> [[cstm| atomicSub(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] AtExch e -> [[cstm| atomicExch(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] compileStm c (SCond be im) = [[cstm| if ($(compileExp be)) { $stms:body } |]] where body = compileIM c im -- (compileIM p c im) compileStm c (SSeqFor loopVar n im) = [[cstm| for (int $id:loopVar = 0; $id:loopVar < $(compileExp n); ++$id:loopVar) { $stms:body } |]] where body = compileIM c im -- (compileIM p c im) -- Just relay to specific compileFunction compileStm c a@(SForAll lvl n im) = compileForAll c a compileStm c a@(SDistrPar lvl n im) = compileDistr c a compileStm c (SSeqWhile b im) = [[cstm| while ($(compileExp b)) { $stms:body}|]] where body = compileIM c im compileStm c SSynchronize = [[cstm| __syncthreads(); |]] compileStm _ (SAllocate _ _ _) = [] compileStm _ (SDeclare name t) = [] compileStm _ a = error $ "compileStm: missing case " --------------------------------------------------------------------------- -- DistrPar --------------------------------------------------------------------------- compileDistr :: Config -> Statement t -> [C.Stm] compileDistr c (SDistrPar Block n im) = codeQ ++ codeR -- New here is BLOCK virtualisation where cim = compileIM c im -- ++ [[cstm| __syncthreads();|]] numBlocks = [cexp| $id:("gridDim.x") |] blocksQ = [cexp| $exp:(compileExp n) / $exp:numBlocks|] blocksR = [cexp| $exp:(compileExp n) % $exp:numBlocks|] codeQ = [[cstm| for (int b = 0; b < $exp:blocksQ; ++b) { $stms:bodyQ }|]] bodyQ = [cstm| $id:("bid") = blockIdx.x * $exp:blocksQ + b;|] : cim ++ [[cstm| bid = blockIdx.x;|], [cstm| __syncthreads();|]] -- yes no ? codeR = [[cstm| bid = ($exp:numBlocks * $exp:blocksQ) + blockIdx.x;|], [cstm| if (blockIdx.x < $exp:blocksR) { $stms:cim }|], [cstm| bid = blockIdx.x;|], [cstm| __syncthreads();|]] -- yes no ? -- Can I be absolutely sure that 'n' here is statically known ? -- I must look over the functions that can potentially create this IM. -- Can make a separate case for unknown 'n' but generate worse code. -- (That is true for all levels) compileDistr c (SDistrPar Warp (IWord32 n) im) = codeQ ++ codeR -- Here the 'im' should be distributed over 'n'warps. -- 'im' uses a warpID variable to identify what warp it is. -- 'n' may be higher than the actual number of warps we have! -- So GPU warp virtualisation is needed. where cim = compileIM c im nWarps = fromIntegral $ configThreadsPerBlock c `div` 32 numWarps = [cexp| $int:nWarps|] warpsQ = [cexp| $int:(n `div` nWarps)|] warpsR = [cexp| $int:(n `mod` nWarps)|] codeQ = [[cstm| for (int w = 0; w < $exp:warpsQ; ++w) { $stms:bodyQ } |]] bodyQ = [cstm| warpID = (threadIdx.x / 32) * $exp:warpsQ + w;|] : cim ++ --[cstm| warpID = w * $exp:warpsQ + (threadIdx.x / 32);|] : cim ++ [[cstm| warpID = threadIdx.x / 32;|]] codeR = case (n `mod` nWarps) of 0 -> [] n -> [[cstm| warpID = ($exp:numWarps * $exp:warpsQ)+ (threadIdx.x / 32);|], [cstm| if (threadIdx.x / 32 < $exp:warpsR) { $stms:cim } |], [cstm| warpID = threadIdx.x / 32; |], [cstm| __syncthreads();|]] --------------------------------------------------------------------------- -- ForAll --------------------------------------------------------------------------- compileForAll :: Config -> Statement t -> [C.Stm] compileForAll c (SForAll Warp (IWord32 n) im) = codeQ ++ codeR where nt = 32 q = n `div` nt r = n `mod` nt cim = compileIM c im codeQ = case q of 0 -> [] 1 -> cim n -> [[cstm| for ( int vw = 0; vw < $int:q; ++vw) { $stms:body } |], [cstm| $id:("warpIx") = threadIdx.x % 32; |]] --[cstm| __syncthreads();|]] where body = [cstm|$id:("warpIx") = vw*$int:nt + (threadIdx.x % 32); |] : cim --body = [cstm|$id:("warpIx") = (threadIdx.x % 32) * q + vw; |] : cim codeR = case r of 0 -> [] n -> [[cstm| if ((threadIdx.x % 32) < $int:r) { $id:("warpIx") = $int:(q*32) + (threadIdx.x % 32); $stms:cim } |], -- [cstm| __syncthreads();|], [cstm| $id:("warpIx") = threadIdx.x % 32; |]] compileForAll c (SForAll Block (IWord32 n) im) = goQ ++ goR where cim = compileIM c im -- ++ [[cstm| __syncthreads();|]] nt = configThreadsPerBlock c q = n `quot` nt r = n `rem` nt -- q is the number full "passes" needed to cover the iteration -- space given we have nt threads. goQ = case q of 0 -> [] 1 -> cim -- [cstm|$id:loopVar = threadIdx.x; |]:cim --do -- stm <- updateTid [cexp| threadIdx.x |] -- return $ [cstm| $id:loopVar = threadIdx.x; |] : cim n -> [[cstm| for ( int i = 0; i < $int:q; ++i) { $stms:body } |], -- __syncthreads(); } |], [cstm| $id:("tid") = threadIdx.x; |]] -- [cstm| __syncthreads();|]] where body = [cstm|$id:("tid") = i*$int:nt + threadIdx.x; |] : cim -- r is the number of elements left. -- This generates code for when fewer threads are -- needed than available. (some threads shut down due to the conditional). goR = case (r,q) of (0,_) -> [] --(n,0) -> [[cstm| if (threadIdx.x < $int:n) { -- $stms:cim } |]] (n,m) -> [[cstm| if (threadIdx.x < $int:n) { $id:("tid") = $int:(q*nt) + threadIdx.x; $stms:cim } |], [cstm| $id:("tid") = threadIdx.x; |]] compileForAll c (SForAll Grid n im) = error "compileForAll: Grid" -- cim -- The grid case is special. May need more thought -- -- The problem with this case is that -- I need to come up with a blocksize (but without any guidance) -- from the programmer. -- Though! There is no way the programmer could provide any -- such info ... where cim = compileIM c im -- compileForAll PlatformC c (SForAll lvl (IWord32 n) im) = go -- where -- body = compileIM PlatformC c im -- go = [ [cstm| for (int i = 0; i <$int:n; ++i) { $stms:body } |] ] --------------------------------------------------------------------------- -- CompileIM to list of Stm --------------------------------------------------------------------------- compileIM :: Config -> IMList a -> [C.Stm] compileIM conf im = concatMap ((compileStm conf) . fst) im --------------------------------------------------------------------------- -- Generate entire Kernel --------------------------------------------------------------------------- type Parameters = [(String,T.Type)] compile :: Config -> String -> (Parameters,IMList a) -> C.Definition compile config kname (params,im) = go where stms = compileIM config im ps = compileParams params go = [cedecl| extern "C" __global__ void $id:kname($params:ps) {$items:cudabody} |] -- go PlatformOpenCL -- = [CL.cedecl| __kernel void $id:kname($params:ps) {$stms:stms} |] -- go PlatformC -- = [cedecl| extern "C" void $id:kname($params:ps) {$items:cbody} |] cudabody = (if (configSharedMem config > 0) -- then [BlockDecl [cdecl| extern volatile __shared__ typename uint8_t sbase[]; |]] then [C.BlockDecl [cdecl| __shared__ typename uint8_t sbase[$uint:(configSharedMem config)] ; |]] else []) ++ --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++ --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], -- BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++ -- [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++ (if (usesGid im) then [C.BlockDecl [cdecl| typename uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; |]] else []) ++ (if (usesBid im) then [C.BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] else []) ++ (if (usesTid im) then [C.BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] else []) ++ (if (usesWarps im) then [C.BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], C.BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] else []) ++ -- All variables used will be unique and can be declared -- at the top level concatMap declares im ++ -- Not sure if I am using language.C correctly. -- Maybe compileSTM should create BlockStms ? -- TODO: look how Nikola does it. map C.BlockStm stms cbody = -- add memory allocation map C.BlockStm stms -- Declare variables. declares :: (Statement t,t) -> [C.BlockItem] declares (SDeclare name t,_) = [C.BlockDecl [cdecl| $ty:(compileType t) $id:name;|]] declares (SCond _ im,_) = concatMap declares im declares (SSeqWhile _ im,_) = concatMap declares im declares (SForAll _ _ im,_) = concatMap declares im declares (SDistrPar _ _ im,_) = concatMap declares im declares (SSeqFor _ _ im,_) = concatMap declares im declares _ = [] --------------------------------------------------------------------------- -- Parameter lists for functions (kernel head) --------------------------------------------------------------------------- compileParams :: Parameters -> [C.Param] -- compileParams PlatformOpenCL = map go -- where -- go (name,Pointer t) = [CL.cparam| global $ty:(compileType t) $id:name |] -- go (name, t) = [CL.cparam| $ty:(compileType t) $id:name |] -- C or CUDA compileParams = map go where go (name,t) = [cparam| $ty:(compileType t) $id:name |] --------------------------------------------------------------------------- -- Compile with shared memory arrays declared at top --------------------------------------------------------------------------- -- CODE DUPLICATION FOR NOW compileDeclsTop :: Config -> [(String,((Word32,Word32),T.Type))] -> String -> (Parameters,IMList a) -> C.Definition compileDeclsTop config toplevelarrs kname (params,im) = go where stms = compileIM config im ps = compileParams params go = [cedecl| extern "C" __global__ void $id:kname($params:ps) {$items:cudabody} |] -- go PlatformOpenCL -- = [CL.cedecl| __kernel void $id:kname($params:ps) {$stms:stms} |] -- go PlatformC -- = [cedecl| extern "C" void $id:kname($params:ps) {$items:cbody} |] cudabody = (if (configSharedMem config > 0) -- then [BlockDecl [cdecl| extern volatile __shared__ typename uint8_t sbase[]; |]] then [C.BlockDecl [cdecl| __shared__ typename uint8_t sbase[$uint:(configSharedMem config)]; |]] else []) ++ --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++ --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], -- BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++ -- [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++ (if (usesGid im) then [C.BlockDecl [cdecl| typename uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; |]] else []) ++ (if (usesBid im) then [C.BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] else []) ++ (if (usesTid im) then [C.BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] else []) ++ (if (usesWarps im) then [C.BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], C.BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] else []) ++ -- declare all arrays used concatMap declareArr toplevelarrs ++ -- All variables used will be unique and can be declared -- at the top level concatMap declares im ++ -- Not sure if I am using language.C correctly. -- Maybe compileSTM should create BlockStms ? -- TODO: look how Nikola does it. map C.BlockStm stms cbody = -- add memory allocation map C.BlockStm stms declareArr :: (String, ((Word32,Word32),T.Type)) -> [C.BlockItem] declareArr (arr,((_,addr),t)) = [C.BlockDecl [cdecl| $ty:(compileType t) $id:arr = ($ty:(compileType t))(sbase + $int:addr);|]]