/* -----------------------------------------------------------------------------
 *
 * Module      : Extras
 * Copyright   : (c) [2009..2010] Trevor L. McDonell
 * License     : BSD
 *
 * ---------------------------------------------------------------------------*/

#ifndef __ACCELERATE_CUDA_EXTRAS_H__
#define __ACCELERATE_CUDA_EXTRAS_H__

#include <stdint.h>
#include <stdlib.h>
#include <math.h>

/* -----------------------------------------------------------------------------
 * Textures
 * -----------------------------------------------------------------------------
 *
 * CUDA texture definitions and access functions are defined in terms of
 * templates, and hence are only available through the C++ interface. Expose
 * some dummy wrappers to enable parsing with language-c.
 *
 * We don't have a definition for `Int' or `Word', since the bit-width of the
 * Haskell and C types may be different.
 *
 * NOTE ON 64-BIT TYPES
 * The CUDA device uses little-endian arithmetic. We have not accounted for
 * the fact that this may differ on the host, for either the initial data
 * transfer or the unpacking below.
 */
#if defined(__cplusplus) && defined(__CUDACC__)

typedef texture<uint2,    1> TexWord64;
typedef texture<uint32_t, 1> TexWord32;
typedef texture<uint16_t, 1> TexWord16;
typedef texture<uint8_t,  1> TexWord8;
typedef texture<int2,     1> TexInt64;
typedef texture<int32_t,  1> TexInt32;
typedef texture<int16_t,  1> TexInt16;
typedef texture<int8_t,   1> TexInt8;
typedef texture<float,    1> TexFloat;
typedef texture<char,     1> TexCChar;

static __inline__ __device__ uint8_t  indexArray(TexWord8  t, const int x) { return tex1Dfetch(t,x); }
static __inline__ __device__ uint16_t indexArray(TexWord16 t, const int x) { return tex1Dfetch(t,x); }
static __inline__ __device__ uint32_t indexArray(TexWord32 t, const int x) { return tex1Dfetch(t,x); }
static __inline__ __device__ uint64_t indexArray(TexWord64 t, const int x)
{
    union { uint2 x; uint64_t y; } v;
    v.x = tex1Dfetch(t,x);
    return v.y;
}

static __inline__ __device__ int8_t   indexArray(TexInt8   t, const int x) { return tex1Dfetch(t,x); }
static __inline__ __device__ int16_t  indexArray(TexInt16  t, const int x) { return tex1Dfetch(t,x); }
static __inline__ __device__ int32_t  indexArray(TexInt32  t, const int x) { return tex1Dfetch(t,x); }
static __inline__ __device__ int64_t  indexArray(TexInt64  t, const int x)
{
    union { int2 x; int64_t y; } v;
    v.x = tex1Dfetch(t,x);
    return v.y;
}

static __inline__ __device__ float    indexArray(TexFloat  t, const int x) { return tex1Dfetch(t,x); }
static __inline__ __device__ char     indexArray(TexCChar  t, const int x) { return tex1Dfetch(t,x); }

#if defined(__LP64__)
typedef TexInt64  TexCLong;
typedef TexWord64 TexCULong;
#else
typedef TexInt32  TexCLong;
typedef TexWord32 TexCULong;
#endif
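/*
 * Example (illustration only; the names `arr_in' and `copy64' are
 * hypothetical): a kernel might declare one texture reference per input
 * array, which the host binds with `cudaBindTexture' before launch, and then
 * read elements through the overloaded `indexArray'. Guarded out so that it
 * is never compiled.
 */
#if 0
static TexWord64 arr_in;

__global__ void copy64(uint64_t *d_out, const int n)
{
    const int ix = blockDim.x * blockIdx.x + threadIdx.x;
    if (ix < n)
        d_out[ix] = indexArray(arr_in, ix);    /* one 64-bit read, unpacked from uint2 */
}
#endif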
/*
 * Synonyms for C-types. NVCC will force Ints to be 32-bits.
 */
typedef TexInt8   TexCSChar;
typedef TexWord8  TexCUChar;
typedef TexInt16  TexCShort;
typedef TexWord16 TexCUShort;
typedef TexInt32  TexCInt;
typedef TexWord32 TexCUInt;
typedef TexInt64  TexCLLong;
typedef TexWord64 TexCULLong;
typedef TexFloat  TexCFloat;

/*
 * Doubles are only available when compiled for Compute 1.3 and greater
 */
#if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS)
typedef texture<int2, 1> TexDouble;
typedef TexDouble        TexCDouble;

static __inline__ __device__ double indexDArray(TexDouble t, const int x)
{
    int2 v = tex1Dfetch(t,x);
    return __hiloint2double(v.y, v.x);
}
#endif

#else

typedef void* TexWord64;
typedef void* TexWord32;
typedef void* TexWord16;
typedef void* TexWord8;
typedef void* TexInt64;
typedef void* TexInt32;
typedef void* TexInt16;
typedef void* TexInt8;
typedef void* TexCShort;
typedef void* TexCUShort;
typedef void* TexCInt;
typedef void* TexCUInt;
typedef void* TexCLong;
typedef void* TexCULong;
typedef void* TexCLLong;
typedef void* TexCULLong;
typedef void* TexFloat;
typedef void* TexDouble;
typedef void* TexCFloat;
typedef void* TexCDouble;
typedef void* TexCChar;
typedef void* TexCSChar;
typedef void* TexCUChar;

void* indexArray(const void*, const int);

#endif

/* -----------------------------------------------------------------------------
 * Shapes
 * -------------------------------------------------------------------------- */

typedef int32_t Ix;
typedef Ix DIM0;
typedef Ix DIM1;
typedef struct { Ix a1,a0; }          DIM2;
typedef struct { Ix a2,a1,a0; }       DIM3;
typedef struct { Ix a3,a2,a1,a0; }    DIM4;
typedef struct { Ix a4,a3,a2,a1,a0; } DIM5;

#ifdef __cplusplus

/*
 * Number of dimensions of a shape
 */
static __inline__ __device__ int dim(DIM1 sh) { return 1; }
static __inline__ __device__ int dim(DIM2 sh) { return 2; }
static __inline__ __device__ int dim(DIM3 sh) { return 3; }
static __inline__ __device__ int dim(DIM4 sh) { return 4; }
static __inline__ __device__ int dim(DIM5 sh) { return 5; }

/*
 * Yield the total number of elements in a shape
 */
static __inline__ __device__ int size(DIM1 sh) { return sh; }
static __inline__ __device__ int size(DIM2 sh) { return sh.a0 * sh.a1; }
static __inline__ __device__ int size(DIM3 sh) { return sh.a0 * sh.a1 * sh.a2; }
static __inline__ __device__ int size(DIM4 sh) { return sh.a0 * sh.a1 * sh.a2 * sh.a3; }
static __inline__ __device__ int size(DIM5 sh) { return sh.a0 * sh.a1 * sh.a2 * sh.a3 * sh.a4; }

/*
 * Convert the individual dimensions of a linear array into a shape
 */
static __inline__ __device__ DIM1 shape(Ix a)
{
    return a;
}

static __inline__ __device__ DIM2 shape(Ix b, Ix a)
{
    DIM2 sh = { b, a };
    return sh;
}

static __inline__ __device__ DIM3 shape(Ix c, Ix b, Ix a)
{
    DIM3 sh = { c, b, a };
    return sh;
}

static __inline__ __device__ DIM4 shape(Ix d, Ix c, Ix b, Ix a)
{
    DIM4 sh = { d, c, b, a };
    return sh;
}

static __inline__ __device__ DIM5 shape(Ix e, Ix d, Ix c, Ix b, Ix a)
{
    DIM5 sh = { e, d, c, b, a };
    return sh;
}
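/*
 * Worked example (illustration only): building a three-dimensional shape and
 * querying it. The outermost dimension comes first, the innermost (fastest
 * varying) dimension last. Guarded out so that it is never compiled.
 */
#if 0
__device__ int shape_example(void)
{
    DIM3 sh = shape(2, 3, 4);     /* 2 planes of 3 rows of 4 columns */
    return dim(sh) + size(sh);    /* dim(sh) == 3, size(sh) == 24    */
}
#endif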
/*
 * Yield the index position in a linear, row-major representation of the
 * array. The first argument is the shape of the array, the second the index.
 */
static __inline__ __device__ Ix toIndex(DIM1 sh, DIM1 ix)
{
    return ix;
}

static __inline__ __device__ Ix toIndex(DIM2 sh, DIM2 ix)
{
    DIM1 sh_ = sh.a1;
    DIM1 ix_ = ix.a1;
    return toIndex(sh_, ix_) * sh.a0 + ix.a0;
}

static __inline__ __device__ Ix toIndex(DIM3 sh, DIM3 ix)
{
    DIM2 sh_ = { sh.a2, sh.a1 };
    DIM2 ix_ = { ix.a2, ix.a1 };
    return toIndex(sh_, ix_) * sh.a0 + ix.a0;
}

static __inline__ __device__ Ix toIndex(DIM4 sh, DIM4 ix)
{
    DIM3 sh_ = { sh.a3, sh.a2, sh.a1 };
    DIM3 ix_ = { ix.a3, ix.a2, ix.a1 };
    return toIndex(sh_, ix_) * sh.a0 + ix.a0;
}

static __inline__ __device__ Ix toIndex(DIM5 sh, DIM5 ix)
{
    DIM4 sh_ = { sh.a4, sh.a3, sh.a2, sh.a1 };
    DIM4 ix_ = { ix.a4, ix.a3, ix.a2, ix.a1 };
    return toIndex(sh_, ix_) * sh.a0 + ix.a0;
}

/*
 * Inverse of `toIndex'
 */
static __inline__ __device__ DIM1 fromIndex(DIM1 sh, Ix ix)
{
    return ix;
}

static __inline__ __device__ DIM2 fromIndex(DIM2 sh, Ix ix)
{
    DIM1 sh_ = shape(sh.a1);
    DIM1 ix_ = fromIndex(sh_, ix / sh.a0);
    return shape(ix_, ix % sh.a0);
}

static __inline__ __device__ DIM3 fromIndex(DIM3 sh, Ix ix)
{
    DIM2 sh_ = shape(sh.a2, sh.a1);
    DIM2 ix_ = fromIndex(sh_, ix / sh.a0);
    return shape(ix_.a1, ix_.a0, ix % sh.a0);
}

static __inline__ __device__ DIM4 fromIndex(DIM4 sh, Ix ix)
{
    DIM3 sh_ = shape(sh.a3, sh.a2, sh.a1);
    DIM3 ix_ = fromIndex(sh_, ix / sh.a0);
    return shape(ix_.a2, ix_.a1, ix_.a0, ix % sh.a0);
}

static __inline__ __device__ DIM5 fromIndex(DIM5 sh, Ix ix)
{
    DIM4 sh_ = shape(sh.a4, sh.a3, sh.a2, sh.a1);
    DIM4 ix_ = fromIndex(sh_, ix / sh.a0);
    return shape(ix_.a3, ix_.a2, ix_.a1, ix_.a0, ix % sh.a0);
}

/*
 * Test for the magic index `ignore'
 */
static __inline__ __device__ int ignore(DIM1 ix) { return ix == -1; }
static __inline__ __device__ int ignore(DIM2 ix) { return ix.a0 == -1 && ix.a1 == -1; }
static __inline__ __device__ int ignore(DIM3 ix) { return ix.a0 == -1 && ix.a1 == -1 && ix.a2 == -1; }
static __inline__ __device__ int ignore(DIM4 ix) { return ix.a0 == -1 && ix.a1 == -1 && ix.a2 == -1 && ix.a3 == -1; }
static __inline__ __device__ int ignore(DIM5 ix) { return ix.a0 == -1 && ix.a1 == -1 && ix.a2 == -1 && ix.a3 == -1 && ix.a4 == -1; }

#else

int dim(Ix sh);
int size(Ix sh);
int shape(Ix sh);
int toIndex(Ix sh, Ix ix);
int fromIndex(Ix sh, Ix ix);

#endif

/* -----------------------------------------------------------------------------
 * Functions
 * -------------------------------------------------------------------------- */

#ifdef __cplusplus

/*
 * Rotate the bits of a word left (towards the most significant bit). The
 * rotate amount is masked to the bit-width of the type.
 */
template <typename T>
static __inline__ __device__ T rotateL(const T x, int32_t i)
{
    return (i &= 8 * sizeof(x) - 1) == 0 ? x : x << i | x >> 8 * sizeof(x) - i;
}

/*
 * Rotate the bits of a word right (towards the least significant bit)
 */
template <typename T>
static __inline__ __device__ T rotateR(const T x, int32_t i)
{
    return (i &= 8 * sizeof(x) - 1) == 0 ? x : x >> i | x << 8 * sizeof(x) - i;
}

/*
 * Integer division, truncated towards negative infinity (Haskell-style `div')
 */
template <typename T>
static __inline__ __device__ T idiv(const T x, const T y)
{
    return x > 0 && y < 0 ? (x - y - 1) / y : (x < 0 && y > 0 ? (x - y + 1) / y : x / y);
}

/*
 * Integer modulus, taking the sign of the divisor (Haskell-style `mod')
 */
template <typename T>
static __inline__ __device__ T mod(const T x, const T y)
{
    const T r = x % y;
    return x > 0 && y < 0 || x < 0 && y > 0 ? (r != 0 ? r + y : 0) : r;
}

#endif
#endif  // __ACCELERATE_CUDA_EXTRAS_H__

// vim: filetype=cuda.c
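/*
 * Worked example (illustration only): the row-major index calculation and its
 * inverse, together with the rounding behaviour of `idiv' and `mod'. Guarded
 * out so that it is never compiled.
 */
#if 0
__device__ void index_example(void)
{
    DIM2 sh  = shape(3, 4);        /* 3 rows of 4 columns                  */
    DIM2 ix  = shape(1, 2);        /* row 1, column 2                      */
    Ix   i   = toIndex(sh, ix);    /* 1 * 4 + 2 == 6                       */
    DIM2 ix_ = fromIndex(sh, i);   /* recovers { 1, 2 }                    */

    int  q   = idiv(-7, 2);        /* -4: rounds towards negative infinity */
    int  r   = mod (-7, 2);        /*  1: takes the sign of the divisor    */
}
#endif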