// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU, don't vectorize for now

// Reducer function that enables multiple cuda threads to safely accumulate at the same
// output address. It basically reads the current value of the output variable, and
// attempts to update it with the new value. If in the meantime another cuda thread
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}

#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>