// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialisation of the reduction operation for the SYCL
 *  backend.
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template <typename CoeffReturnType> struct syclGenericBufferReducer {
  template <typename BufferTOut, typename BufferTIn>
  static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local) {
    do {
      auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
        cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                                cl::sycl::range<1>{std::min(length, local)}};
        /* Two accessors are used: one to the buffer that is being reduced,
         * and a second to local memory, used to store intermediate data. */
        auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h);
        auto aOut = bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
        cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                           cl::sycl::access::target::local>
            scratch(cl::sycl::range<1>(local), h);

        /* The parallel_for invocation chosen is the variant with an nd_item
         * parameter, since the code requires barriers for correctness. */
        h.parallel_for(r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
          size_t globalid = id.get_global(0);
          size_t localid = id.get_local(0);
          /* All threads collectively read from global memory into local.
           * The barrier ensures all threads' IO is resolved before
           * execution continues (strictly speaking, all threads within
           * a single work-group - there is no co-ordination between
           * work-groups, only work-items). */
          if (globalid < length) {
            scratch[localid] = aI[globalid];
          }
          id.barrier(cl::sycl::access::fence_space::local_space);

          /* Apply the reduction operation between the current local
           * id and the one on the other half of the vector. */
          if (globalid < length) {
            int min = (length < local) ? length : local;
            for (size_t offset = min / 2; offset > 0; offset /= 2) {
              if (localid < offset) {
                scratch[localid] += scratch[localid + offset];
              }
              id.barrier(cl::sycl::access::fence_space::local_space);
            }
            /* The final result will be stored in local id 0. */
            if (localid == 0) {
              aI[id.get_group(0)] = scratch[localid];
              if ((length <= local) && globalid == 0) {
                aOut[globalid] = scratch[localid];
              }
            }
          }
        });
      };
      dev.m_queue.submit(f);
      dev.m_queue.throw_asynchronous();

      /* At this point, you could queue::wait_and_throw() to ensure that
       * errors are caught quickly. However, this would likely impact
       * performance negatively. */
      length = length / local;
    } while (length > 1);
  }
};
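
/// Illustrative sketch (not part of Eigen; the helper name is hypothetical, and the standard
/// vector and cstddef headers are assumed): a host-side analogue of the work-group tree
/// reduction performed by the kernel above. Each pass halves the number of active elements
/// until element 0 holds the reduction of the whole block; on the device the inner loop is
/// distributed over the work-items of a group and a local barrier separates the passes.
/// \code
///   template <typename T>
///   T treeReduceSketch(std::vector<T> scratch) {  // scratch.size() assumed to be a power of two
///     for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
///       for (std::size_t localid = 0; localid < offset; ++localid)  // one work-item per index on the device
///         scratch[localid] += scratch[localid + offset];
///       // the kernel needs id.barrier(cl::sycl::access::fence_space::local_space) here
///     }
///     return scratch[0];
///   }
/// \endcode
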
/// For now let's start with a full reducer.
/// Self is not used here because, during expression construction, the reduction is treated as a leaf node.
/// We take the child of the reduction, build an expression from it and apply the full reducer function to it.
/// FullReducer applies the reduction operation to the child of the reduction. Once this is done the reduction is an
/// empty shell that can be thrown away and treated as a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr;  /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256;  /// initial reduction factor. If the input size is less than red_factor only one thread is created.
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor;  // initial number of threads: each thread reduces red_factor consecutive inputs
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // round the global range up to the next power of two for the recursive reduction
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
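    // Illustrative sketch (not part of Eigen): the bit trick above rounds GRange up to the next
    // power of two, e.g. 1000 -> 1024 and 1024 -> 1024. Written as a self-contained helper (the
    // name is hypothetical) it would look like this, assuming v >= 1 and a 64-bit size_t as on
    // the platforms listed in the #if above:
    //
    //   inline std::size_t roundUpToPowerOfTwoSketch(std::size_t v) {
    //     v--;                              // so that exact powers of two map to themselves
    //     v |= v >> 1;  v |= v >> 2;  v |= v >> 4;
    //     v |= v >> 8;  v |= v >> 16; v |= v >> 32;
    //     return ++v;                       // all bits below the top set bit are now 1
    //   }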
    size_t outTileSize = tileSize;
    /// if GRange is smaller than the work-group tile size, shrink the tile to GRange; in that case a single
    /// work-group is enough and one recursion step reduces everything to one element.
    if (GRange < outTileSize) outTileSize = GRange;
    // get the final output buffer; at the moment the created buffer is used directly because there is no need for an assign
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    /// create the temporary global memory used for calculating the reduction.
    /// It collects the per-work-group partial results, since we don't have a global barrier on the GPU. Once they are saved we can
    /// recursively apply the reduction to this buffer in order to reduce the whole input.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
        /// the first is when it is used as a root to launch the sub-kernel, the second is when it is treated as a leaf node that passes its
        /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is equivalent to the self that has been passed to the run function, except that
        /// the device evaluator is detectable and recognisable on the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// const_cast added as a naive solution to the qualifier-drop error
        auto globalid = itemID.get_global_linear_id();
        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        if (remaining != 0 && globalid == 0)
          // this adds the rest of the input buffer when the input size is not divisible by red_factor.
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    /// Recursively reduce the temporary buffer down to a single element.
    syclGenericBufferReducer<CoeffReturnType>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }
};
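
/// Illustrative sketch (not part of Eigen; the helper name is hypothetical, and the standard
/// vector and algorithm headers are assumed): the two-stage strategy FullReducer uses above,
/// shown for a plain sum on the host. Stage one reduces fixed-size chunks of red_factor inputs
/// into a temporary buffer, with "work-item" 0 also picking up the remainder; stage two
/// repeatedly folds that buffer until one element remains, which is what
/// syclGenericBufferReducer does on the device with its work-group tree reduction.
/// \code
///   inline float fullReduceSketch(const std::vector<float>& in, std::size_t red_factor = 256) {
///     std::size_t rng = std::max<std::size_t>(1, in.size() / red_factor);
///     std::vector<float> tmp(rng, 0.f);
///     for (std::size_t g = 0; g < rng; ++g)        // stage one: one "work-item" per chunk
///       for (std::size_t i = g * red_factor; i < std::min(in.size(), (g + 1) * red_factor); ++i)
///         tmp[g] += in[i];
///     for (std::size_t i = rng * red_factor; i < in.size(); ++i)
///       tmp[0] += in[i];                           // remainder handled by work-item 0
///     while (tmp.size() > 1) {                     // stage two: fold pairwise until one element remains
///       std::vector<float> next((tmp.size() + 1) / 2, 0.f);
///       for (std::size_t i = 0; i < tmp.size(); ++i) next[i / 2] += tmp[i];
///       tmp.swap(next);
///     }
///     return tmp[0];
///   }
/// \endcode
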
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr;  /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      // round GRange up to the next multiple of tileSize so that the nd_range is valid
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
        /// the first is when it is used as a root to launch the sub-kernel, the second is when it is treated as a leaf node that passes its
        /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is equivalent to the self that has been passed to the run function, except that
        /// the device evaluator is detectable and recognisable on the device.
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// const_cast added as a naive solution to the qualifier-drop error
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          output_accessor.get_pointer()[globalid] = functor.finalize(accum);
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};

} // end namespace internal
} // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP