// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H

// evaluator for thread pool device
#ifdef EIGEN_USE_THREADS

namespace Eigen {

#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
namespace internal {

// Arguments for the task that packs a single lhs block.
template <typename LhsScalar, typename LhsMapper, typename Index>
struct packLhsArg {
  LhsScalar* blockA;
  const LhsMapper& lhs;
  const Index m_start;
  const Index k_start;
  const Index mc;
  const Index kc;
};

// Arguments for the task that packs a rhs block and runs the gebp kernels
// that consume it.
template <typename LhsScalar, typename RhsScalar, typename RhsMapper,
          typename OutputMapper, typename Index>
struct packRhsAndKernelArg {
  const MaxSizeVector<LhsScalar*>* blockAs;
  RhsScalar* blockB;
  const RhsMapper& rhs;
  OutputMapper& output;
  const Index m;
  const Index k;
  const Index n;
  const Index mc;
  const Index kc;
  const Index nc;
  const Index num_threads;
  const Index num_blockAs;
  const Index max_m;
  const Index k_block_idx;
  const Index m_block_idx;
  const Index n_block_idx;
  const Index m_blocks;
  const Index n_blocks;
  MaxSizeVector<Notification*>* kernel_notifications;
  const MaxSizeVector<Notification*>* lhs_notifications;
  const bool need_to_pack;
};

}  // end namespace internal
#endif  // EIGEN_USE_SIMPLE_THREAD_POOL

template <typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {

  typedef ThreadPoolDevice Device;

  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
  typedef TensorContractionEvaluatorBase<Self> Base;

  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  enum {
    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
  };

  // Most of the code is assuming that both input tensors are ColMajor. If the
  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
  // will pretend B is LHS and A is RHS.
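  // (The swap relies on the identity C = A * B <=> C^T = B^T * A^T: a
  // RowMajor result matrix has the same memory layout as a ColMajor view of
  // its transpose, so evaluating the swapped operands as ColMajor produces
  // exactly the RowMajor result we want.)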
  typedef typename internal::conditional<
      static_cast<int>(Layout) == static_cast<int>(ColMajor),
      LeftArgType, RightArgType>::type EvalLeftArgType;
  typedef typename internal::conditional<
      static_cast<int>(Layout) == static_cast<int>(ColMajor),
      RightArgType, LeftArgType>::type EvalRightArgType;

  static const int LDims =
      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
  static const int RDims =
      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
  static const int ContractDims = internal::array_size<Indices>::value;

  typedef array<Index, LDims> left_dim_mapper_t;
  typedef array<Index, RDims> right_dim_mapper_t;

  typedef array<Index, ContractDims> contract_t;
  typedef array<Index, LDims - ContractDims> left_nocontract_t;
  typedef array<Index, RDims - ContractDims> right_nocontract_t;

  static const int NumDims = LDims + RDims - 2 * ContractDims;

  typedef DSizes<Index, NumDims> Dimensions;

  // typedefs needed in evalTo
  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;

  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;

  TensorEvaluator(const XprType& op, const Device& device) :
      Base(op, device) {}

#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
            bool rhs_inner_dim_reordered, int Alignment>
  void evalProduct(Scalar* buffer) const {
    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
    typedef internal::TensorContractionInputMapper<
        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
        contract_t, internal::packet_traits<LhsScalar>::size,
        lhs_inner_dim_contiguous, false, Unaligned>
        LhsMapper;
    typedef internal::TensorContractionInputMapper<
        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
        contract_t, internal::packet_traits<RhsScalar>::size,
        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
        RhsMapper;
    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
    typedef internal::gemm_pack_lhs<LhsScalar, Index,
                                    typename LhsMapper::SubMapper, Traits::mr,
                                    Traits::LhsProgress, ColMajor>
        LhsPacker;
    typedef internal::gemm_pack_rhs<
        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
        RhsPacker;
    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
                                  Traits::mr, Traits::nr, false, false>
        GebpKernel;

    const Index m = this->m_i_size;
    const Index n = this->m_j_size;
    const Index k = this->m_k_size;
    if (m == 0 || n == 0 || k == 0) return;

    // Compute a set of algorithm parameters:
    // - kernel block sizes (bm, bn, bk)
    // - task grain sizes (number of kernels executed per task: gm, gn)
    // - number of threads
    // - sharding by row/column
    // - parallel packing or first lhs then rhs
    // and some derived parameters:
    // - number of tasks (nm, nn, nk)
    // - number of kernels (nm0, nn0)
    // Unfortunately, all these parameters are tightly interdependent.
    // So in some cases we first compute approximate values, then compute other
    // values based on these approximations and then refine the approximations.

    // There are lots of heuristics here. There is some reasoning behind them,
    // but ultimately they are just tuned on contraction benchmarks for
    // different input configurations, thread counts and instruction sets.
    // So feel free to question any of them.

    // Compute whether we want to shard by row or by column.
    // This is a first approximation, it will be refined later. Since we don't
    // know the number of threads yet, we use 2: what we are most interested
    // in at this point is whether it makes sense to use parallelization at
    // all or not.
    bool shard_by_col = shardByCol(m, n, 2);

    // First approximation of kernel blocking sizes.
    // Again, we don't know the number of threads yet, so we use 2.
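    // Illustrative example of the parameters computed below (hypothetical
    // numbers, not produced by the library): for m = n = k = 1024 and block
    // sizes bm = bn = bk = 256, there are nm0 = nn0 = nk = divup(1024, 256) =
    // 4 kernels per dimension; grain sizes gm = gn = 2 would then give
    // nm = nn = divup(4, 2) = 2 tasks per dimension, each kernel task
    // covering a 2x2 block of kernels.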
    Index bm, bn, bk;
    if (shard_by_col) {
      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                          internal::ShardByCol>
          blocking(k, m, n, 2);
      bm = blocking.mc();
      bn = blocking.nc();
      bk = blocking.kc();
    } else {
      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                          internal::ShardByRow>
          blocking(k, m, n, 2);
      bm = blocking.mc();
      bn = blocking.nc();
      bk = blocking.kc();
    }

    // Compute optimal number of threads.
    // Note: we use bk instead of k here because we are interested in the
    // amount of _parallelizable_ computations, and computations are not
    // parallelizable across the k dimension.
    const TensorOpCost cost =
        contractionCost(m, n, bm, bn, bk, shard_by_col, false);
    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        static_cast<double>(n) * m, cost, this->m_device.numThreads());

    // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
    // model is not tuned. Remove this when the cost model is tuned.
    if (n == 1) num_threads = 1;

    if (num_threads == 1) {
      // The single-threaded algorithm should be faster in this case.
      if (n == 1)
        this->template evalGemv<lhs_inner_dim_contiguous,
                                rhs_inner_dim_contiguous,
                                rhs_inner_dim_reordered, Alignment>(buffer);
      else
        this->template evalGemm<lhs_inner_dim_contiguous,
                                rhs_inner_dim_contiguous,
                                rhs_inner_dim_reordered, Alignment>(buffer);
      return;
    }

    // Now that we know the number of threads, recalculate sharding and
    // blocking.
    shard_by_col = shardByCol(m, n, num_threads);
    if (shard_by_col) {
      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                          internal::ShardByCol>
          blocking(k, m, n, num_threads);
      bm = blocking.mc();
      bn = blocking.nc();
      bk = blocking.kc();
    } else {
      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                          internal::ShardByRow>
          blocking(k, m, n, num_threads);
      bm = blocking.mc();
      bn = blocking.nc();
      bk = blocking.kc();
    }

    // Number of kernels for each dimension.
    Index nm0 = divup(m, bm);
    Index nn0 = divup(n, bn);
    Index nk = divup(k, bk);

    // Calculate task grain size (number of kernels executed per task).
    // This task size coarsening serves two purposes:
    // 1. It reduces per-task overheads, including synchronization overheads.
    // 2. It allows better cache use (reuse of the same packed rhs in several
    //    consecutive kernels).
    Index gm = 1;
    Index gn = 1;
    // If we are sharding by column, then we prefer to reduce rows first.
    if (shard_by_col) {
      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
    } else {
      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
    }
    // Number of tasks in each dimension.
    Index nm = divup(nm0, gm);
    Index nn = divup(nn0, gn);

    // Last but not least, decide whether we want to issue both lhs and rhs
    // packing in parallel; or issue lhs packing first, and then issue rhs
    // packing when lhs packing completes (for !shard_by_col lhs and rhs are
    // swapped). Parallel packing allows more parallelism (for both packing and
    // kernels), while sequential packing provides better locality (once
    // a thread finishes rhs packing it proceeds to kernels with that rhs).
    // First, we are interested in parallel packing if there are few tasks.
    bool parallel_pack = num_threads >= nm * nn;
    // Also do parallel packing if all data fits into L2$.
    if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
        l2CacheSize() * num_threads)
      parallel_pack = true;
    // But don't do it if we will use each rhs only once. Locality seems to be
    // more important in this case.
    if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
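    // Example of the cache heuristic above (illustrative numbers): for
    // m = n = 512, bk = 256 and float scalars, one slice of packed data is
    // 512 * 256 * 4 + 512 * 256 * 4 bytes = 1MB; with 8 threads and
    // l2CacheSize() == 256KB the budget is 2MB, so parallel packing would be
    // chosen (unless the rhs is used only once).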
    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
                  this->m_i_strides, this->m_left_contracting_strides,
                  this->m_k_strides);
    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
                  this->m_j_strides, this->m_right_contracting_strides,
                  this->m_k_strides);

    Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
            OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
                          k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
                          shard_by_col, parallel_pack)
        .run();
  }

  // Context coordinates a single parallel gemm operation.
  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
            typename LhsMapper, typename RhsMapper, typename OutputMapper>
  class Context {
   public:
    Context(const Device& device, int num_threads, LhsMapper& lhs,
            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk,
            Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
            Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col,
            bool parallel_pack)
        : device_(device),
          lhs_(lhs),
          rhs_(rhs),
          buffer_(buffer),
          output_(buffer, tm),
          num_threads_(num_threads),
          shard_by_col_(shard_by_col),
          parallel_pack_(parallel_pack),
          m_(tm),
          n_(tn),
          k_(tk),
          bm_(bm),
          bn_(bn),
          bk_(bk),
          nm_(nm),
          nn_(nn),
          nk_(nk),
          gm_(gm),
          gn_(gn),
          nm0_(nm0),
          nn0_(nn0) {
      for (Index x = 0; x < P; x++) {
        // The normal number of notifications for a k slice switch is
        // nm_ + nn_ + nm_ * nn_. However, the first P - 1 slices will receive
        // only nm_ + nn_ notifications, because they will not receive
        // notifications from preceding kernels.
        state_switch_[x] =
            x == 0
                ? 1
                : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
                      (x == P - 1 ? nm_ * nn_ : 0);
        state_packing_ready_[x] =
            parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
        state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
        for (Index m = 0; m < nm_; m++) {
          state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
          // Kernels generally receive 3 notifications (previous kernel + 2
          // packing), but the first slice won't get notifications from
          // previous kernels.
          for (Index n = 0; n < nn_; n++)
            state_kernel_[x][m][n].store(
                (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
                std::memory_order_relaxed);
        }
      }

      // Allocate memory for packed rhs/lhs matrices.
      size_t align = numext::maxi<size_t>(EIGEN_MAX_ALIGN_BYTES, 1);
      size_t lhs_size =
          divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
      size_t rhs_size =
          divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
      char* mem = static_cast<char*>(packed_mem_);
      for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
        packed_lhs_[x].resize(nm0_);
        for (Index m = 0; m < nm0_; m++) {
          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
          mem += lhs_size;
        }
        packed_rhs_[x].resize(nn0_);
        for (Index n = 0; n < nn0_; n++) {
          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
          mem += rhs_size;
        }
      }
    }

    ~Context() {
      for (Index x = 0; x < P; x++) {
        for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
        delete[] state_kernel_[x];
      }
      internal::aligned_free(packed_mem_);
    }

    void run() {
      // Kick off packing of the first slice.
      signal_switch(0, 1);
      // Wait for overall completion.
      // TODO(dvyukov): this wait can lead to deadlock.
      // If nthreads contractions are concurrently submitted from worker
      // threads, this wait will block all worker threads and the system will
      // deadlock.
      done_.Wait();
    }
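    // Example of the counters set up above (illustrative sizes): with
    // parallel_pack_ = true, nm_ = 2, nn_ = 3 and P = 3, the initial
    // state_switch_ values are {1, 5, 11}: slice 0 needs only the kick-off
    // signal from run(), the switch to slice 1 waits for the 2 + 3 packing
    // tasks of slice 0, and the switch to slice 2 waits for the 5 packing
    // tasks of slice 1 plus all 2 * 3 kernels of slice 0.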
   private:
    Notification done_;
    const Device& device_;
    LhsMapper& lhs_;
    RhsMapper& rhs_;
    Scalar* const buffer_;
    OutputMapper output_;
    const int num_threads_;
    const bool shard_by_col_;
    const bool parallel_pack_;
    // Matrix sizes.
    const Index m_;
    const Index n_;
    const Index k_;
    // Block sizes.
    const Index bm_;
    const Index bn_;
    const Index bk_;
    // Number of tasks.
    const Index nm_;
    const Index nn_;
    const Index nk_;
    // Task grain sizes (number of kernels executed per task).
    const Index gm_;
    const Index gn_;
    // Number of blocks (this is different from nm_/nn_ because of task size
    // coarsening).
    const Index nm0_;
    const Index nn0_;

    // Parallelization strategy.
    //
    // Blocks related to the same k block can run in parallel because they
    // write to different output blocks. So we parallelize within k slices,
    // this gives us parallelism level of m x n. Before we can start any
    // kernels related to k-th slice, we need to issue m lhs packing tasks and
    // n rhs packing tasks.
    //
    // However, there is a bottleneck when we are finishing kernels for k-th
    // slice (at the very end there is only 1 runnable kernel). To mitigate
    // this bottleneck we allow kernels from k-th and k+1-th slices to run in
    // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
    // output block, so they must not run in parallel.
    //
    // This gives us the following dependency graph.
    // On each k slice we have m x n kernel tasks, m lhs packing tasks and
    // n rhs packing tasks.
    // Kernel (m, n, k) can start when:
    //  - kernel (m, n, k-1) has finished
    //  - lhs packing (m, k) has finished
    //  - rhs packing (n, k) has finished
    // Lhs/rhs packing can start when:
    //  - all k-1 packing has finished (artificially imposed to limit amount of
    //    parallel packing)
    //
    // On top of that we limit runnable tasks to two consecutive k slices.
    // This is done to limit the amount of memory we need for packed lhs/rhs
    // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
    //
    // state_switch_ tracks when we are ready to switch to the next k slice.
    // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
    // These variables roll over 3 consecutive k slices: two that we are
    // actively executing, plus one to track completion of kernels in the
    // second slice.
    static const Index P = 3;
    void* packed_mem_;
    std::vector<LhsScalar*> packed_lhs_[P - 1];
    std::vector<RhsScalar*> packed_rhs_[P - 1];
    std::atomic<uint8_t>** state_kernel_[P];
    // state_switch_ is frequently modified by worker threads, while other
    // fields are read-only after the constructor. Let's move it to a separate
    // cache line to reduce cache-coherency traffic.
    char pad_[128];
    std::atomic<Index> state_packing_ready_[P];
    std::atomic<Index> state_switch_[P];

    void pack_lhs(Index m, Index k) {
      const Index mend = m * gm_ + gm(m);
      for (Index m1 = m * gm_; m1 < mend; m1++)
        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));

      if (!parallel_pack_ && shard_by_col_) {
        signal_packing(k);
      } else {
        signal_switch(k + 1);
        for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
      }
    }
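    // Note on the k % (P - 1) indexing above: only two k slices are ever
    // packed at the same time, so slice k + 2 reuses the buffers of slice k.
    // This is safe because the switch to slice k + 2 waits for all kernels of
    // slice k to complete, and those kernels are the only readers of the
    // buffers being reused.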
    void pack_rhs(Index n, Index k) {
      const Index nend = n * gn_ + gn(n);
      for (Index n1 = n * gn_; n1 < nend; n1++) {
        if (k == 0) {
          // Zero the output memory in parallel.
          // On a 10000x2x10000 mm, zeroing can easily take half of the time.
          // Zero (bn x m) row. Safe to do here because all kernels that will
          // write to this memory depend on completion of this task.
          // Note: don't call device_.memset() here. device_.memset() blocks on
          // a thread pool worker thread, which can lead to underutilization
          // and deadlocks.
          memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
        }
        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
      }

      if (parallel_pack_ || shard_by_col_) {
        signal_switch(k + 1);
        for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
      } else {
        signal_packing(k);
      }
    }

    void kernel(Index m, Index n, Index k) {
      // Note: order of iteration matters here. Iteration over m is innermost
      // because we want to reuse the same packed rhs in consecutive tasks
      // (rhs fits into L2$ while lhs only into L3$).
      const Index nend = n * gn_ + gn(n);
      const Index mend = m * gm_ + gm(m);
      if (shard_by_col_) {
        for (Index n1 = n * gn_; n1 < nend; n1++) {
          for (Index m1 = m * gm_; m1 < mend; m1++)
            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
                         packed_lhs_[k % (P - 1)][m1],
                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
                         Scalar(1), -1, -1, 0, 0);
        }
      } else {
        for (Index m1 = m * gm_; m1 < mend; m1++)
          for (Index n1 = n * gn_; n1 < nend; n1++) {
            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
                         packed_lhs_[k % (P - 1)][m1],
                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
                         Scalar(1), -1, -1, 0, 0);
          }
      }
      signal_kernel(m, n, k + 1, false);
      signal_switch(k + 2);
    }

    void signal_packing(Index k) {
      eigen_assert(!parallel_pack_);
      Index s = state_packing_ready_[k % P].fetch_sub(1);
      eigen_assert(s > 0);
      if (s != 1) return;
      state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
      enqueue_packing(k, shard_by_col_);
    }

    void signal_kernel(Index m, Index n, Index k, bool sync) {
      std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
      Index s = state->load();
      eigen_assert(s > 0);
      if (s != 1 && state->fetch_sub(1) != 1) return;
      state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
      if (sync)
        kernel(m, n, k);
      else
        device_.enqueueNoNotification([=]() { kernel(m, n, k); });
    }

    void signal_switch(Index k, Index v = 1) {
      Index s = state_switch_[k % P].fetch_sub(v);
      eigen_assert(s >= v);
      if (s != v) return;

      // Ready to switch to the next k slice.
      // Reset counter for the next iteration.
      state_switch_[k % P] =
          (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
          nm_ * nn_;
      if (k < nk_) {
        // Issue lhs/rhs packing. Their completion will in turn kick off
        // kernels.
        if (parallel_pack_) {
          enqueue_packing(k, !shard_by_col_);
          enqueue_packing(k, shard_by_col_);
        } else if (shard_by_col_) {
          enqueue_packing(k, false);
        } else {
          enqueue_packing(k, true);
        }

        // Termination handling.
        // Because kernel completion signals the switch to slice k + 2, we
        // need to finish nk + 2 slices without issuing any tasks on the
        // nk + 1 slice. So here we pretend that all nk + 1 packing tasks
        // finish instantly, so that the nk + 2 switch only waits for
        // completion of the nk kernels.
      } else if (k == nk_) {
        signal_switch(k + 1,
                      parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
      } else {
        done_.Notify();
      }
    }

    // Enqueue all rhs/lhs packing for k-th slice.
    void enqueue_packing(Index k, bool rhs) {
      enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
    }

    void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
      if (end - start == 1) {
        if (rhs)
          pack_rhs(start, k);
        else
          pack_lhs(start, k);
      } else {
        Index mid = (start + end) / 2;
        device_.enqueueNoNotification(
            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
        device_.enqueueNoNotification(
            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
      }
    }
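    // Worked example for the helpers below (illustrative sizes): with
    // m_ = 100 and bm_ = 32 there are nm0_ = 4 row blocks; blocks 0..2 have
    // the full height of 32 rows, and the last one has
    // bm(3) = 100 + 32 - 32 * 4 = 4 rows.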
    // Block sizes with accounting for potentially incomplete last block.
    Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
    Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
    Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
    // Task grain sizes accounting for potentially incomplete last task.
    Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
    Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }

    Context(const Context&) = delete;
    void operator=(const Context&) = delete;
  };

  // Decide whether we want to shard m x n contraction by columns or by rows.
  static bool shardByCol(Index m, Index n, Index num_threads) {
    // Note: we are comparing both n and m against Traits::nr, it is not
    // a mistake. We are trying to figure out how both n and m will fit into
    // the main sharding dimension.

    // Sharding by column is the default
    // ... unless there is enough data for vectorization over rows
    if (m / num_threads >= Traits::nr &&
        // and not enough data for vectorization over columns
        (n / num_threads < Traits::nr ||
         // ... or barely enough data for vectorization over columns,
         // but it is not evenly divisible across threads
         (n / num_threads < 4 * Traits::nr &&
          (n % (num_threads * Traits::nr)) != 0 &&
          // ... and it is evenly divisible across threads for rows
          ((m % (num_threads * Traits::nr)) == 0 ||
           // .. or it is not evenly divisible for both dimensions but
           // there is much more data over rows so that corner effects are
           // mitigated.
           (m / n >= 6)))))
      return false;
    // Also shard by row if the matrix is strongly elongated over rows
    // (m much larger than n).
    if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
    return true;
  }

  Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
                 int num_threads, bool shard_by_col) const {
    Index gm = 1;
    Index gm1 = 1;
    Index nm0 = divup(m, bm);
    Index nm1 = nm0;
    for (;;) {
      // Find the next candidate for m grain size. It needs to result in a
      // different number of blocks. E.g. if we have 10 kernels, we want to
      // try 5 and 10, but not 6, 7, 8 and 9.
      while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
      if (gm1 > nm0) break;
      // Check the candidate.
      int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
                           shard_by_col);
      if (res < 0) break;
      nm1 = divup(nm0, gm1);
      if (res == 0) continue;
      // Commit new grain size.
      gm = gm1;
    }
    return gm;
  }

  Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
                 int num_threads, bool shard_by_col) const {
    Index gn = 1;
    Index gn1 = 1;
    Index nn0 = divup(n, bn);
    Index nn1 = nn0;
    for (;;) {
      while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
      if (gn1 > nn0) break;
      int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
                           shard_by_col);
      if (res < 0) break;
      nn1 = divup(nn0, gn1);
      if (res == 0) continue;
      gn = gn1;
    }
    return gn;
  }
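  // Example of the candidate search in coarsenM/coarsenN above (illustrative
  // nm0 = 10): divup(10, gm1) for gm1 = 1..10 yields 10, 5, 4, 3, 2, 2, 2, 2,
  // 2, 1, so only the candidates gm1 = 2, 3, 4, 5 and 10 are passed to
  // checkGrain; gm1 = 6..9 are skipped because they produce the same 2-task
  // split as gm1 = 5.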
  // checkGrain checks whether grain (gm, gn) is suitable and is better than
  // (oldgm, oldgn).
  int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
                 Index gn, Index oldgm, Index oldgn, int num_threads,
                 bool shard_by_col) const {
    const TensorOpCost cost =
        contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
    double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
        static_cast<double>(bm) * gm * bn * gn, cost);
    // If the task is too small, then we agree on it regardless of anything
    // else. Otherwise synchronization overheads will dominate.
    if (taskSize < 1) return 1;
    // If it is too large, then we reject it and all larger tasks.
    if (taskSize > 2) return -1;
    // Now we are in the presumably good task size range.
    // The main deciding factor here is parallelism. Consider that we have 12
    // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
    // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most
    // 3/4 of cores will be busy). While grain size 3 gives us 4 tasks, which
    // gives us parallelism of 1 (we can load all cores).
    Index nm0 = divup(m, bm);
    Index nn0 = divup(n, bn);
    Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
    double new_parallelism =
        static_cast<double>(new_tasks) /
        (divup(new_tasks, num_threads) * num_threads);
    Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
    double old_parallelism =
        static_cast<double>(old_tasks) /
        (divup(old_tasks, num_threads) * num_threads);
    if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
    return 0;
  }

#else  // EIGEN_USE_SIMPLE_THREAD_POOL

  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
            bool rhs_inner_dim_reordered, int Alignment>
  void evalProduct(Scalar* buffer) const {
    if (this->m_j_size == 1) {
      this->template evalGemv<lhs_inner_dim_contiguous,
                              rhs_inner_dim_contiguous,
                              rhs_inner_dim_reordered, Alignment>(buffer);
      return;
    }

    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
             rhs_inner_dim_reordered, Alignment>(buffer);
  }

  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
            bool rhs_inner_dim_reordered, int Alignment>
  void evalGemm(Scalar* buffer) const {
    // columns in left side, rows in right side
    const Index k = this->m_k_size;

    // rows in left side
    const Index m = this->m_i_size;

    // columns in right side
    const Index n = this->m_j_size;

    // zero out the result buffer (which must be of size at least
    // m * n * sizeof(Scalar))
    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));

    const int lhs_packet_size =
        internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
    const int rhs_packet_size =
        internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;

    typedef internal::TensorContractionInputMapper<
        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
        contract_t, lhs_packet_size, lhs_inner_dim_contiguous, false,
        Unaligned> LhsMapper;

    typedef internal::TensorContractionInputMapper<
        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
        contract_t, rhs_packet_size, rhs_inner_dim_contiguous,
        rhs_inner_dim_reordered, Unaligned> RhsMapper;

    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;

    // TODO: packing could be faster sometimes if we supported row major
    // tensor mappers
    typedef internal::gemm_pack_lhs<LhsScalar, Index,
                                    typename LhsMapper::SubMapper, Traits::mr,
                                    Traits::LhsProgress, ColMajor> LhsPacker;
    typedef internal::gemm_pack_rhs<RhsScalar, Index,
                                    typename RhsMapper::SubMapper, Traits::nr,
                                    ColMajor> RhsPacker;

    // TODO: replace false, false with conjugate values?
    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
                                  Traits::mr, Traits::nr, false, false>
        GebpKernel;

    typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
    typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper,
                                          OutputMapper, Index> packRKArg;

    // initialize data mappers
    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
                  this->m_i_strides, this->m_left_contracting_strides,
                  this->m_k_strides);

    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
                  this->m_j_strides, this->m_right_contracting_strides,
                  this->m_k_strides);

    OutputMapper output(buffer, m);

    // compute block sizes (which depend on number of threads)
    const Index num_threads = this->m_device.numThreads();
    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                        internal::ShardByCol>
        blocking(k, m, n, num_threads);
    Index mc = blocking.mc();
    Index nc = blocking.nc();
    Index kc = blocking.kc();
    eigen_assert(mc <= m);
    eigen_assert(nc <= n);
    eigen_assert(kc <= k);

#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
    const Index k_blocks = CEIL_DIV(k, kc);
    const Index n_blocks = CEIL_DIV(n, nc);
    const Index m_blocks = CEIL_DIV(m, mc);
    const Index sizeA = mc * kc;
    const Index sizeB = kc * nc;

    /* cout << "m: " << m << " n: " << n << " k: " << k << endl;
       cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
       cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks
            << " k_blocks: " << k_blocks << endl;
       cout << "num threads: " << num_threads << endl;
    */
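    // Example of the block-count arithmetic above (illustrative sizes): for
    // m = n = k = 1000 and mc = nc = kc = 256, CEIL_DIV yields
    // m_blocks = n_blocks = k_blocks = 4, and each packed lhs block occupies
    // sizeA * sizeof(LhsScalar) = 256 * 256 * sizeof(LhsScalar) bytes.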
    // note: m_device.allocate should return 16 byte aligned pointers, but if
    //       blockA and blockB aren't 16 byte aligned, segfaults will happen
    //       due to SIMD instructions
    // note: You can get away with allocating just a single blockA and offsets
    //       and meet the alignment requirements with the assumption that
    //       (Traits::mr * sizeof(ResScalar)) % 16 == 0
    const Index numBlockAs = numext::mini(num_threads, m_blocks);
    MaxSizeVector<LhsScalar *> blockAs(num_threads);
    for (int i = 0; i < num_threads; i++) {
      blockAs.push_back(static_cast<LhsScalar *>(
          this->m_device.allocate(sizeA * sizeof(LhsScalar))));
    }

    // To circumvent alignment issues, I'm just going to separately allocate
    // the memory for each thread
    // TODO: is this too much memory to allocate? This simplifies coding a
    //       lot, but is wasteful.
    //       Other options: (1) reuse memory when a thread finishes. con: tricky
    //                      (2) allocate block B memory in each thread. con: overhead
    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
    for (int i = 0; i < n_blocks; i++) {
      blockBs.push_back(static_cast<RhsScalar *>(
          this->m_device.allocate(sizeB * sizeof(RhsScalar))));
    }

    // lhs_notifications starts with all null Notifications
    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);

    // this should really be numBlockAs * n_blocks;
    const Index num_kernel_notifications = num_threads * n_blocks;
    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
                                                      nullptr);

    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
      const Index k_start = k_block_idx * kc;
      // make sure we don't overshoot right edge of left matrix
      const Index actual_kc = numext::mini(k_start + kc, k) - k_start;

      for (Index m_block_idx = 0; m_block_idx < m_blocks;
           m_block_idx += numBlockAs) {
        const Index num_blocks = numext::mini(m_blocks - m_block_idx, numBlockAs);

        for (Index mt_block_idx = m_block_idx;
             mt_block_idx < m_block_idx + num_blocks; mt_block_idx++) {
          const Index m_start = mt_block_idx * mc;
          const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
          eigen_assert(actual_mc > 0);

          Index blockAId =
              (k_block_idx * m_blocks + mt_block_idx) % num_threads;

          for (int i = 0; i < n_blocks; ++i) {
            Index notification_id = (blockAId * n_blocks + i);
            // Wait for any current kernels using this slot to complete
            // before using it.
            if (kernel_notifications[notification_id]) {
              wait_until_ready(kernel_notifications[notification_id]);
              delete kernel_notifications[notification_id];
            }
            kernel_notifications[notification_id] = new Notification();
          }
          const packLArg arg = {
            blockAs[blockAId],  // blockA
            lhs,                // lhs
            m_start,            // m
            k_start,            // k
            actual_mc,          // mc
            actual_kc,          // kc
          };

          // Delete any existing notification since we may be
          // replacing it. The algorithm should ensure that there are
          // no existing waiters on this notification.
          delete lhs_notifications[blockAId];
          lhs_notifications[blockAId] =
              this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
        }
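        // Example of the round-robin slot assignment above (illustrative
        // m_blocks = 5, num_threads = 4): for k_block_idx = 0 the lhs blocks
        // 0..4 map to slots 0, 1, 2, 3, 0, so the second pass reuses slot 0
        // and must first wait for the kernels still reading it.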
        // now start kernels.
        const Index m_base_start = m_block_idx * mc;
        const bool need_to_pack = m_block_idx == 0;

        for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
          const Index n_start = n_block_idx * nc;
          const Index actual_nc = numext::mini(n_start + nc, n) - n_start;

          // first make sure the previous kernels are all done before
          // overwriting rhs. Also wait if we're going to start a new k.
          // In both cases need_to_pack is true.
          if (need_to_pack) {
            for (Index i = num_blocks; i < num_threads; ++i) {
              Index blockAId =
                  (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
              Index future_id = (blockAId * n_blocks + n_block_idx);
              wait_until_ready(kernel_notifications[future_id]);
            }
          }

          packRKArg arg = {
            &blockAs,               // blockA
            blockBs[n_block_idx],   // blockB
            rhs,                    // rhs
            output,                 // output
            m_base_start,           // m
            k_start,                // k
            n_start,                // n
            mc,                     // mc
            actual_kc,              // kc
            actual_nc,              // nc
            num_threads,
            numBlockAs,
            m,
            k_block_idx,
            m_block_idx,
            n_block_idx,            // n_block_idx
            m_blocks,               // m_blocks
            n_blocks,               // n_blocks
            &kernel_notifications,  // kernel notifications
            &lhs_notifications,     // lhs notifications
            need_to_pack,           // need_to_pack
          };

          // We asynchronously kick off this function, which ends up
          // notifying the appropriate kernel_notifications objects,
          // which this thread waits on before exiting.
          this->m_device.enqueueNoNotification(
              &Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
        }
      }
    }

    // Make sure all the kernels are done.
    for (size_t i = 0; i < kernel_notifications.size(); ++i) {
      wait_until_ready(kernel_notifications[i]);
      delete kernel_notifications[i];
    }

    // No need to wait for lhs notifications since they should have
    // already been waited on. Just clean them up.
    for (size_t i = 0; i < lhs_notifications.size(); ++i) {
      delete lhs_notifications[i];
    }

    // deallocate all of the memory for both A and B's
    for (size_t i = 0; i < blockAs.size(); i++) {
      this->m_device.deallocate(blockAs[i]);
    }
    for (size_t i = 0; i < blockBs.size(); i++) {
      this->m_device.deallocate(blockBs[i]);
    }

#undef CEIL_DIV
  }

  /*
   * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
   * the LHS block, check that all of the kernels that worked on the same
   * mt_block_idx in the previous m_block are done.
   */
  template <typename packLArg, typename LhsPacker>
  static void packLhs(const packLArg arg) {
    // perform actual packing
    LhsPacker pack_lhs;
    pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start),
             arg.kc, arg.mc);
  }
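  // Example of the notification grid used below (illustrative num_threads = 4,
  // n_blocks = 3): kernel_notifications is a 4 x 3 array indexed by
  // set_idx = blockAId * n_blocks + n_block_idx, so the kernel consuming lhs
  // slot 2 against rhs block 1 signals entry 2 * 3 + 1 = 7.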
  /*
   * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
   * all kernels in the previous block are done.
   * Then for each LHS future, we wait on the future and then call GEBP
   * on the area packed by the future (which starts at
   * blockA + future_idx * mt * kc) on the LHS and with the full packed
   * RHS block.
   * The output of this GEBP is written to output(m + i * mt, n).
   */
  template <typename packRKArg, typename RhsPacker, typename GebpKernel>
  static void packRhsAndKernel(packRKArg arg) {
    if (arg.need_to_pack) {
      RhsPacker pack_rhs;
      pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
    }

    GebpKernel gebp;
    for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs;
         mt_block_idx++) {
      const Index m_base_start = arg.m + arg.mc * mt_block_idx;
      if (m_base_start < arg.max_m) {
        Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx +
                          arg.m_block_idx) % arg.num_threads;
        wait_until_ready((*arg.lhs_notifications)[blockAId]);
        const Index actual_mc =
            numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
        gebp(arg.output.getSubMapper(m_base_start, arg.n),
             (*arg.blockAs)[blockAId], arg.blockB, actual_mc, arg.kc, arg.nc,
             Scalar(1), -1, -1, 0, 0);

        // Notify that the kernel is done.
        const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
        (*arg.kernel_notifications)[set_idx]->Notify();
      }
    }
  }
#endif  // EIGEN_USE_SIMPLE_THREAD_POOL

  TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
                               bool shard_by_col, bool prepacked) const {
    const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
                                          PacketType<RhsScalar, Device>::size);
    const int output_packet_size =
        internal::unpacket_traits<PacketReturnType>::size;
    const double kd = static_cast<double>(bk);
    // Peak VFMA bandwidth is 0.5. However, if we do not have enough data for
    // vectorization, the bandwidth drops. The 4.0 and 2.0 values were
    // determined experimentally.
    double computeBandwidth =
        bk == 1 ? 4.0
                : (shard_by_col ? bn : bm) < Traits::nr ||
                          (shard_by_col ? bm : bn) < Traits::mr
                      ? 2.0
                      : 0.5;
#ifndef EIGEN_VECTORIZE_FMA
    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
    // However for MULPS/ADDPS we have a dependent sequence of 2 such
    // instructions, so the overall bandwidth is 1.0.
    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
#endif
    // Computations.
    TensorOpCost cost =
        TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
    // Output stores.
    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true,
                         output_packet_size);
    if (prepacked) {
      // Packing and kernels are executed in different tasks. When we
      // calculate task grain size we look only at the kernel cost, assuming
      // that the kernel is more expensive than packing.
      return cost;
    }
    // Lhs/rhs loads + computations.
    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
    // Lhs packing memory cost does not contribute considerably to overall
    // execution time because lhs is prefetched early and accessed
    // sequentially.
    if (shard_by_col)
      lhsCost.dropMemoryCost();
    else
      rhsCost.dropMemoryCost();
    return cost + lhsCost + rhsCost;
  }
};

} // end namespace Eigen

#endif  // EIGEN_USE_THREADS
#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
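// Usage sketch (not part of the header): the evaluator above is selected
// automatically when a contraction is assigned through a ThreadPoolDevice,
// assuming EIGEN_USE_THREADS is defined before including the Tensor module.
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::ThreadPool pool(8);
//   Eigen::ThreadPoolDevice device(&pool, 8);
//   Eigen::Tensor<float, 2> a(1024, 512), b(512, 256), c(1024, 256);
//   a.setRandom(); b.setRandom();
//   Eigen::array<Eigen::IndexPair<int>, 1> dims = {
//       {Eigen::IndexPair<int>(1, 0)}};  // contract a's dim 1 with b's dim 0
//   c.device(device) = a.contract(b, dims);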