// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

namespace Eigen {

/** \class TensorEvaluator
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for evaluating
  * tensor expressions.
  *
  */

// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
class TensorOpCost {
 public:
  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
  // model based on minimal reciprocal throughput numbers from Intel or
  // Agner Fog's tables would be better than what is there now.
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
    return internal::functor_traits<
        internal::scalar_product_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
    return internal::functor_traits<
        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
  }
  template <typename SrcType, typename TargetType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
    return internal::functor_traits<
        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
  }

  EIGEN_DEVICE_FUNC
  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(vectorized ? compute_cycles / packet_size
                                   : compute_cycles) {
    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
    return bytes_loaded_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
    return bytes_stored_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
    return compute_cycles_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
      double load_cost, double store_cost, double compute_cost) const {
    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
           compute_cost * compute_cycles_;
  }

  // Drop memory access component. Intended for cases when memory accesses are
  // sequential or are completely masked by computations.
  EIGEN_DEVICE_FUNC void dropMemoryCost() {
    bytes_loaded_ = 0;
    bytes_stored_ = 0;
  }
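
  // Illustrative sketch (not part of the Eigen API surface): how a
  // per-coefficient cost for a simple expression such as "c = a + b" on
  // floats might be assembled and combined. Costs scale and add elementwise
  // via the operators defined further below.
  //
  //   TensorOpCost cost(/*bytes_loaded=*/2 * sizeof(float),  // read a and b
  //                     /*bytes_stored=*/sizeof(float),      // write c
  //                     /*compute_cycles=*/TensorOpCost::AddCost<float>());
  //   TensorOpCost doubled = 2.0 * cost;      // e.g. two such expressions
  //   TensorOpCost summed  = cost + doubled;  // elementwise accumulation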

  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
      const TensorOpCost& rhs) {
    bytes_loaded_ += rhs.bytes_loaded();
    bytes_stored_ += rhs.bytes_stored();
    compute_cycles_ += rhs.compute_cycles();
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
    bytes_loaded_ *= rhs;
    bytes_stored_ *= rhs;
    compute_cycles_ *= rhs;
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
      TensorOpCost lhs, const TensorOpCost& rhs) {
    lhs += rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      TensorOpCost lhs, double rhs) {
    lhs *= rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      double lhs, TensorOpCost rhs) {
    rhs *= lhs;
    return rhs;
  }

  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
    return os << "[bytes_loaded = " << tc.bytes_loaded()
              << ", bytes_stored = " << tc.bytes_stored()
              << ", compute_cycles = " << tc.compute_cycles() << "]";
  }

 private:
  double bytes_loaded_;
  double bytes_stored_;
  double compute_cycles_;
};

// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of
// threads in [1:max_threads] instead of just switching multi-threading off
// for small work units.
template <typename Device>
class TensorCostModel {
 public:
  // Scaling from Eigen compute cost to device cycles.
  static const int kDeviceCyclesPerComputeCycle = 1;

  // Costs in device cycles.
  static const int kStartupCycles = 100000;
  static const int kPerThreadCycles = 100000;
  static const int kTaskSize = 40000;

  // Returns the number of threads in [1:max_threads] to use for
  // evaluating an expression with the given output size and cost per
  // coefficient.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
      double output_size, const TensorOpCost& cost_per_coeff,
      int max_threads) {
    double cost = totalCost(output_size, cost_per_coeff);
    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
    return numext::mini(max_threads, numext::maxi(1, threads));
  }
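
  // Worked example (illustrative only, using the constants above): a
  // workload with a total cost of 500000 device cycles yields
  //   threads = (500000 - kStartupCycles) / kPerThreadCycles + 0.9
  //           = (500000 - 100000) / 100000 + 0.9 = 4.9,
  // which truncates to 4 threads. The + 0.9 biases the truncation toward
  // rounding up (an extra thread is added once the leftover work exceeds
  // 10% of kPerThreadCycles), and the result is clamped to [1:max_threads].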

  // taskSize assesses parallel task size.
  // A value of 1.0 means ideal parallel task size. Values < 1.0 mean that
  // task granularity needs to be increased to mitigate parallelization
  // overheads.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
      double output_size, const TensorOpCost& cost_per_coeff) {
    return totalCost(output_size, cost_per_coeff) / kTaskSize;
  }

 private:
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
      double output_size, const TensorOpCost& cost_per_coeff) {
    // Cost of memory fetches from L2 cache. 64 is a typical cache line size;
    // 11 is the L2 cache latency on Haswell.
    // We don't know whether the data is in L1, L2 or L3, but we are mainly
    // interested in single-threaded computational times around 100us-10ms
    // (smaller times are too small to be worth parallelizing, and larger
    // times are not interesting either, because we are probably already
    // using all available threads). For that target time range, L2 seems to
    // be what matters: a data set fitting into L1 is too small to take
    // noticeable time, and a data set fitting only into L3 will presumably
    // take more than 10ms to load and process.
    const double kLoadCycles = 1.0 / 64 * 11;
    const double kStoreCycles = 1.0 / 64 * 11;
    // Scaling from Eigen compute cost to device cycles.
    return output_size *
           cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                     kDeviceCyclesPerComputeCycle);
  }
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H