From aeb5494a0b2edef3be447cec222e2d178e413389 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Thu, 14 Apr 2016 15:52:58 -0700
Subject: Improvements to cost model.

---
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 57 +++++++++++++++++-----
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 366352853..32bc5d0b2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -88,6 +88,13 @@ class TensorOpCost {
            compute_cost * compute_cycles_;
   }
 
+  // Drop memory access component. Intended for cases when memory accesses are
+  // sequential or are completely masked by computations.
+  EIGEN_DEVICE_FUNC void dropMemoryCost() {
+    bytes_loaded_ = 0;
+    bytes_stored_ = 0;
+  }
+
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
       const TensorOpCost& rhs) {
@@ -155,24 +162,50 @@ class TensorOpCost {
 template <typename Device>
 class TensorCostModel {
  public:
-  // Costs in device cycles.
-  static const int kLoadCycles = 3;
-  static const int kStoreCycles = 3;
   // Scaling from Eigen compute cost to device cycles.
   static const int kDeviceCyclesPerComputeCycle = 1;
 
-  // Implements a simple "binary" policy: Return 1 if total cost is below
-  // kMinWorkToParallelize and max_threads otherwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int numThreads(
+  // Costs in device cycles.
+  static const int kStartupCycles = 100000;
+  static const int kPerThreadCycles = 100000;
+  static const int kTaskSize = 40000;
+
+  // Returns the number of threads in [1:max_threads] to use for
+  // evaluating an expression with the given output size and cost per
+  // coefficient.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
       double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
-    // Compute total cost C in device cycles.
-    const double total_cost =
-        output_size *
+    double cost = totalCost(output_size, cost_per_coeff);
+    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    return numext::mini(max_threads, numext::maxi(1, threads));
+  }
+
+  // taskSize assesses parallel task size.
+  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+  // granularity needs to be increased to mitigate parallelization overheads.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
+      double output_size, const TensorOpCost& cost_per_coeff) {
+    return totalCost(output_size, cost_per_coeff) / kTaskSize;
+  }
+
+ private:
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
+      double output_size, const TensorOpCost& cost_per_coeff) {
+    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+    // 11 is L2 cache latency on Haswell.
+    // We don't know whether data is in L1, L2 or L3. But we are most interested
+    // in single-threaded computational time around 100us-10ms (smaller time
+    // is too small for parallelization, larger time is not interesting
+    // either because we are probably using all available threads already).
+    // And for the target time range, L2 seems to be what matters. Data set
+    // fitting into L1 is too small to take noticeable time. Data set fitting
+    // only into L3 presumably will take more than 10ms to load and process.
+    const double kLoadCycles = 1.0 / 64 * 11;
+    const double kStoreCycles = 1.0 / 64 * 11;
+    // Scaling from Eigen compute cost to device cycles.
+    return output_size *
         cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                   kDeviceCyclesPerComputeCycle);
-    // Smallest work unit to parallelize.
-    const double kMinParallelCost = 1e6;
-    return total_cost < kMinParallelCost ? 1 : max_threads;
   }
 };
 
-- 
cgit v1.2.3
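
For reference, below is a minimal standalone sketch of the policy this patch introduces. It is not Eigen's actual code: SimpleOpCost and the main() driver are invented for illustration, and std::min/std::max stand in for numext::mini/numext::maxi. It shows how totalCost() charges 11/64 of a cycle per byte moved (one 11-cycle L2 hit per 64-byte cache line) and how numThreads() amortizes a fixed startup cost before granting one extra thread per kPerThreadCycles of remaining work.

// cost_model_sketch.cc -- hypothetical, self-contained illustration.
#include <algorithm>
#include <cstdio>

struct SimpleOpCost {  // stand-in for TensorOpCost
  double bytes_loaded, bytes_stored, compute_cycles;
  // Mirrors TensorOpCost::total_cost(load_cost, store_cost, compute_cost).
  double total_cost(double load_cost, double store_cost,
                    double compute_cost) const {
    return load_cost * bytes_loaded + store_cost * bytes_stored +
           compute_cost * compute_cycles;
  }
};

// Constants as in the patch.
const double kStartupCycles = 100000;
const double kPerThreadCycles = 100000;
const double kLoadCycles = 1.0 / 64 * 11;   // 11-cycle L2 hit / 64-byte line
const double kStoreCycles = 1.0 / 64 * 11;
const double kDeviceCyclesPerComputeCycle = 1;

double totalCost(double output_size, const SimpleOpCost& c) {
  return output_size * c.total_cost(kLoadCycles, kStoreCycles,
                                    kDeviceCyclesPerComputeCycle);
}

int numThreads(double output_size, const SimpleOpCost& c, int max_threads) {
  double cost = totalCost(output_size, c);
  // Amortize the startup cost, then add one thread per kPerThreadCycles of
  // remaining work; the +0.9 rounds up once the fraction exceeds ~0.1.
  int threads =
      static_cast<int>((cost - kStartupCycles) / kPerThreadCycles + 0.9);
  return std::min(max_threads, std::max(1, threads));
}

int main() {
  // 1M coefficients, each loading and storing 8 bytes and costing 1 compute
  // cycle: per-coeff cost = 11/64*8 + 11/64*8 + 1 = 3.75 cycles, so
  // total = 3.75e6 cycles and threads = (3.75e6 - 1e5)/1e5 + 0.9 = 37.4 -> 37,
  // clamped to max_threads.
  SimpleOpCost c{8, 8, 1};
  std::printf("threads = %d\n", numThreads(1e6, c, 16));  // prints 16
}

Unlike the old binary policy removed by this patch (1 thread below a fixed 1e6-cycle threshold, max_threads above it), the thread count now grows smoothly with the total amount of work.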