aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
diff options
context:
space:
mode:
authorGravatar Rasmus Munk Larsen <rmlarsen@google.com>2016-04-14 15:52:58 -0700
committerGravatar Rasmus Munk Larsen <rmlarsen@google.com>2016-04-14 15:52:58 -0700
commitaeb5494a0b2edef3be447cec222e2d178e413389 (patch)
tree83caf412933530b86675c778dead1d9865efac20 /unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
parentd2e95492e712c9aaa659a1bbab54acb573f4f64c (diff)
Improvements to cost model.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h57
1 files changed, 45 insertions, 12 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 366352853..32bc5d0b2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -88,6 +88,13 @@ class TensorOpCost {
compute_cost * compute_cycles_;
}
+ // Drop memory access component. Intended for cases when memory accesses are
+ // sequential or are completely masked by computations.
+ EIGEN_DEVICE_FUNC void dropMemoryCost() {
+ bytes_loaded_ = 0;
+ bytes_stored_ = 0;
+ }
+
// TODO(rmlarsen): Define min in terms of total cost, not elementwise.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
const TensorOpCost& rhs) {
@@ -155,24 +162,50 @@ class TensorOpCost {
template <typename Device>
class TensorCostModel {
public:
- // Costs in device cycles.
- static const int kLoadCycles = 3;
- static const int kStoreCycles = 3;
// Scaling from Eigen compute cost to device cycles.
static const int kDeviceCyclesPerComputeCycle = 1;
- // Implements a simple "binary" policy: Return 1 if total cost is below
- // kMinWorkToParallelize and max_threads otherwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int numThreads(
+ // Costs in device cycles.
+ static const int kStartupCycles = 100000;
+ static const int kPerThreadCycles = 100000;
+ static const int kTaskSize = 40000;
+
+ // Returns the number of threads in [1:max_threads] to use for
+ // evaluating an expression with the given output size and cost per
+ // coefficient.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
- // Compute total cost C in device cycles.
- const double total_cost =
- output_size *
+ double cost = totalCost(output_size, cost_per_coeff);
+ int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+ return numext::mini(max_threads, numext::maxi(1, threads));
+ }
+
+ // taskSize assesses parallel task size.
+ // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+ // granularity needs to be increased to mitigate parallelization overheads.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
+ double output_size, const TensorOpCost& cost_per_coeff) {
+ return totalCost(output_size, cost_per_coeff) / kTaskSize;
+ }
+
+ private:
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
+ double output_size, const TensorOpCost& cost_per_coeff) {
+ // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+ // 11 is L2 cache latency on Haswell.
+ // We don't know whether data is in L1, L2 or L3. But we are most interested
+ // in single-threaded computational time around 100us-10ms (smaller time
+ // is too small for parallelization, larger time is not intersting
+ // either because we are probably using all available threads already).
+ // And for the target time range, L2 seems to be what matters. Data set
+ // fitting into L1 is too small to take noticeable time. Data set fitting
+ // only into L3 presumably will take more than 10ms to load and process.
+ const double kLoadCycles = 1.0 / 64 * 11;
+ const double kStoreCycles = 1.0 / 64 * 11;
+ // Scaling from Eigen compute cost to device cycles.
+ return output_size *
cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
kDeviceCyclesPerComputeCycle);
- // Smallest work unit to parallelize.
- const double kMinParallelCost = 1e6;
- return total_cost < kMinParallelCost ? 1 : max_threads;
}
};