From 039ee521250eab33e9f7aadc5ba2baef9661673c Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Fri, 12 Apr 2019 13:35:10 -0700
Subject: Tweak cost model for tensor contraction when parallelizing over the inner dimension.

https://bitbucket.org/snippets/rmlarsen/MexxLo
---
 unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index caa8d1767..500f63e60 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -1169,7 +1169,7 @@ struct TensorEvaluator::size;
- TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n);
+ TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
 // Output stores.
 cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
 TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
@@ -1192,8 +1192,8 @@ struct TensorEvaluatorm_device.numThreads(); nt++) {
+ double kFixedOverHead = 50000;
+ for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
 double sequential_cost = kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
 double parallel_cost = total_parallel_cost / nt + sequential_cost;
-- 
cgit v1.2.3