From 039ee521250eab33e9f7aadc5ba2baef9661673c Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 12 Apr 2019 13:35:10 -0700
Subject: Tweak cost model for tensor contraction when parallelizing over the
 inner dimension.

https://bitbucket.org/snippets/rmlarsen/MexxLo
---
 unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index caa8d1767..500f63e60 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -1169,7 +1169,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
     // Compute cost.
     const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
-    TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n);
+    TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
     // Output stores.
     cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
     TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
@@ -1192,8 +1192,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     int num_threads = 1;
     double min_cost = total_parallel_cost;
     double kPerThreadOverHead = 4000;
-    double kFixedOverHead = 100000;
-    for (int nt = 2; nt <= this->m_device.numThreads(); nt++) {
+    double kFixedOverHead = 50000;
+    for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
       double sequential_cost =
           kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
       double parallel_cost = total_parallel_cost / nt + sequential_cost;
-- 
cgit v1.2.3