From a407e022e6046917b1ebeacd54b03fcb079a9706 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 5 Mar 2019 14:19:59 -0800
Subject: Tune tensor contraction threadpool heuristics

---
 .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h    | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index d7cd995fb..adf57c892 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -216,11 +216,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     const int num_worker_threads = this->m_device.numThreadsInPool();
 
     // With small number of threads we want to make sure that we do not reduce
-    // parallelism too much.
-    const int oversharding_factor =
-        num_worker_threads <= 4 ? 8 :
-        num_worker_threads <= 8 ? 4 :
-        num_worker_threads <= 16 ? 2 : 1;
+    // parallelism too much. With a large number of threads we trade maximum
+    // parallelism for better memory locality.
+    const float oversharding_factor =
+        num_worker_threads <= 4  ? 8.0 :
+        num_worker_threads <= 8  ? 4.0 :
+        num_worker_threads <= 16 ? 2.0 :
+        num_worker_threads <= 32 ? 1.0 :
+        num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6;
 
     const bool parallelize_by_sharding_dim_only =
         sharding_dim_tasks >= oversharding_factor * num_worker_threads;
-- 
cgit v1.2.3