From 8491127082e5f6568983255a459ca737271aaf3f Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 4 Feb 2019 12:59:33 -0800
Subject: Do not reduce parallelism too much in contractions with small number
 of threads

---
 .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h  | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 4932514c7..4af8d3b18 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -339,10 +339,19 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       // If there is enough available parallelism in sharding dimension we can
       // call kernels in sync mode and use thread local memory for packed data.
       const Index sharding_dim_tasks = shard_by_col ? nn : nm;
-      if (!parallel_pack_ && sharding_dim_tasks >= device_.numThreadsInPool()) {
-        parallelize_by_sharding_dim_only_ = true;
 
-        int num_worker_threads = device_.numThreadsInPool();
+      const int num_worker_threads = device_.numThreadsInPool();
+
+      // With a small number of threads we want to make sure that we do not
+      // reduce parallelism too much.
+      const int oversharding_factor =
+          num_worker_threads <= 4  ? 8 :
+          num_worker_threads <= 8  ? 4 :
+          num_worker_threads <= 16 ? 2 : 1;
+
+      if (!parallel_pack_ &&
+          sharding_dim_tasks >= oversharding_factor * num_worker_threads) {
+        parallelize_by_sharding_dim_only_ = true;
 
         if (shard_by_col) {
           can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
-- 
cgit v1.2.3