From 32df1b1046967e6c52e087408cb3fa9e8f3746e6 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Mon, 14 Nov 2016 14:18:16 -0800
Subject: Reduce dispatch overhead in parallelFor by only calling
 thread_pool.Schedule() for one of the two recursive calls in handleRange.
 This avoids going through the schedule path to push both recursive calls
 onto another thread queue in the binary tree; instead, one of them is
 executed on the main thread. At the leaf level this will still activate a
 full complement of threads, but will save up to 50% of the overhead in
 Schedule (random number generation, insertion into the queue, which
 includes signaling via atomics).

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 069680a11..210ae1368 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -256,7 +256,7 @@ struct ThreadPoolDevice {
       // Split into halves and submit to the pool.
       Index mid = first + divup((last - first) / 2, block_size) * block_size;
       pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
-      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+      handleRange(first, mid);
     };
     handleRange(0, n);
     barrier.Wait();
-- 
cgit v1.2.3
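
For context, below is a rough standalone sketch of the pattern the patch switches to: each split hands one half of the range to another thread while the current thread recurses into the other half, so every split pays for only one dispatch instead of two. This is not Eigen's actual implementation; the name parallel_for, the use of std::async as a stand-in for the pool's Schedule(), and the block_size value are illustrative assumptions only.

    #include <algorithm>
    #include <cstdio>
    #include <functional>
    #include <future>
    #include <vector>

    // Recursively split [first, last) in half. One half is offloaded to
    // another thread (std::async stands in for a thread pool's Schedule());
    // the other half is processed by recursing on the current thread, so
    // each split incurs only one scheduling call.
    void parallel_for(long first, long last, long block_size,
                      const std::function<void(long, long)>& f) {
      if (last - first <= block_size) {
        f(first, last);  // Small enough: run the block directly.
        return;
      }
      long mid = first + (last - first) / 2;
      auto right = std::async(std::launch::async,
                              [&] { parallel_for(mid, last, block_size, f); });
      parallel_for(first, mid, block_size, f);  // Stay on the calling thread.
      right.wait();  // Join the offloaded half before returning.
    }

    int main() {
      std::vector<int> data(1000, 1);
      parallel_for(0, static_cast<long>(data.size()), 128,
                   [&](long i, long j) {
                     std::fill(data.begin() + i, data.begin() + j, 2);
                   });
      std::printf("data[0]=%d data[999]=%d\n", data[0], data[999]);
      return 0;
    }

In this sketch, as in the patched Eigen code, the recursion still fans out to a full set of worker threads at the leaves, but roughly half of the dispatches are replaced by plain recursive calls on the thread that is already running.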