path: root/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
author    Rasmus Munk Larsen <rmlarsen@google.com>    2016-11-14 14:18:16 -0800
committer Rasmus Munk Larsen <rmlarsen@google.com>    2016-11-14 14:18:16 -0800
commit    32df1b1046967e6c52e087408cb3fa9e8f3746e6 (patch)
tree      8bebb503a90bcbcacc7f784f8277df1643f5f80a /unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
parent    0ee92aa38eebcf5c3274f2f23ae56d0a4e722a65 (diff)
Reduce dispatch overhead in parallelFor by only calling thread_pool.Schedule() for one of the two recursive calls in handleRange. Instead of going through the Schedule path to push both recursive calls onto other thread queues in the binary tree, one of them is executed directly on the calling thread. At the leaf level this still activates a full complement of threads, but it saves up to 50% of the overhead in Schedule (random number generation, and insertion into a queue, which includes signaling via atomics).
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 069680a11..210ae1368 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -256,7 +256,7 @@ struct ThreadPoolDevice {
     // Split into halves and submit to the pool.
     Index mid = first + divup((last - first) / 2, block_size) * block_size;
     pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
-    pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+    handleRange(first, mid);
   };
   handleRange(0, n);
   barrier.Wait();
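
For reference, below is a minimal, self-contained sketch of the fork-join pattern this patch produces. It is not Eigen's implementation: the ThreadPool here is a hypothetical stand-in that spawns a detached thread per task (Eigen's pool reuses worker threads, and the cost of its Schedule() path is exactly what the patch halves), and parallelFor, Barrier, and divup are simplified analogues of the internals touched by the diff.

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <thread>

// Hypothetical stand-in for a thread pool: one detached thread per task.
struct ThreadPool {
  void Schedule(std::function<void()> fn) { std::thread(std::move(fn)).detach(); }
};

// Counting barrier: Wait() returns once Notify() has been called `count` times.
class Barrier {
 public:
  explicit Barrier(unsigned count) : count_(count) {}
  void Notify() {
    std::lock_guard<std::mutex> lock(mu_);
    if (--count_ == 0) cv_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return count_ == 0; });
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  unsigned count_;
};

using Index = long;
static Index divup(Index x, Index y) { return (x + y - 1) / y; }

// The post-patch recursion: each split schedules only the upper half on the
// pool and handles the lower half on the current thread, saving one
// Schedule() call (queue insertion plus its atomics) per split.
void parallelFor(ThreadPool* pool, Index n, Index block_size,
                 const std::function<void(Index, Index)>& f) {
  Barrier barrier(static_cast<unsigned>(divup(n, block_size)));  // one leaf per block
  std::function<void(Index, Index)> handleRange;
  handleRange = [&](Index first, Index last) {
    if (last - first <= block_size) {
      f(first, last);    // leaf: run the work for this block
      barrier.Notify();
      return;
    }
    // Split at a block-aligned midpoint; submit one half, recurse into the other.
    Index mid = first + divup((last - first) / 2, block_size) * block_size;
    pool->Schedule([=, &handleRange]() { handleRange(mid, last); });
    handleRange(first, mid);
  };
  handleRange(0, n);
  barrier.Wait();  // keeps handleRange alive until every leaf has notified
}

int main() {
  ThreadPool pool;
  std::atomic<long> sum{0};
  parallelFor(&pool, 1000, 100, [&](Index first, Index last) {
    for (Index i = first; i < last; ++i) sum += i;
  });
  std::printf("sum = %ld\n", sum.load());  // prints 499500
}

Note how the scheduled closure captures handleRange by reference, just as in the diff above: barrier.Wait() guarantees the std::function outlives every worker that dereferences it, and at the leaf level the recursion still fans out across a full complement of threads even though only one Schedule() call is issued per split.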