author     Rasmus Munk Larsen <rmlarsen@google.com>   2019-03-06 11:54:30 -0800
committer  Rasmus Munk Larsen <rmlarsen@google.com>   2019-03-06 11:54:30 -0800
commit     3c3f639fe25918806f6f126482bd8886ee824e47 (patch)
tree       0ef235dec1b05d2c7c7f6e03966a3939ce71e2b0
parent     f4ec8edea8a8396e1b744db9ea61de2c451bd15d (diff)
parent     41cdc370d02cadc662cb29d20d99d282707d500c (diff)
Merge.
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h              2
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h   5
2 files changed, 6 insertions, 1 deletion
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 1c44541bd..057e90e50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -317,6 +317,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC)
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
@@ -326,7 +327,6 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
};
-#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
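Note on the hunk above: together with the guard removed at line ~327, it moves the EIGEN_GPUCC check up so the GpuDevice executor specialization, like the meta-kernel helpers below it, is only compiled when a GPU compiler processes the file, not merely when EIGEN_USE_GPU is requested. The following is a minimal, self-contained sketch of that guard nesting; the DEMO_ names and the simplified GPU-compiler detection are illustrative assumptions, not code from the header.

#include <cstdio>

// Simplified stand-in for Eigen's GPU-compiler detection (EIGEN_GPUCC).
#if defined(__CUDACC__) || defined(__HIPCC__)
#define DEMO_GPUCC
#endif

#define DEMO_USE_GPU  // what a client requests via EIGEN_USE_GPU

#if defined(DEMO_USE_GPU)
#if defined(DEMO_GPUCC)
// After the change, the GpuDevice executor specialization and the kernel
// helpers both sit here, visible only to GPU compilers.
struct DemoGpuExecutor {};
#endif  // DEMO_GPUCC
#endif  // DEMO_USE_GPU

int main() {
#if defined(DEMO_GPUCC)
  std::printf("GPU executor available: %zu bytes\n", sizeof(DemoGpuExecutor));
#else
  std::printf("host-only build: GPU executor not declared\n");
#endif
  return 0;
}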
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 49603d6c1..bd1910dcc 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -56,6 +56,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
thread_data_[i].thread.reset(
env_.CreateThread([this, i]() { WorkerLoop(i); }));
}
+ global_steal_partition_ = EncodePartition(0, num_threads_);
#ifndef EIGEN_THREAD_LOCAL
// Wait for workers to initialize per_thread_map_. Otherwise we might race
// with them in Schedule or CurrentThreadId.
@@ -237,6 +238,7 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
MaxSizeVector<ThreadData> thread_data_;
MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
MaxSizeVector<EventCount::Waiter> waiters_;
+ unsigned global_steal_partition_;
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
@@ -354,6 +356,9 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
Task LocalSteal() {
PerThread* pt = GetPerThread();
unsigned partition = GetStealPartition(pt->thread_id);
+ // If the thread's steal partition is the same as the global partition,
+ // there is no need to go through the steal loop twice.
+ if (global_steal_partition_ == partition) return Task();
unsigned start, limit;
DecodePartition(partition, &start, &limit);
AssertBounds(start, limit);
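Taken together, the ThreadPoolTempl hunks cache one partition spanning all workers (EncodePartition(0, num_threads_)) so that LocalSteal can return immediately when a thread's own steal partition already covers the whole pool; otherwise the subsequent GlobalSteal pass would walk the same victim range twice. Below is a sketch of the start/limit packing that the EncodePartition/DecodePartition calls imply, assuming a 16-bit split; the actual bit width and implementation are not shown in this patch.

#include <cassert>

// Assumed packing: start in the high bits, limit in the low bits, so a
// partition fits in one unsigned and can be compared with a single ==.
constexpr unsigned kDemoPartitionBits = 16;

unsigned EncodePartition(unsigned start, unsigned limit) {
  return (start << kDemoPartitionBits) | limit;
}

void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
  *limit = val & ((1u << kDemoPartitionBits) - 1);
  *start = val >> kDemoPartitionBits;
}

int main() {
  const unsigned num_threads = 12;
  const unsigned global = EncodePartition(0, num_threads);
  unsigned start, limit;
  DecodePartition(global, &start, &limit);
  assert(start == 0 && limit == num_threads);
  // A thread whose partition equals `global` gains nothing from LocalSteal:
  // GlobalSteal will already scan victims in [0, num_threads).
  return 0;
}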