author    Benoit Steiner <benoit.steiner.goog@gmail.com> 2017-03-15 20:06:19 -0700
committer Benoit Steiner <benoit.steiner.goog@gmail.com> 2017-03-15 20:06:19 -0700
commit    f8a622ef3cce77dc2c067e13417104a9b51de345 (patch)
tree      90b306f0e9554eb0b431ed61212df3a98c1d9a1f /unsupported
parent    9597d6f6aba6091f986fbe2348106dc96a24e34e (diff)
parent    fd7db52f9b3b1227719c6d8281ff5e5198aaae82 (diff)
Merged eigen/eigen into default
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/README.md                    |   6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h             |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h      | 101
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h           |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h            |  23
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h        |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h              |   8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h              |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h  | 104
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h                 |   1
-rwxr-xr-x  unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h                 |   4
-rw-r--r--  unsupported/test/cxx11_non_blocking_thread_pool.cpp             |   7
12 files changed, 155 insertions, 107 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index fbb7f3bfc..38cdb9c69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -75,16 +75,16 @@ large enough to hold all the data.
// Map a tensor of ints on top of stack-allocated storage.
int storage[128]; // 2 x 4 x 2 x 8 = 128
- TensorMap<int, 4> t_4d(storage, 2, 4, 2, 8);
+ TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
// The same storage can be viewed as a different tensor.
// You can also pass the sizes as an array.
- TensorMap<int, 2> t_2d(storage, 16, 8);
+ TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
// You can also map fixed-size tensors. Here we get a 1d view of
// the 2d fixed-size tensor.
Tensor<float, Sizes<4, 5>> t_4x3;
- TensorMap<float, 1> t_12(t_4x3, 12);
+ TensorMap<Tensor<float, 1>> t_12(t_4x3, 12);
#### Class TensorRef
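A minimal, self-contained version of the TensorMap idiom this hunk corrects may help. Note that the surrounding context lines still pass the tensor object itself where TensorMap expects a raw pointer; the sketch below therefore uses TensorFixedSize and .data(), which is how later revisions of this README write it. Everything here is a sketch, assuming only that the unsupported Tensor module is on the include path:

    // Sketch of the TensorMap idiom; assumes the unsupported Tensor module.
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      using Eigen::Tensor;
      using Eigen::TensorMap;
      using Eigen::TensorFixedSize;
      using Eigen::Sizes;

      // Map a tensor of ints on top of stack-allocated storage.
      int storage[128];  // 2 x 4 x 2 x 8 = 128
      TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);

      // The same storage viewed as a rank-2 tensor.
      TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
      t_4d(0, 0, 0, 0) = 7;  // writes through to `storage`

      // A 1d view of a fixed-size tensor, via its data() pointer.
      TensorFixedSize<float, Sizes<3, 4>> t_3x4;
      TensorMap<Tensor<float, 1>> t_12(t_3x4.data(), 12);
      t_12(0) = 1.0f;  // writes into t_3x4's storage

      return t_2d(0, 0) == 7 ? 0 : 1;
    }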
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index f335edf7d..c46a778b5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -50,6 +50,7 @@ template <DenseIndex DimId>
struct DimensionId
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+ EIGEN_UNUSED_VARIABLE(dim);
eigen_assert(dim == DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
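The EIGEN_UNUSED_VARIABLE(dim) line is needed because eigen_assert compiles away in release builds (NDEBUG), which would leave dim unreferenced and trigger unused-parameter warnings. A standalone sketch of the pattern, with a plain void-cast standing in for Eigen's macro:

    // Sketch of the warning fix above; (void)dim plays the role of
    // EIGEN_UNUSED_VARIABLE(dim).
    #include <cassert>

    template <long DimId>
    struct DimensionId {
      explicit DimensionId(long dim) {
        (void)dim;             // referenced even when the assert vanishes
        assert(dim == DimId);  // compiled out entirely under NDEBUG
      }
    };

    int main() { DimensionId<2> id(2); (void)id; }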
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index d65dbb40f..c04b784a4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -529,7 +529,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
- typedef float Scalar;
// prefetch registers
float4 lhs_pf0, rhs_pf0;
@@ -540,27 +539,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
}
-#define prefetch_lhs(reg, row, col) \
- if (!CHECK_LHS_BOUNDARY) { \
- if (col < k_size) { \
- reg =lhs.loadPacket<Unaligned>(row, col); \
- } \
- } else { \
- if (col < k_size) { \
- if (row + 3 < m_size) { \
- reg =lhs.loadPacket<Unaligned>(row, col); \
- } else if (row + 2 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- } else if (row + 1 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- } else if (row < m_size) { \
- reg.x =lhs(row + 0, col); \
- } \
- } \
- } \
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ reg =lhs.template loadPacket<Unaligned>(row, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ reg =lhs.template loadPacket<Unaligned>(row, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
Index lhs_vert = base_m+threadIdx.x*4;
@@ -578,7 +577,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -593,7 +592,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} else {
if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -766,7 +765,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
- typedef float Scalar;
// prefetch registers
float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
@@ -790,37 +788,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_LHS_BOUNDARY) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else {
// just CHECK_LHS_BOUNDARY
if (lhs_vert + 3 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else if (lhs_vert + 2 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
@@ -909,8 +907,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -932,8 +930,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (rhs_horiz1 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -954,7 +952,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
} else if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -1137,9 +1135,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
typedef float2 LHS_MEM[64][32];
typedef float2 RHS_MEM[128][8];
- typedef float2 LHS_MEM16x16[32][16];
- typedef float2 RHS_MEM16x16[64][8];
-
const Index m_block_idx = blockIdx.x;
const Index n_block_idx = blockIdx.y;
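Most of this file's churn is mechanical: inserting the template keyword before loadPacket<Unaligned>. Inside these kernels lhs and rhs have dependent types, so without the keyword the compiler must parse loadPacket as a non-template member and reads the < as less-than. A tiny standalone illustration; Mapper is a hypothetical stand-in, not Eigen's mapper classes:

    // Why `.template` is needed on objects of dependent type.
    struct Mapper {
      template <int Alignment>
      float loadPacket(int /*row*/, int /*col*/) const { return 0.0f; }
    };

    template <typename M>
    float load(const M& m) {
      // return m.loadPacket<0>(0, 0);       // ill-formed: parsed as (m.loadPacket < 0) > (0, 0)
      return m.template loadPacket<0>(0, 0);  // disambiguates the member template
    }

    int main() { return static_cast<int>(load(Mapper{})); }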
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index e6cee11ef..be8d69386 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -217,7 +217,10 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ EIGEN_UNUSED_VARIABLE(dst);
+ EIGEN_UNUSED_VARIABLE(src);
+ EIGEN_UNUSED_VARIABLE(n);
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index c841786b8..e341e2e9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,6 +11,17 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && defined(__CUDA__)
+#define KERNEL_FRIEND friend __global__
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+
namespace Eigen {
@@ -681,15 +692,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
- template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+ template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
- template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
- template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
- template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+ template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+ template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+ template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
- template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+ template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
- template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+ template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
#if defined(EIGEN_USE_SYCL)
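For context on the new KERNEL_FRIEND macro: clang compiling CUDA requires the __global__ qualifier to be repeated on a friend declaration of a kernel, while nvcc only accepts a plain friend there. A CUDA sketch of the shape of the declaration; the evaluator and kernel names below are placeholders, not Eigen's:

    // Placeholder names; compile as CUDA to exercise both branches.
    #if defined(__clang__) && defined(__CUDA__)
    #define KERNEL_FRIEND friend __global__  // clang wants the qualifier
    #else
    #define KERNEL_FRIEND friend             // nvcc rejects it here
    #endif

    namespace internal {
    template <typename S> __global__ void ReduceKernel(S self, float* out);
    }

    template <typename T>
    class Evaluator {
      float private_state_;
      // The kernel reads private members of the evaluator, hence the friend.
      template <typename S> KERNEL_FRIEND void internal::ReduceKernel(S, float*);
    };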
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 65638b6a8..edb0ab280 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -287,7 +287,6 @@ struct FullReductionLauncher<
void>::type> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
- typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 2854a4a17..e6a666f78 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -31,12 +31,12 @@ namespace Eigen {
*
* \sa Tensor
*/
-template<typename T, typename Dimensions, int Options_> class TensorStorage;
+template<typename T, typename Dimensions, int Options> class TensorStorage;
// Pure fixed-size storage
-template<typename T, int Options_, typename FixedDimensions>
-class TensorStorage<T, FixedDimensions, Options_>
+template<typename T, typename FixedDimensions, int Options_>
+class TensorStorage
{
private:
static const std::size_t Size = FixedDimensions::total_size;
@@ -66,7 +66,7 @@ class TensorStorage<T, FixedDimensions, Options_>
// pure dynamic
-template<typename T, int Options_, typename IndexType, int NumIndices_>
+template<typename T, typename IndexType, int NumIndices_, int Options_>
class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
{
public:
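The TensorStorage hunk is subtler than it looks: the old fixed-size case was written as a partial specialization whose argument list merely restated the primary template's parameters, which C++ rejects (a partial specialization must be more specialized than the primary). The fix defines the fixed-size case as the primary template itself and leaves only the dynamic case as a specialization. A simplified sketch of the resulting structure:

    // Simplified shape of TensorStorage after this change.
    template <typename Index, int N> struct DSizes {};

    // Declaration of the primary template.
    template <typename T, typename Dims, int Options> class Storage;

    // Fixed-size storage: the definition of the primary template.
    template <typename T, typename Dims, int Options>
    class Storage { /* statically sized buffer */ };

    // Dynamic storage: a genuine partial specialization on DSizes.
    template <typename T, typename Index, int N, int Options>
    class Storage<T, DSizes<Index, N>, Options> { /* heap-allocated buffer */ };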
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 3523e7c94..d23f2e4c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,6 +23,7 @@ struct static_val {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+ EIGEN_UNUSED_VARIABLE(v);
eigen_assert(v == n);
}
};
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index ed1a761b6..9dcc9dab7 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -20,7 +20,13 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
typedef RunQueue<Task, 1024> Queue;
NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
- : env_(env),
+ : NonBlockingThreadPoolTempl(num_threads, true, env) {}
+
+ NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
+ Environment env = Environment())
+ : num_threads_(num_threads),
+ allow_spinning_(allow_spinning),
+ env_(env),
threads_(num_threads),
queues_(num_threads),
coprimes_(num_threads),
@@ -30,18 +36,18 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
done_(false),
cancelled_(false),
ec_(waiters_) {
- waiters_.resize(num_threads);
+ waiters_.resize(num_threads_);
- // Calculate coprimes of num_threads.
+ // Calculate coprimes of num_threads_.
// Coprimes are used for a random walk over all threads in Steal
// and NonEmptyQueueIndex. Iteration is based on the fact that if we take
// a walk starting at thread index t and calculate num_threads - 1 subsequent
// indices as (t + coprime) % num_threads, we will cover all threads without
// repetitions (effectively getting a pseudo-random permutation of thread
// indices).
- for (int i = 1; i <= num_threads; i++) {
+ for (int i = 1; i <= num_threads_; i++) {
unsigned a = i;
- unsigned b = num_threads;
+ unsigned b = num_threads_;
// If GCD(a, b) == 1, then a and b are coprimes.
while (b != 0) {
unsigned tmp = a;
@@ -52,10 +58,10 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
coprimes_.push_back(i);
}
}
- for (int i = 0; i < num_threads; i++) {
+ for (int i = 0; i < num_threads_; i++) {
queues_.push_back(new Queue());
}
- for (int i = 0; i < num_threads; i++) {
+ for (int i = 0; i < num_threads_; i++) {
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
}
}
@@ -77,8 +83,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
}
// Join threads explicitly to avoid destruction order issues.
- for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
- for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
+ for (size_t i = 0; i < num_threads_; i++) delete threads_[i];
+ for (size_t i = 0; i < num_threads_; i++) delete queues_[i];
}
void Schedule(std::function<void()> fn) {
@@ -125,7 +131,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
}
int NumThreads() const final {
- return static_cast<int>(threads_.size());
+ return num_threads_;
}
int CurrentThreadId() const final {
@@ -149,6 +155,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
};
Environment env_;
+ const int num_threads_;
+ const bool allow_spinning_;
MaxSizeVector<Thread*> threads_;
MaxSizeVector<Queue*> queues_;
MaxSizeVector<unsigned> coprimes_;
@@ -167,36 +175,62 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
pt->thread_id = thread_id;
Queue* q = queues_[thread_id];
EventCount::Waiter* waiter = &waiters_[thread_id];
- while (!cancelled_) {
- Task t = q->PopFront();
- if (!t.f) {
- t = Steal();
+ // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional
+ // to num_threads_ and we assume that new work is scheduled at a
+ // constant rate, so we set spin_count to 5000 / num_threads_. The
+ // constant was picked based on a fair dice roll, tune it.
+ const int spin_count =
+ allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
+ if (num_threads_ == 1) {
+ // For num_threads_ == 1 there is no point in going through the expensive
+ // steal loop. Moreover, since Steal() calls PopBack() on the victim
+ // queues it might reverse the order in which ops are executed compared to
+ // the order in which they are scheduled, which tends to be
+ // counter-productive for the types of I/O workloads that single-threaded
+ // pools tend to be used for.
+ while (!cancelled_) {
+ Task t = q->PopFront();
+ for (int i = 0; i < spin_count && !t.f; i++) {
+ if (!cancelled_.load(std::memory_order_relaxed)) {
+ t = q->PopFront();
+ }
+ }
if (!t.f) {
- // Leave one thread spinning. This reduces latency.
- // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
- // Also, the time it takes to attempt to steal work 1000 times depends
- // on the size of the thread pool. However the speed at which the user
- // of the thread pool submit tasks is independent of the size of the
- // pool. Consider a time based limit instead.
- if (!spinning_ && !spinning_.exchange(true)) {
- for (int i = 0; i < 1000 && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = Steal();
- } else {
- return;
- }
- }
- spinning_ = false;
+ if (!WaitForWork(waiter, &t)) {
+ return;
}
+ }
+ if (t.f) {
+ env_.ExecuteTask(t);
+ }
+ }
+ } else {
+ while (!cancelled_) {
+ Task t = q->PopFront();
+ if (!t.f) {
+ t = Steal();
if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
+ // Leave one thread spinning. This reduces latency.
+ if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+ for (int i = 0; i < spin_count && !t.f; i++) {
+ if (!cancelled_.load(std::memory_order_relaxed)) {
+ t = Steal();
+ } else {
+ return;
+ }
+ }
+ spinning_ = false;
+ }
+ if (!t.f) {
+ if (!WaitForWork(waiter, &t)) {
+ return;
+ }
}
}
}
- }
- if (t.f) {
- env_.ExecuteTask(t);
+ if (t.f) {
+ env_.ExecuteTask(t);
+ }
}
}
}
@@ -244,7 +278,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
// If we are shutting down and all worker threads are blocked without work,
// that's when we are done.
blocked_++;
- if (done_ && blocked_ == threads_.size()) {
+ if (done_ && blocked_ == num_threads_) {
ec_.CancelWait(waiter);
// Almost done, but need to re-check queues.
// Consider that all queues are empty and all worker threads are preempted
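The coprime walk described in the constructor comment is easy to sanity-check: whenever gcd(step, n) == 1, the sequence t, t+step, t+2*step, ... (mod n) visits all n queues exactly once before repeating. A standalone check of that claim:

    // Verifies the coprime random-walk property used by Steal().
    #include <cassert>
    #include <vector>

    int main() {
      const int n = 16;      // number of threads/queues
      const int step = 5;    // gcd(5, 16) == 1, so 5 is a valid coprime
      std::vector<bool> seen(n, false);
      int idx = 3;           // arbitrary starting thread index
      for (int i = 0; i < n; i++) {
        assert(!seen[idx]);  // no index repeats before all are covered
        seen[idx] = true;
        idx = (idx + step) % n;
      }
      for (bool s : seen) assert(s);  // every queue was visited
      return 0;
    }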
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 03169d591..573ca435a 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -169,6 +169,7 @@ template <typename T> class array<T, 0> {
#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+ EIGEN_UNUSED_VARIABLE(l);
eigen_assert(l.size() == 0);
}
#endif
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 50fedf6ac..d2808860c 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -108,7 +108,9 @@ class AutoDiffScalar
template<typename OtherDerType>
AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other
#ifndef EIGEN_PARSED_BY_DOXYGEN
- , typename internal::enable_if<internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value,void*>::type = 0
+ , typename internal::enable_if<
+ internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value
+ && internal::is_convertible<OtherDerType,DerType>::value , void*>::type = 0
#endif
)
: m_value(other.value()), m_derivatives(other.derivatives())
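The AutoDiffScalar change adds an is_convertible test to the enable_if guard, so the converting constructor drops out of overload resolution when the derivative types are incompatible, instead of failing deep inside the member initializers. A generic sketch of the same SFINAE pattern, using std type traits in place of Eigen::internal's:

    // Generic sketch of the constraint added above (std traits, not Eigen's).
    #include <type_traits>

    template <typename Der>
    class Scalar {
     public:
      Scalar() = default;

      // Viable only when OtherDer converts to Der, mirroring the
      // is_convertible<OtherDerType, DerType> condition in the diff.
      template <typename OtherDer,
                typename std::enable_if<
                    std::is_convertible<OtherDer, Der>::value, void*>::type = nullptr>
      Scalar(const Scalar<OtherDer>& /*other*/) {}
    };

    int main() {
      Scalar<double> a;
      Scalar<double> b(a);   // fine: double -> double
      // Scalar<int*> c(a);  // rejected at overload resolution, not deep inside
      (void)b;
    }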
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 2c5765ce4..48cd2d4e4 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -23,11 +23,11 @@ static void test_create_destroy_empty_pool()
}
-static void test_parallelism()
+static void test_parallelism(bool allow_spinning)
{
// Test we never-ever fail to match available tasks with idle threads.
const int kThreads = 16; // code below expects that this is a multiple of 4
- NonBlockingThreadPool tp(kThreads);
+ NonBlockingThreadPool tp(kThreads, allow_spinning);
VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
for (int iter = 0; iter < 100; ++iter) {
@@ -119,6 +119,7 @@ static void test_cancel()
void test_cxx11_non_blocking_thread_pool()
{
CALL_SUBTEST(test_create_destroy_empty_pool());
- CALL_SUBTEST(test_parallelism());
+ CALL_SUBTEST(test_parallelism(true));
+ CALL_SUBTEST(test_parallelism(false));
CALL_SUBTEST(test_cancel());
}
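For completeness, a hedged usage sketch of the new two-argument constructor the test now exercises; it assumes the unsupported ThreadPool module header, and the names match the test above:

    // Usage of the (num_threads, allow_spinning) constructor added above.
    #include <unsupported/Eigen/CXX11/ThreadPool>

    int main() {
      // allow_spinning = false: idle workers park immediately, trading a
      // little wake-up latency for no busy-wait CPU burn, which suits the
      // sparse I/O-style workloads mentioned in WorkerLoop's comment.
      Eigen::NonBlockingThreadPool pool(4, /*allow_spinning=*/false);
      pool.Schedule([] { /* task body */ });
      return pool.NumThreads() == 4 ? 0 : 1;
    }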