author     2017-03-15 20:06:19 -0700
committer  2017-03-15 20:06:19 -0700
commit     f8a622ef3cce77dc2c067e13417104a9b51de345 (patch)
tree       90b306f0e9554eb0b431ed61212df3a98c1d9a1f /unsupported
parent     9597d6f6aba6091f986fbe2348106dc96a24e34e (diff)
parent     fd7db52f9b3b1227719c6d8281ff5e5198aaae82 (diff)
Merged eigen/eigen into default
Diffstat (limited to 'unsupported')
12 files changed, 155 insertions, 107 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index fbb7f3bfc..38cdb9c69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -75,16 +75,16 @@ large enough to hold all the data.
     // Map a tensor of ints on top of stack-allocated storage.
     int storage[128];  // 2 x 4 x 2 x 8 = 128
-    TensorMap<int, 4> t_4d(storage, 2, 4, 2, 8);
+    TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
 
     // The same storage can be viewed as a different tensor.
     // You can also pass the sizes as an array.
-    TensorMap<int, 2> t_2d(storage, 16, 8);
+    TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
 
     // You can also map fixed-size tensors.  Here we get a 1d view of
     // the 2d fixed-size tensor.
     Tensor<float, Sizes<4, 5>> t_4x3;
-    TensorMap<float, 1> t_12(t_4x3, 12);
+    TensorMap<Tensor<float, 1>> t_12(t_4x3, 12);
 
 
 #### Class TensorRef
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index f335edf7d..c46a778b5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -50,6 +50,7 @@ template <DenseIndex DimId>
 struct DimensionId
 {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+    EIGEN_UNUSED_VARIABLE(dim);
     eigen_assert(dim == DimId);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
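
The README hunk above fixes the documented TensorMap signature: the first template argument is the Tensor type being mapped, not the scalar type. A minimal stand-alone sketch of the corrected usage (not part of the commit; variable names are illustrative, and it assumes the unsupported/ tree is on the include path):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      int storage[128];  // 2 x 4 x 2 x 8 = 128 ints
      for (int i = 0; i < 128; ++i) storage[i] = i;

      // Map the raw buffer as a rank-4 tensor over existing storage.
      Eigen::TensorMap<Eigen::Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);

      // The same buffer viewed as a rank-2 tensor.
      Eigen::TensorMap<Eigen::Tensor<int, 2>> t_2d(storage, 16, 8);

      // Both views address the same (column-major) element of the buffer.
      std::cout << t_4d(1, 3, 1, 7) << " == " << t_2d(15, 7) << "\n";
      return 0;
    }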
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index d65dbb40f..c04b784a4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -529,7 +529,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
                        float2 rhs_shmem2[][8], const Index m_size,
                        const Index n_size, const Index k_size,
                        const Index base_m, const Index base_n) {
-  typedef float Scalar;
 
   // prefetch registers
   float4 lhs_pf0, rhs_pf0;
@@ -540,27 +539,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
   }
 
-#define prefetch_lhs(reg, row, col) \
-    if (!CHECK_LHS_BOUNDARY) { \
-      if (col < k_size) { \
-        reg =lhs.loadPacket<Unaligned>(row, col); \
-      } \
-    } else { \
-      if (col < k_size) { \
-        if (row + 3 < m_size) { \
-          reg =lhs.loadPacket<Unaligned>(row, col); \
-        } else if (row + 2 < m_size) { \
-          reg.x =lhs(row + 0, col); \
-          reg.y =lhs(row + 1, col); \
-          reg.z =lhs(row + 2, col); \
-        } else if (row + 1 < m_size) { \
-          reg.x =lhs(row + 0, col); \
-          reg.y =lhs(row + 1, col); \
-        } else if (row < m_size) { \
-          reg.x =lhs(row + 0, col); \
-        } \
-      } \
-    } \
+#define prefetch_lhs(reg, row, col) \
+    if (!CHECK_LHS_BOUNDARY) { \
+      if (col < k_size) { \
+        reg =lhs.template loadPacket<Unaligned>(row, col); \
+      } \
+    } else { \
+      if (col < k_size) { \
+        if (row + 3 < m_size) { \
+          reg =lhs.template loadPacket<Unaligned>(row, col); \
+        } else if (row + 2 < m_size) { \
+          reg.x =lhs(row + 0, col); \
+          reg.y =lhs(row + 1, col); \
+          reg.z =lhs(row + 2, col); \
+        } else if (row + 1 < m_size) { \
+          reg.x =lhs(row + 0, col); \
+          reg.y =lhs(row + 1, col); \
+        } else if (row < m_size) { \
+          reg.x =lhs(row + 0, col); \
+        } \
+      } \
+    } \
 
   Index lhs_vert = base_m+threadIdx.x*4;
@@ -578,7 +577,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
   if (!CHECK_RHS_BOUNDARY) {
     if ((rhs_vert + 3) < k_size) {
       // just CHECK_RHS_BOUNDARY
-      rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+      rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
     } else if (rhs_vert + 2 < k_size) {
       // just CHECK_RHS_BOUNDARY
       rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -593,7 +592,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
   } else {
     if (rhs_horiz0 < n_size) {
       if ((rhs_vert + 3) < k_size) {
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
       } else if ((rhs_vert + 2) < k_size) {
         rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
         rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -766,7 +765,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
                        float2 rhs_shmem2[][8], const Index m_size,
                        const Index n_size, const Index k_size,
                        const Index base_m, const Index base_n) {
-  typedef float Scalar;
 
   // prefetch registers
   float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
@@ -790,37 +788,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
 
   if (!CHECK_LHS_BOUNDARY) {
     if ((threadIdx.y/4+k+24) < k_size) {
-      lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-      lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-      lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+      lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
     } else if ((threadIdx.y/4+k+16) < k_size) {
-      lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-      lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
     } else if ((threadIdx.y/4+k+8) < k_size) {
-      lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
     } else if ((threadIdx.y/4+k) < k_size) {
-      lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
     }
   } else {
     // just CHECK_LHS_BOUNDARY
     if (lhs_vert + 3 < m_size) {
       if ((threadIdx.y/4+k+24) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
       } else if ((threadIdx.y/4+k+16) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
       } else if ((threadIdx.y/4+k+8) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
       } else if ((threadIdx.y/4+k) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
       }
     } else if (lhs_vert + 2 < m_size) {
       if ((threadIdx.y/4+k+24) < k_size) {
@@ -909,8 +907,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   if (!CHECK_RHS_BOUNDARY) {
     if ((rhs_vert + 3) < k_size) {
       // just CHECK_RHS_BOUNDARY
-      rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-      rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+      rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+      rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
     } else if (rhs_vert + 2 < k_size) {
       // just CHECK_RHS_BOUNDARY
       rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -932,8 +930,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
     if (rhs_horiz1 < n_size) {
       if ((rhs_vert + 3) < k_size) {
         // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+        rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
       } else if (rhs_vert + 2 < k_size) {
         // just CHECK_RHS_BOUNDARY
         rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -954,7 +952,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
     } else if (rhs_horiz0 < n_size) {
       if ((rhs_vert + 3) < k_size) {
         // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
       } else if ((rhs_vert + 2) < k_size) {
         // just CHECK_RHS_BOUNDARY
         rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -1137,9 +1135,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
   typedef float2 LHS_MEM[64][32];
   typedef float2 RHS_MEM[128][8];
 
-  typedef float2 LHS_MEM16x16[32][16];
-  typedef float2 RHS_MEM16x16[64][8];
-
   const Index m_block_idx = blockIdx.x;
   const Index n_block_idx = blockIdx.y;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index e6cee11ef..be8d69386 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -217,7 +217,10 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     assert(err == cudaSuccess);
 #else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n);
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
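
The recurring edit in TensorContractionCuda.h above adds the `template` keyword to the `loadPacket<Unaligned>` calls. When the object's type is itself a template parameter (here the LhsMapper/RhsMapper types), C++ requires this disambiguation for member-template calls; some compilers accept the unqualified form, but a conforming compiler (notably clang in CUDA mode) rejects it. A minimal stand-alone illustration of the rule, with made-up names (not the actual mapper classes):

    // Sketch only: why obj.template foo<...>() is needed on dependent types.
    struct Mapper {
      template <int Alignment>
      float loadPacket(int row, int col) const {
        return static_cast<float>(row + col + Alignment);
      }
    };

    template <typename MapperType>
    float load_first(const MapperType& m) {
      // 'm' has a dependent type, so the member-template call must be
      // disambiguated with the 'template' keyword:
      return m.template loadPacket<0>(0, 0);
      // return m.loadPacket<0>(0, 0);  // ill-formed: parsed as (m.loadPacket < 0) > (0, 0)
    }

    int main() { return static_cast<int>(load_first(Mapper{})); }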
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index c841786b8..e341e2e9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,6 +11,17 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && defined(__CUDA__)
+#define KERNEL_FRIEND friend __global__
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+
 namespace Eigen {
 
@@ -681,15 +692,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+  template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 #ifdef EIGEN_HAS_CUDA_FP16
-  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
-  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+  template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+  template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+  template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
 #endif
-  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 
-  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
 #if defined(EIGEN_USE_SYCL)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 65638b6a8..edb0ab280 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -287,7 +287,6 @@ struct FullReductionLauncher<
     void>::type> {
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
-    typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
     const int num_per_thread = 128;
     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
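
The KERNEL_FRIEND macro introduced above exists because clang in CUDA mode requires the `__global__` qualifier on a friend declaration of a kernel, while nvcc does not accept it there. A small CUDA sketch of how such a macro is consumed, using hypothetical names rather than the actual Eigen evaluator:

    // Sketch only: befriending a CUDA kernel template via a KERNEL_FRIEND-style macro.
    #ifndef KERNEL_FRIEND
    #if defined(__clang__) && defined(__CUDA__)
    #define KERNEL_FRIEND friend __global__
    #else
    #define KERNEL_FRIEND friend
    #endif
    #endif

    template <typename T> class DeviceBuffer;  // forward declaration
    template <typename T> __global__ void FillKernel(DeviceBuffer<T> buf, T value, int n);

    template <typename T>
    class DeviceBuffer {
      // The kernel needs to reach the private pointer, so it is declared a friend.
      template <typename U> KERNEL_FRIEND void FillKernel(DeviceBuffer<U>, U, int);
     public:
      explicit DeviceBuffer(T* ptr) : ptr_(ptr) {}
     private:
      T* ptr_;
    };

    template <typename T>
    __global__ void FillKernel(DeviceBuffer<T> buf, T value, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) buf.ptr_[i] = value;  // allowed because of the friend declaration
    }
    // Launch, e.g.: FillKernel<<<blocks, threads>>>(DeviceBuffer<float>(d_ptr), 1.0f, n);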
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 2854a4a17..e6a666f78 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -31,12 +31,12 @@ namespace Eigen {
  *
  * \sa Tensor
  */
-template<typename T, typename Dimensions, int Options_> class TensorStorage;
+template<typename T, typename Dimensions, int Options> class TensorStorage;
 
 
 // Pure fixed-size storage
-template<typename T, int Options_, typename FixedDimensions>
-class TensorStorage<T, FixedDimensions, Options_>
+template<typename T, typename FixedDimensions, int Options_>
+class TensorStorage
 {
  private:
   static const std::size_t Size = FixedDimensions::total_size;
@@ -66,7 +66,7 @@ class TensorStorage<T, FixedDimensions, Options_>
 
 
 // pure dynamic
-template<typename T, int Options_, typename IndexType, int NumIndices_>
+template<typename T, typename IndexType, int NumIndices_, int Options_>
 class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
 {
  public:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 3523e7c94..d23f2e4c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,6 +23,7 @@ struct static_val {
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    EIGEN_UNUSED_VARIABLE(v);
     eigen_assert(v == n);
   }
 };
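
The TensorStorage hunk above is about keeping the template parameter kinds and order of the definitions in line with the primary declaration `<T, Dimensions, Options>`. A stripped-down sketch of the same pattern (illustrative names, not Eigen's actual implementation):

    #include <cstddef>

    // Primary template, declared with the canonical parameter order <T, Dimensions, Options>.
    template <typename T, typename Dimensions, int Options>
    class Storage;

    // Primary definition, used for fixed-size dimension types; the parameter names may
    // differ from the declaration, but their kinds and order must match it.
    template <typename T, typename FixedDimensions, int Options_>
    class Storage {
      T data_[FixedDimensions::total_size];  // size known at compile time
    };

    // Partial specialization keyed on a dynamic-dimensions tag; again the parameter
    // order has to line up with the primary declaration.
    template <int NumIndices> struct DynamicDims { };

    template <typename T, int NumIndices_, int Options_>
    class Storage<T, DynamicDims<NumIndices_>, Options_> {
      T* data_ = nullptr;
      std::size_t size_ = 0;
    };

    struct Dims2x3 { static const std::size_t total_size = 6; };
    int main() {
      Storage<float, Dims2x3, 0> fixed;        // picks the primary definition
      Storage<float, DynamicDims<2>, 0> dyn;   // picks the partial specialization
      (void)fixed; (void)dyn;
      return 0;
    }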
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index ed1a761b6..9dcc9dab7 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -20,7 +20,13 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   typedef RunQueue<Task, 1024> Queue;
 
   NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
-      : env_(env),
+      : NonBlockingThreadPoolTempl(num_threads, true, env) {}
+
+  NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
+                             Environment env = Environment())
+      : num_threads_(num_threads),
+        allow_spinning_(allow_spinning),
+        env_(env),
         threads_(num_threads),
         queues_(num_threads),
         coprimes_(num_threads),
@@ -30,18 +36,18 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
         done_(false),
         cancelled_(false),
         ec_(waiters_) {
-    waiters_.resize(num_threads);
+    waiters_.resize(num_threads_);
 
-    // Calculate coprimes of num_threads.
+    // Calculate coprimes of num_threads_.
     // Coprimes are used for a random walk over all threads in Steal
     // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
     // a walk starting thread index t and calculate num_threads - 1 subsequent
     // indices as (t + coprime) % num_threads, we will cover all threads without
     // repetitions (effectively getting a presudo-random permutation of thread
    // indices).
-    for (int i = 1; i <= num_threads; i++) {
+    for (int i = 1; i <= num_threads_; i++) {
       unsigned a = i;
-      unsigned b = num_threads;
+      unsigned b = num_threads_;
       // If GCD(a, b) == 1, then a and b are coprimes.
       while (b != 0) {
         unsigned tmp = a;
@@ -52,10 +58,10 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
         coprimes_.push_back(i);
       }
     }
-    for (int i = 0; i < num_threads; i++) {
+    for (int i = 0; i < num_threads_; i++) {
       queues_.push_back(new Queue());
     }
-    for (int i = 0; i < num_threads; i++) {
+    for (int i = 0; i < num_threads_; i++) {
       threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
     }
   }
@@ -77,8 +83,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     }
 
     // Join threads explicitly to avoid destruction order issues.
-    for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
-    for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
+    for (size_t i = 0; i < num_threads_; i++) delete threads_[i];
+    for (size_t i = 0; i < num_threads_; i++) delete queues_[i];
   }
 
   void Schedule(std::function<void()> fn) {
@@ -125,7 +131,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   }
 
   int NumThreads() const final {
-    return static_cast<int>(threads_.size());
+    return num_threads_;
   }
 
   int CurrentThreadId() const final {
@@ -149,6 +155,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   };
 
   Environment env_;
+  const int num_threads_;
+  const bool allow_spinning_;
   MaxSizeVector<Thread*> threads_;
   MaxSizeVector<Queue*> queues_;
   MaxSizeVector<unsigned> coprimes_;
@@ -167,36 +175,62 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     pt->thread_id = thread_id;
     Queue* q = queues_[thread_id];
     EventCount::Waiter* waiter = &waiters_[thread_id];
-    while (!cancelled_) {
-      Task t = q->PopFront();
-      if (!t.f) {
-        t = Steal();
+    // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional
+    // to num_threads_ and we assume that new work is scheduled at a
+    // constant rate, so we set spin_count to 5000 / num_threads_. The
+    // constant was picked based on a fair dice roll, tune it.
+    const int spin_count =
+        allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
+    if (num_threads_ == 1) {
+      // For num_threads_ == 1 there is no point in going through the expensive
+      // steal loop. Moreover, since Steal() calls PopBack() on the victim
+      // queues it might reverse the order in which ops are executed compared to
+      // the order in which they are scheduled, which tends to be
+      // counter-productive for the types of I/O workloads the single thread
+      // pools tend to be used for.
+      while (!cancelled_) {
+        Task t = q->PopFront();
+        for (int i = 0; i < spin_count && !t.f; i++) {
+          if (!cancelled_.load(std::memory_order_relaxed)) {
+            t = q->PopFront();
+          }
+        }
         if (!t.f) {
-          // Leave one thread spinning. This reduces latency.
-          // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
-          // Also, the time it takes to attempt to steal work 1000 times depends
-          // on the size of the thread pool. However the speed at which the user
-          // of the thread pool submit tasks is independent of the size of the
-          // pool. Consider a time based limit instead.
-          if (!spinning_ && !spinning_.exchange(true)) {
-            for (int i = 0; i < 1000 && !t.f; i++) {
-              if (!cancelled_.load(std::memory_order_relaxed)) {
-                t = Steal();
-              } else {
-                return;
-              }
-            }
-            spinning_ = false;
+          if (!WaitForWork(waiter, &t)) {
+            return;
           }
+        }
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
+      }
+    } else {
+      while (!cancelled_) {
+        Task t = q->PopFront();
+        if (!t.f) {
+          t = Steal();
           if (!t.f) {
-            if (!WaitForWork(waiter, &t)) {
-              return;
+            // Leave one thread spinning. This reduces latency.
+            if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+              for (int i = 0; i < spin_count && !t.f; i++) {
+                if (!cancelled_.load(std::memory_order_relaxed)) {
+                  t = Steal();
+                } else {
+                  return;
+                }
+              }
+              spinning_ = false;
+            }
+            if (!t.f) {
+              if (!WaitForWork(waiter, &t)) {
+                return;
+              }
             }
           }
         }
-      }
-      if (t.f) {
-        env_.ExecuteTask(t);
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
       }
     }
   }
@@ -244,7 +278,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // If we are shutting down and all worker threads blocked without work,
     // that's we are done.
     blocked_++;
-    if (done_ && blocked_ == threads_.size()) {
+    if (done_ && blocked_ == num_threads_) {
      ec_.CancelWait(waiter);
      // Almost done, but need to re-check queues.
      // Consider that all queues are empty and all worker threads are preempted
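
The thread pool changes above add an allow_spinning flag: when it is false (or when the pool has a single thread), idle workers skip the busy spin/steal loop and park immediately, which the comments note is preferable for I/O-style workloads. A small usage sketch of the new constructor (not from the commit; assumes the unsupported/ directory is on the include path, and the busy-wait "join" at the end is purely illustrative):

    #include <atomic>
    #include <Eigen/CXX11/ThreadPool>

    int main() {
      // Second argument is the new allow_spinning flag; false trades a little
      // wake-up latency for idle workers blocking instead of spinning.
      Eigen::NonBlockingThreadPool pool(/*num_threads=*/4, /*allow_spinning=*/false);

      std::atomic<int> done(0);
      for (int i = 0; i < 100; ++i) {
        pool.Schedule([&done]() { done.fetch_add(1); });
      }
      while (done.load() != 100) {
      }  // crude wait for the sketch; real code would use a Barrier/Notification
      return 0;
    }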
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 03169d591..573ca435a 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -169,6 +169,7 @@ template <typename T> class array<T, 0> {
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC
   array(std::initializer_list<T> l) : dummy() {
+    EIGEN_UNUSED_VARIABLE(l);
     eigen_assert(l.size() == 0);
   }
 #endif
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 50fedf6ac..d2808860c 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -108,7 +108,9 @@ class AutoDiffScalar
     template<typename OtherDerType>
     AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    , typename internal::enable_if<internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value,void*>::type = 0
+    , typename internal::enable_if<
+            internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value
+        &&  internal::is_convertible<OtherDerType,DerType>::value , void*>::type = 0
 #endif
     )
       : m_value(other.value()), m_derivatives(other.derivatives())
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 2c5765ce4..48cd2d4e4 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -23,11 +23,11 @@ static void test_create_destroy_empty_pool()
 }
 
 
-static void test_parallelism()
+static void test_parallelism(bool allow_spinning)
 {
   // Test we never-ever fail to match available tasks with idle threads.
   const int kThreads = 16;  // code below expects that this is a multiple of 4
-  NonBlockingThreadPool tp(kThreads);
+  NonBlockingThreadPool tp(kThreads, allow_spinning);
   VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
   VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
   for (int iter = 0; iter < 100; ++iter) {
@@ -119,6 +119,7 @@ static void test_cancel()
 void test_cxx11_non_blocking_thread_pool()
 {
   CALL_SUBTEST(test_create_destroy_empty_pool());
-  CALL_SUBTEST(test_parallelism());
+  CALL_SUBTEST(test_parallelism(true));
+  CALL_SUBTEST(test_parallelism(false));
   CALL_SUBTEST(test_cancel());
 }
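
The AutoDiffScalar hunk above tightens the converting constructor with an extra is_convertible requirement, so a mismatched derivative type no longer matches the overload. The same guard pattern expressed with standard type traits and hypothetical class names (a sketch, not Eigen's internal trait machinery):

    #include <type_traits>

    template <typename DerType>
    class Dual {
     public:
      Dual() = default;

      // Participates in overload resolution only when the scalar types agree AND
      // OtherDerType converts to DerType, mirroring the added condition above.
      template <typename OtherDerType>
      Dual(const Dual<OtherDerType>& other,
           typename std::enable_if<
               std::is_same<typename DerType::Scalar,
                            typename OtherDerType::Scalar>::value &&
                   std::is_convertible<OtherDerType, DerType>::value,
               void*>::type = 0)
          : value_(other.value()), derivatives_(other.derivatives()) {}

      typename DerType::Scalar value() const { return value_; }
      const DerType& derivatives() const { return derivatives_; }

     private:
      typename DerType::Scalar value_{};
      DerType derivatives_{};
    };

    struct Vec { typedef float Scalar; float v[3] = {0, 0, 0}; };

    int main() {
      Dual<Vec> a;
      Dual<Vec> b(a);  // OK: same Scalar, and Vec is convertible to Vec
      (void)b;
      return 0;
    }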