author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2015-02-10 13:16:22 -0800
---|---|---
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2015-02-10 13:16:22 -0800
commit | fefec723aa44703c1b7884b2ccfa73877a58f500 (patch) |
tree | a130083960db87e30f616c2af904b000210dcbee /unsupported/Eigen/CXX11/src/Tensor |
parent | 780b2422e2b3fd2b50121a6e5642c94b030fbf5b (diff) |
Fixed compilation error triggered when trying to vectorize a non-vectorizable CUDA kernel.
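The change replaces the single EigenMetaKernel, which chose between the scalar and packet paths with a runtime branch, by two dedicated kernels plus an IsVectorizable trait that selects one of them at compile time. The apparent failure mode of the old code: even when the runtime branch could never take the vectorized path, the compiler still had to instantiate eval.evalPacket() for evaluators without packet support. Below is a minimal sketch of this trait-dispatch pattern, assuming hard-coded evaluator flags and hypothetical names (Evaluator, Executor, AlignedExpr, UnalignedExpr) rather than the real Eigen types:

```cpp
// Minimal sketch (not the Eigen source) of compile-time dispatch on a
// per-device IsVectorizable trait. Evaluator, Executor, AlignedExpr and
// UnalignedExpr are stand-ins for illustration only.
#include <cstdio>

struct DefaultDevice {};
struct GpuDevice {};

// Stand-in for TensorEvaluator<Expression, Device>: exposes the two flags
// the real evaluator provides.
template <typename Expression, typename Device>
struct Evaluator {
  static const bool PacketAccess = Expression::PacketAccess;
  static const bool IsAligned = Expression::IsAligned;
};

// Default rule: vectorize whenever the evaluator supports packet access.
template <typename Device, typename Expression>
struct IsVectorizable {
  static const bool value = Evaluator<Expression, Device>::PacketAccess;
};

// GPU rule, mirroring the commit: packet access alone is not enough,
// the data must also be aligned.
template <typename Expression>
struct IsVectorizable<GpuDevice, Expression> {
  static const bool value = Evaluator<Expression, GpuDevice>::PacketAccess &&
                            Evaluator<Expression, GpuDevice>::IsAligned;
};

// The executor is specialized on the compile-time flag, so each
// specialization instantiates only the kernel it actually launches.
template <typename Expression, typename Device,
          bool Vectorizable = IsVectorizable<Device, Expression>::value>
struct Executor {
  static void run() { std::puts("scalar kernel"); }
};

template <typename Expression, typename Device>
struct Executor<Expression, Device, true> {
  static void run() { std::puts("vectorized kernel"); }
};

struct AlignedExpr   { static const bool PacketAccess = true; static const bool IsAligned = true;  };
struct UnalignedExpr { static const bool PacketAccess = true; static const bool IsAligned = false; };

int main() {
  Executor<AlignedExpr, GpuDevice>::run();       // vectorized kernel
  Executor<UnalignedExpr, GpuDevice>::run();     // scalar kernel: GPU path needs alignment
  Executor<UnalignedExpr, DefaultDevice>::run(); // vectorized kernel: CPU rule ignores alignment
}
```

Because each Executor specialization only names the code path it needs, the packet code is never instantiated for an expression that fails the trait.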
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 80
1 file changed, 58 insertions, 22 deletions
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index d93fdd907..05ac9bd2f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -22,8 +22,13 @@ namespace Eigen {
  */
 namespace internal {
 
+template <typename Device, typename Expression>
+struct IsVectorizable {
+  static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
+};
+
 // Default strategy: the expression is evaluated with a single cpu thread.
-template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = TensorEvaluator<Expression, Device>::PacketAccess>
+template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = IsVectorizable<Device, Expression>::value>
 class TensorExecutor
 {
  public:
@@ -153,34 +158,45 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) {
 
   const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const Index step_size = blockDim.x * gridDim.x;
 
-  if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
-    // Use the scalar path
-    for (Index i = first_index; i < size; i += step_size) {
-      eval.evalScalar(i);
-    }
+  // Use the scalar path
+  for (Index i = first_index; i < size; i += step_size) {
+    eval.evalScalar(i);
   }
-  else {
-    // Use the vector path
-    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const Index vectorized_step_size = step_size * PacketSize;
-    const Index vectorized_size = (size / PacketSize) * PacketSize;
-    for (Index i = first_index * PacketSize; i < vectorized_size;
-         i += vectorized_step_size) {
-      eval.evalPacket(i);
-    }
-    for (Index i = vectorized_size + first_index; i < size; i += step_size) {
-      eval.evalScalar(i);
-    }
+}
+
+template <typename Evaluator, typename Index>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel_Vectorizable(Evaluator eval, Index size) {
+
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
+
+  // Use the vector path
+  const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+  const Index vectorized_step_size = step_size * PacketSize;
+  const Index vectorized_size = (size / PacketSize) * PacketSize;
+  for (Index i = first_index * PacketSize; i < vectorized_size;
+       i += vectorized_step_size) {
+    eval.evalPacket(i);
+  }
+  for (Index i = vectorized_size + first_index; i < size; i += step_size) {
+    eval.evalScalar(i);
   }
 }
 
-template<typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable>
+template <typename Expression>
+struct IsVectorizable<GpuDevice, Expression> {
+  static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && TensorEvaluator<Expression, GpuDevice>::IsAligned;
+};
+
+template<typename Expression>
+class TensorExecutor<Expression, GpuDevice, false>
 {
  public:
   typedef typename Expression::Index Index;
@@ -192,13 +208,33 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
     {
       const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
       const int block_size = maxCudaThreadsPerBlock();
+      const Index size = array_prod(evaluator.dimensions());
+      LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
+    }
+    evaluator.cleanup();
+  }
+};
+template<typename Expression>
+class TensorExecutor<Expression, GpuDevice, true>
+{
+ public:
+  typedef typename Expression::Index Index;
+  static inline void run(const Expression& expr, const GpuDevice& device)
+  {
+    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+      const int block_size = maxCudaThreadsPerBlock();
       const Index size = array_prod(evaluator.dimensions());
-      LAUNCH_CUDA_KERNEL((EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
+      LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
     }
     evaluator.cleanup();
   }
 };
+
 #endif
 
 }  // end namespace internal
```
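The vectorized kernel above uses the standard grid-stride pattern: each thread walks the flattened tensor in strides of step_size * PacketSize, and whatever does not fill a whole packet is finished on the scalar path. A hedged CUDA sketch of the same loop structure, with an illustrative PacketSize of 4 (float4) and a made-up kernel name; the real Eigen kernel goes through the evaluator's evalPacket/evalScalar instead of raw loads:

```cpp
#include <cuda_runtime.h>

// Illustrative grid-stride kernel in the style of EigenMetaKernel_Vectorizable:
// whole packets first, scalar tail second. scale_vectorized and PacketSize = 4
// are assumptions for this sketch, not Eigen's API. The float4 accesses below
// are also why the GpuDevice IsVectorizable rule demands IsAligned: the data
// pointer must be 16-byte aligned.
__global__ void scale_vectorized(float* data, int size, float factor) {
  const int PacketSize = 4;
  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const int step_size = blockDim.x * gridDim.x;

  const int vectorized_step_size = step_size * PacketSize;
  const int vectorized_size = (size / PacketSize) * PacketSize;

  // Vector path: each iteration handles one full packet of 4 floats.
  for (int i = first_index * PacketSize; i < vectorized_size;
       i += vectorized_step_size) {
    float4 p = *reinterpret_cast<float4*>(data + i);
    p.x *= factor; p.y *= factor; p.z *= factor; p.w *= factor;
    *reinterpret_cast<float4*>(data + i) = p;
  }
  // Scalar tail: the last size % PacketSize elements.
  for (int i = vectorized_size + first_index; i < size; i += step_size) {
    data[i] *= factor;
  }
}
```

A launch in the spirit of the executor would size the grid from the device's occupancy limits, e.g. scale_vectorized<<<num_blocks, block_size>>>(d_data, n, 2.0f);.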