aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-01-08 13:53:40 -0800
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-01-08 13:53:40 -0800
commit53749ff4152191d2f1bd56090a14f6474fe059c2 (patch)
treedb47f940c3d31de9131dbc5a2c0019d6bf2b3bf7 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent6639b7d6e86ef36f6f78cf51e36efa5a004154eb (diff)
Prevent nvcc from miscompiling the CUDA metakernel. Unfortunately this reintroduces some compilation warnings, but it's much better than having to deal with random assertion failures.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h16
1 files changed, 4 insertions, 12 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index c28078882..d93e1de1b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -156,14 +156,14 @@ template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
public:
typedef typename Expression::Index Index;
- EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device);
+ static void run(const Expression& expr, const GpuDevice& device);
};
template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
public:
typedef typename Expression::Index Index;
- EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device);
+ static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(__CUDACC__)
@@ -213,9 +213,8 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
/*static*/
template <typename Expression>
-EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
+inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
-#ifndef __CUDA_ARCH__
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@@ -228,17 +227,13 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
-#else
- eigen_assert(false && "Cannot launch a kernel from another kernel");
-#endif
}
/*static*/
template<typename Expression>
-EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
+inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
-#ifndef __CUDA_ARCH__
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@@ -251,9 +246,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
-#else
- eigen_assert(false && "Cannot launch a kernel from another kernel");
-#endif
}
#endif // __CUDACC__