aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-01-08 13:53:40 -0800
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-01-08 13:53:40 -0800
commit53749ff4152191d2f1bd56090a14f6474fe059c2 (patch)
treedb47f940c3d31de9131dbc5a2c0019d6bf2b3bf7 /unsupported
parent6639b7d6e86ef36f6f78cf51e36efa5a004154eb (diff)
Prevent nvcc from miscompiling the cuda metakernel. Unfortunately this reintroduces some compulation warnings but it's much better than having to deal with random assertion failures.
Diffstat (limited to 'unsupported')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h6
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h16
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h4
3 files changed, 7 insertions, 19 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 4d7570077..af140a68b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -238,14 +238,10 @@ struct GpuDevice {
};
-#ifndef __CUDA_ARCH__
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
-#else
-#define LAUNCH_CUDA_KERNEL(...) \
- eigen_assert(false && "Cannot launch a kernel from another kernel");
-#endif
+
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index c28078882..d93e1de1b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -156,14 +156,14 @@ template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
public:
typedef typename Expression::Index Index;
- EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device);
+ static void run(const Expression& expr, const GpuDevice& device);
};
template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
public:
typedef typename Expression::Index Index;
- EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device);
+ static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(__CUDACC__)
@@ -213,9 +213,8 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
/*static*/
template <typename Expression>
-EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
+inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
-#ifndef __CUDA_ARCH__
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@@ -228,17 +227,13 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
-#else
- eigen_assert(false && "Cannot launch a kernel from another kernel");
-#endif
}
/*static*/
template<typename Expression>
-EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
+inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
-#ifndef __CUDA_ARCH__
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@@ -251,9 +246,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
-#else
- eigen_assert(false && "Cannot launch a kernel from another kernel");
-#endif
}
#endif // __CUDACC__
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index b046607b9..558d0c83d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -115,11 +115,11 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
internal::is_same<typename Self::CoeffReturnType, float>::value;
template <typename OutputType>
- EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
assert(false && "Should only be called on floats");
}
- EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
typedef typename Self::Index Index;
const Index num_coeffs = array_prod(self.m_impl.dimensions());