aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
diff options
context:
space:
mode:
authorGravatar Jeremy Barnes <jeremy@barneso.com>2016-01-10 22:39:13 -0500
committerGravatar Jeremy Barnes <jeremy@barneso.com>2016-01-10 22:39:13 -0500
commit403a7cb6c34d163e4f120387b5dc5487d30bb1d5 (patch)
treece6b06d27b3f71cfa8bdc8904cf9f2280217e886 /unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
parentb557662e589a76265f73b99d7ca54a988d7bdb59 (diff)
Alternative way of forcing instantiation of device kernels without
causing warnings or requiring device-to-device kernel invocations. This allows TensorFlow to work on SM 3.0 (i.e., Amazon EC2) machines.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h10
1 files changed, 10 insertions, 0 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index af140a68b..359a01b8f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -242,6 +242,16 @@ struct GpuDevice {
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
+#ifndef __CUDA_ARCH__  /* not compiling device code: perform the real kernel launch */
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+ assert(cudaGetLastError() == cudaSuccess);
+#else  /* compiling device code: only take the kernel's address so it gets instantiated */
+#define LAUNCH_CUDA_KERNEL(kernel, ...) \
+ { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
+ eigen_assert(false && "Cannot launch a kernel from another kernel");
+#endif  /* __CUDA_ARCH__ */
+
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__