Alternative way of forcing instantiation of device kernels without

causing warnings or requiring device to device kernel invocations. This allows Tensorflow to work on SM 3.0 (ie, Amazon EC2) machines.
author: Jeremy Barnes <jeremy@barneso.com> 2016-01-10 22:39:13 -0500
committer: Jeremy Barnes <jeremy@barneso.com> 2016-01-10 22:39:13 -0500
commit: 403a7cb6c34d163e4f120387b5dc5487d30bb1d5 (patch)
tree: ce6b06d27b3f71cfa8bdc8904cf9f2280217e886 /unsupported/Eigen/CXX11
parent: b557662e589a76265f73b99d7ca54a988d7bdb59 (diff)
3 files changed, 13 insertions, 3 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index af140a68b..359a01b8f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -242,6 +242,16 @@ struct GpuDevice {
   (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
   assert(cudaGetLastError() == cudaSuccess);
 
+#ifndef __CUDA_ARCH__
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
+    (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+  assert(cudaGetLastError() == cudaSuccess);
+#else
+#define LAUNCH_CUDA_KERNEL(kernel, ...)                                  \
+    { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
+   eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
+#endif
+
 
 // FIXME: Should be device and kernel specific.
 #ifdef __CUDACC__
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index fd7064459..9a66e81f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -506,7 +506,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
 
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 558d0c83d..374edb605 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -116,7 +116,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
 
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(false && "Should only be called on floats");
+    eigen_assert(false && "Should only be called on floats");
   }
 
   static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
@@ -126,7 +126,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
     const int block_size = 256;
     const int num_per_thread = 128;
     const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
-    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
+    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
   }
 };
author	Jeremy Barnes <jeremy@barneso.com>	2016-01-10 22:39:13 -0500
committer	Jeremy Barnes <jeremy@barneso.com>	2016-01-10 22:39:13 -0500
commit	403a7cb6c34d163e4f120387b5dc5487d30bb1d5 (patch)
tree	ce6b06d27b3f71cfa8bdc8904cf9f2280217e886 /unsupported/Eigen/CXX11
parent	b557662e589a76265f73b99d7ca54a988d7bdb59 (diff)