Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h')
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 81 insertions(+), 19 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index c76d1ee3f..821835cf3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -10,7 +10,6 @@
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
-
namespace Eigen {
// This defines an interface that GPUDevice can take to use
@@ -35,12 +34,23 @@ static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices);
- EIGEN_UNUSED_VARIABLE(status)
- assert(status == cudaSuccess);
+ if (status != cudaSuccess) {
+ std::cerr << "Failed to get the number of CUDA devices: "
+ << cudaGetErrorString(status)
+ << std::endl;
+ assert(status == cudaSuccess);
+ }
m_deviceProperties = new cudaDeviceProp[num_devices];
for (int i = 0; i < num_devices; ++i) {
status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
- assert(status == cudaSuccess);
+ if (status != cudaSuccess) {
+ std::cerr << "Failed to initialize CUDA device #"
+ << i
+ << ": "
+ << cudaGetErrorString(status)
+ << std::endl;
+ assert(status == cudaSuccess);
+ }
}
m_devicePropInitialized = true;
}
@@ -110,10 +120,12 @@ class CudaStreamDevice : public StreamInterface {
struct GpuDevice {
// The StreamInterface is not owned: the caller is
// responsible for its initialization and eventual destruction.
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+ eigen_assert(stream);
+ }
+ explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
eigen_assert(stream);
}
-
// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
return stream_->stream();
@@ -199,27 +211,68 @@ struct GpuDevice {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
cudaError_t err = cudaStreamSynchronize(stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
+ if (err != cudaSuccess) {
+ std::cerr << "Error detected in CUDA stream: "
+ << cudaGetErrorString(err)
+ << std::endl;
+ assert(err == cudaSuccess);
+ }
#else
assert(false && "The default device should be used instead to generate kernel code");
#endif
}
- inline int getNumCudaMultiProcessors() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().multiProcessorCount;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int maxCudaThreadsPerBlock() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerBlock;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int maxCudaThreadsPerMultiProcessor() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int sharedMemPerBlock() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().sharedMemPerBlock;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int majorDeviceVersion() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().major;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
+ return stream_->deviceProperties().minor;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+ return max_blocks_;
}
// This function checks if the CUDA runtime recorded an error for the
@@ -235,24 +288,33 @@ struct GpuDevice {
private:
const StreamInterface* stream_;
-
+ int max_blocks_;
};
-
-#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
- (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+#ifndef __CUDA_ARCH__
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
+#else
+#define LAUNCH_CUDA_KERNEL(kernel, ...) \
+ { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
+ eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
+#endif
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
-static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+#ifndef __CUDA_ARCH__
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(config)
+#endif
}
#endif
} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
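
For context, a minimal usage sketch of the interface introduced by this patch: the new two-argument GpuDevice constructor caps the number of blocks a launch may use, and maxBlocks() exposes that cap from both host and device code. The sketch assumes a default-constructible Eigen::CudaStreamDevice (only the class name appears in the hunk above) and must be compiled with nvcc so the host branch of synchronize() is taken; it illustrates the new API and is not part of the patch itself.

#define EIGEN_USE_GPU
#include <algorithm>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Assumed default-constructible; wraps a CUDA stream behind StreamInterface.
  Eigen::CudaStreamDevice stream;

  // New constructor: cap kernel launches at 512 blocks. The one-argument
  // constructor keeps the previous unbounded behaviour (max_blocks_ = INT_MAX),
  // so existing callers are unaffected.
  Eigen::GpuDevice device(&stream, /*num_blocks=*/512);

  // Host-side queries, now guarded by #ifndef __CUDA_ARCH__ in the patch.
  const int max_blocks        = device.maxBlocks();                 // 512
  const int threads_per_block = device.maxCudaThreadsPerBlock();
  const int multiprocessors   = device.getNumCudaMultiProcessors();

  // Example launch configuration derived from the capped block count.
  const int num_blocks = std::min(max_blocks, 32 * multiprocessors);
  (void)num_blocks;
  (void)threads_per_block;

  // With this patch, a failure prints cudaGetErrorString() before asserting.
  device.synchronize();
  return 0;
}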