Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h')
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 81 insertions(+), 19 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index c76d1ee3f..821835cf3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -10,7 +10,6 @@
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
-
namespace Eigen {
// This defines an interface that GPUDevice can take to use
@@ -35,12 +34,23 @@ static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices);
- EIGEN_UNUSED_VARIABLE(status)
- assert(status == cudaSuccess);
+ if (status != cudaSuccess) {
+ std::cerr << "Failed to get the number of CUDA devices: "
+ << cudaGetErrorString(status)
+ << std::endl;
+ assert(status == cudaSuccess);
+ }
m_deviceProperties = new cudaDeviceProp[num_devices];
for (int i = 0; i < num_devices; ++i) {
status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
- assert(status == cudaSuccess);
+ if (status != cudaSuccess) {
+ std::cerr << "Failed to initialize CUDA device #"
+ << i
+ << ": "
+ << cudaGetErrorString(status)
+ << std::endl;
+ assert(status == cudaSuccess);
+ }
}
m_devicePropInitialized = true;
}
@@ -110,10 +120,12 @@ class CudaStreamDevice : public StreamInterface {
struct GpuDevice {
// The StreamInterface is not owned: the caller is
// responsible for its initialization and eventual destruction.
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+ eigen_assert(stream);
+ }
+ explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
eigen_assert(stream);
}
-
// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
return stream_->stream();
@@ -199,27 +211,68 @@ struct GpuDevice {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
cudaError_t err = cudaStreamSynchronize(stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
+ if (err != cudaSuccess) {
+ std::cerr << "Error detected in CUDA stream: "
+ << cudaGetErrorString(err)
+ << std::endl;
+ assert(err == cudaSuccess);
+ }
#else
assert(false && "The default device should be used instead to generate kernel code");
#endif
}
- inline int getNumCudaMultiProcessors() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().multiProcessorCount;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int maxCudaThreadsPerBlock() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerBlock;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int maxCudaThreadsPerMultiProcessor() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int sharedMemPerBlock() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().sharedMemPerBlock;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int majorDeviceVersion() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().major;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
+ return stream_->deviceProperties().minor;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+ return max_blocks_;
}
// This function checks if the CUDA runtime recorded an error for the
@@ -235,24 +288,33 @@ struct GpuDevice {
private:
const StreamInterface* stream_;
-
+ int max_blocks_;
};
-
-#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
- (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+#ifndef __CUDA_ARCH__
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
+#else
+#define LAUNCH_CUDA_KERNEL(kernel, ...) \
+ { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
+ eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
+#endif
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
-static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+#ifndef __CUDA_ARCH__
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(config)
+#endif
}
#endif
} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
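
For context, a minimal usage sketch of the interface introduced by this patch: the new two-argument GpuDevice constructor caps the number of blocks a launch may use, and maxBlocks() exposes that cap from both host and device code. The sketch assumes a default-constructible Eigen::CudaStreamDevice (only the class name appears in the hunk above) and must be compiled with nvcc so the host branch of synchronize() is taken; it illustrates the new API and is not part of the patch itself.

#define EIGEN_USE_GPU
#include <algorithm>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Assumed default-constructible; wraps a CUDA stream behind StreamInterface.
  Eigen::CudaStreamDevice stream;

  // New constructor: cap kernel launches at 512 blocks. The one-argument
  // constructor keeps the previous unbounded behaviour (max_blocks_ = INT_MAX),
  // so existing callers are unaffected.
  Eigen::GpuDevice device(&stream, /*num_blocks=*/512);

  // Host-side queries, now guarded by #ifndef __CUDA_ARCH__ in the patch.
  const int max_blocks        = device.maxBlocks();                 // 512
  const int threads_per_block = device.maxCudaThreadsPerBlock();
  const int multiprocessors   = device.getNumCudaMultiProcessors();

  // Example launch configuration derived from the capped block count.
  const int num_blocks = std::min(max_blocks, 32 * multiprocessors);
  (void)num_blocks;
  (void)threads_per_block;

  // With this patch, a failure prints cudaGetErrorString() before asserting.
  device.synchronize();
  return 0;
}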