author     Benoit Steiner <benoit.steiner.goog@gmail.com>  2015-10-23 09:15:34 -0700
committer  Benoit Steiner <benoit.steiner.goog@gmail.com>  2015-10-23 09:15:34 -0700
commit     9ea39ce13c453127844cff474730af119e889cd1 (patch)
tree       a0871558cf3774c503c3a19dca6d5e3eaa6651c0
parent     ac99b4924976cb2d06a1747cd86e792de60f16c3 (diff)
Refined the #ifdef __CUDACC__ guard so that attempting to compile GPU code with a non-CUDA compiler results in a linking error instead of bogus code.
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h      |  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h  | 11
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h    | 78
3 files changed, 54 insertions(+), 37 deletions(-)
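The idea behind the change is to split declaration from definition: everything that names the GPU path stays declared whenever EIGEN_USE_GPU is set, but the bodies (and the kernel launches they contain) are only compiled under __CUDACC__. A host-only compiler can therefore still parse and instantiate calls into the GPU path, and any such call surfaces as an undefined reference at link time rather than silently binding to incomplete or wrong code. A minimal, self-contained sketch of the idiom, independent of Eigen (GpuAssign, kernel and the launch parameters are made-up illustration, not Eigen API):

    // declare_define_split.cu -- illustrative sketch only, not Eigen code.

    // The declaration is always visible, so host-only translation units
    // that mention GpuAssign<T>::run() still compile.
    template <typename T>
    struct GpuAssign {
      static void run(const T& value);
    };

    #if defined(__CUDACC__)
    // The definition (and the kernel it launches) exists only when a CUDA
    // compiler processes this file.
    template <typename T>
    __global__ void kernel(T value) {
      (void)value;  // device-side work would go here
    }

    template <typename T>
    void GpuAssign<T>::run(const T& value) {
      kernel<<<1, 1>>>(value);
    }
    #endif  // __CUDACC__

    int main() {
      // Built with nvcc this runs; built with a plain host compiler it still
      // compiles but fails at link time with an undefined reference to
      // GpuAssign<int>::run -- exactly the failure mode the commit message asks for.
      GpuAssign<int>::run(42);
      return 0;
    }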
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 17f10c07b..7b2485fb7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -106,7 +106,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPool
#endif
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
{
public:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index 2ff7d471d..300ee8ac0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -287,6 +287,7 @@ class StreamInterface {
virtual void deallocate(void* buffer) const = 0;
};
+#if defined(__CUDACC__)
static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;
@@ -362,7 +363,7 @@ class CudaStreamDevice : public StreamInterface {
const cudaStream_t* stream_;
int device_;
};
-
+#endif // __CUDACC__
struct GpuDevice {
// The StreamInterface is not owned: the caller is
@@ -450,7 +451,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#ifndef __CUDA_ARCH__
+#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
cudaError_t err = cudaStreamSynchronize(stream_->stream());
assert(err == cudaSuccess);
#else
@@ -477,8 +478,12 @@ struct GpuDevice {
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
+#ifdef __CUDACC__
cudaError_t error = cudaStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady);
+#else
+ return false;
+#endif
}
private:
@@ -493,10 +498,12 @@ struct GpuDevice {
// FIXME: Should be device and kernel specific.
+#ifdef __CUDACC__
static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
assert(status == cudaSuccess);
}
+#endif
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index b2800aefb..95fc9fec6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -149,7 +149,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
// GPU: the evaluation of the expression is offloaded to a GPU.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, false> {
+ public:
+ typedef typename Expression::Index Index;
+ static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, true> {
+ public:
+ typedef typename Expression::Index Index;
+ static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(__CUDACC__)
+
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
@@ -193,48 +210,41 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
}
}
-
-template<typename Expression>
-class TensorExecutor<Expression, GpuDevice, false>
+/*static*/
+template <typename Expression>
+inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
- public:
- typedef typename Expression::Index Index;
- static inline void run(const Expression& expr, const GpuDevice& device)
+ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
{
- TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
- const int block_size = device.maxCudaThreadsPerBlock();
- const Index size = array_prod(evaluator.dimensions());
- LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
- }
- evaluator.cleanup();
+ const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+ const int block_size = device.maxCudaThreadsPerBlock();
+ const Index size = array_prod(evaluator.dimensions());
+ LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
-};
+ evaluator.cleanup();
+}
+
+/*static*/
template<typename Expression>
-class TensorExecutor<Expression, GpuDevice, true>
+inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
- public:
- typedef typename Expression::Index Index;
- static inline void run(const Expression& expr, const GpuDevice& device)
+ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
{
- TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
- const int block_size = device.maxCudaThreadsPerBlock();
- const Index size = array_prod(evaluator.dimensions());
- LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
- }
- evaluator.cleanup();
+ const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+ const int block_size = device.maxCudaThreadsPerBlock();
+ const Index size = array_prod(evaluator.dimensions());
+ LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
-};
+ evaluator.cleanup();
+}
-#endif
+#endif // __CUDACC__
+#endif // EIGEN_USE_GPU
} // end namespace internal
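For orientation, this is roughly how the new failure mode shows up in client code (a hedged sketch: the file name and expression are invented, it assumes the CUDA runtime headers are on the include path, and exact behaviour depends on the Eigen version). A translation unit built by a plain host compiler with EIGEN_USE_GPU defined still compiles code that evaluates on a GpuDevice, but the link step now reports the undefined TensorExecutor<..., GpuDevice, ...>::run instead of the previous outcome, where the GPU specializations were simply compiled out. Moving such code into a .cu file compiled by nvcc restores the definitions.

    // host_only.cpp -- compiled with g++ -DEIGEN_USE_GPU (illustrative sketch).
    #include <unsupported/Eigen/CXX11/Tensor>

    void assign_on_gpu(const Eigen::GpuDevice& gpu,
                       Eigen::Tensor<float, 1>& out,
                       const Eigen::Tensor<float, 1>& in) {
      // After this commit: compiles, then fails to link with an undefined
      // reference to internal::TensorExecutor<..., GpuDevice, ...>::run.
      // Before this commit: the GPU specializations were #ifdef'ed away
      // entirely, which could let the build go through with bogus code.
      out.device(gpu) = in + in;
    }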