From 9ea39ce13c453127844cff474730af119e889cd1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 23 Oct 2015 09:15:34 -0700
Subject: Refined the #ifdef __CUDACC__ guard to ensure that trying to compile
 gpu code with a non cuda compiler results in a linking error instead of
 bogus code.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorDeviceType.h     | 11 ++-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h       | 78 ++++++++++++----------
 3 files changed, 54 insertions(+), 37 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 17f10c07b..7b2485fb7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -106,7 +106,7 @@ template <typename ExpressionType> class TensorDevice
 class TensorDevice<ExpressionType, GpuDevice>
 {
   public:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index 2ff7d471d..300ee8ac0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -287,6 +287,7 @@ class StreamInterface {
   virtual void deallocate(void* buffer) const = 0;
 };
 
+#if defined(__CUDACC__)
 static cudaDeviceProp* m_deviceProperties;
 static bool m_devicePropInitialized = false;
 
@@ -362,7 +363,7 @@ class CudaStreamDevice : public StreamInterface {
   const cudaStream_t* stream_;
   int device_;
 };
-
+#endif  // __CUDACC__
 
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
@@ -450,7 +451,7 @@ struct GpuDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#ifndef __CUDA_ARCH__
+#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
     cudaError_t err = cudaStreamSynchronize(stream_->stream());
     assert(err == cudaSuccess);
 #else
@@ -477,8 +478,12 @@ struct GpuDevice {
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
+#ifdef __CUDACC__
     cudaError_t error = cudaStreamQuery(stream_->stream());
     return (error == cudaSuccess) || (error == cudaErrorNotReady);
+#else
+    return false;
+#endif
   }
 
  private:
@@ -493,10 +498,12 @@ struct GpuDevice {
 
 // FIXME: Should be device and kernel specific.
+#ifdef __CUDACC__
 static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
   cudaError_t status = cudaDeviceSetSharedMemConfig(config);
   assert(status == cudaSuccess);
 }
+#endif
 
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index b2800aefb..95fc9fec6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -149,7 +149,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, false> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, true> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(__CUDACC__)
+
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
@@ -193,48 +210,41 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
   }
 }
 
-
-template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, false>
+/*static*/
+template <typename Expression>
+inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
 {
-  public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
+  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  if (needs_assign)
   {
-    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
-      const int block_size = device.maxCudaThreadsPerBlock();
-      const Index size = array_prod(evaluator.dimensions());
-      LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
-    }
-    evaluator.cleanup();
+    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+    const int block_size = device.maxCudaThreadsPerBlock();
+    const Index size = array_prod(evaluator.dimensions());
+    LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
-};
+  evaluator.cleanup();
+}
+
+/*static*/
 template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, true>
+inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
 {
-  public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
+  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  if (needs_assign)
   {
-    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
-      const int block_size = device.maxCudaThreadsPerBlock();
-      const Index size = array_prod(evaluator.dimensions());
-      LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
-    }
-    evaluator.cleanup();
+    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
+    const int block_size = device.maxCudaThreadsPerBlock();
+    const Index size = array_prod(evaluator.dimensions());
+    LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
-};
+  evaluator.cleanup();
+}
 
-#endif
+#endif // __CUDACC__
+#endif // EIGEN_USE_GPU
 
 } // end namespace internal
-- 
cgit v1.2.3
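
Note on the pattern used above: the patch declares the GpuDevice specializations of TensorExecutor whenever EIGEN_USE_GPU is set, but only defines their run() methods (and the kernels they launch) when __CUDACC__ is defined. The stand-alone sketch below illustrates the same declare-everywhere / define-only-under-nvcc idea with hypothetical names (gpu_square and square_kernel are illustrative, not Eigen APIs): a host-only translation unit that reaches the GPU path still compiles, but the build fails at link time with an undefined reference instead of silently picking up a bogus implementation.

// gpu_square.h -- hypothetical example, not part of Eigen.
// The declaration is visible to every compiler, so host-only code that calls
// gpu_square() type-checks; it simply cannot link unless the definition in
// gpu_square.cu was compiled by nvcc.
#ifndef GPU_SQUARE_H
#define GPU_SQUARE_H

void gpu_square(float* data, int n);  // declared unconditionally

#endif  // GPU_SQUARE_H

// gpu_square.cu -- compiled with nvcc only. Leaving this file out of the
// build (or feeding it to a plain C++ compiler) produces an
// "undefined reference to gpu_square" link error rather than a silent no-op.
#include <cuda_runtime.h>
#include "gpu_square.h"

__global__ void square_kernel(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= data[i];
}

void gpu_square(float* data, int n) {
  float* d = NULL;
  cudaMalloc(&d, n * sizeof(float));
  cudaMemcpy(d, data, n * sizeof(float), cudaMemcpyHostToDevice);
  square_kernel<<<(n + 255) / 256, 256>>>(d, n);
  cudaMemcpy(data, d, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d);
}

The guards added in TensorDeviceType.h follow the same reasoning: members that merely query state, such as ok(), fall back to a harmless stub when no CUDA compiler is available, while anything that must call into the CUDA runtime is either compiled by nvcc or not emitted at all.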