Diffstat:
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h    | 48
 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h      |  8
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 12
 3 files changed, 46 insertions(+), 22 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index c74613873..0f67f0f57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -10,7 +10,6 @@ #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - namespace Eigen { // This defines an interface that GPUDevice can take to use @@ -206,20 +205,45 @@ struct GpuDevice { #endif } - inline int getNumCudaMultiProcessors() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().multiProcessorCount; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int maxCudaThreadsPerBlock() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().maxThreadsPerBlock; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int maxCudaThreadsPerMultiProcessor() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().maxThreadsPerMultiProcessor; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int sharedMemPerBlock() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const { +#ifndef __CUDA_ARCH__ return stream_->deviceProperties().sharedMemPerBlock; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } - inline int majorDeviceVersion() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef 
__CUDA_ARCH__ return stream_->deviceProperties().major; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif } // This function checks if the CUDA runtime recorded an error for the @@ -239,13 +263,13 @@ struct GpuDevice { }; #ifndef __CUDA_ARCH__ -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); #else -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - { static const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); #endif @@ -260,4 +284,4 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { } // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d93e1de1b..d2ab70f2b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template <typename Expression> class TensorExecutor<Expression, GpuDevice, false> { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; template <typename Expression> class 
TensorExecutor<Expression, GpuDevice, true> { public: typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -213,7 +213,7 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { /*static*/ template <typename Expression> -inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device) { TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); @@ -232,7 +232,7 @@ inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& /*static*/ template<typename Expression> -inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device) +EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device) { TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 3fa3d5c3c..867654aff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -115,8 +115,8 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { internal::is_same<typename Self::CoeffReturnType, float>::value; template <typename OutputType> - static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - eigen_assert(false && "Should only be called on floats"); + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, 
OutputType* output) { + assert(false && "Should only be called on floats"); } static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { @@ -210,11 +210,11 @@ struct InnerReducer<Self, Op, GpuDevice> { internal::is_same<typename Self::CoeffReturnType, float>::value; template <typename Device, typename OutputType> - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; @@ -264,11 +264,11 @@ struct OuterReducer<Self, Op, GpuDevice> { internal::is_same<typename Self::CoeffReturnType, float>::value; template <typename Device, typename OutputType> - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { 
   typedef typename Self::Index Index;
   const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;