path: root/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
author    Igor Babuschkin <igor@babuschk.in>  2016-08-18 17:15:12 +0100
committer Igor Babuschkin <igor@babuschk.in>  2016-08-18 17:15:12 +0100
commit    1569a7d7ab35164b5de9c639cdf01c74a3a72050 (patch)
tree      c9e2f10b4b2947368e30d85ca9b6c89ee2e7c27f /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parent    841e075154c1b423595b54bca3569855b1652cc0 (diff)
Add the necessary CUDA >= 300 checks back
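The guard being restored here is the standard CUDA idiom for code that should only run on newer GPUs: the real body is compiled for __CUDA_ARCH__ >= 300, while older targets (and compilation passes where __CUDA_ARCH__ is not set) get a stub that asserts, so every call site still sees a well-formed function. A minimal sketch of the idiom with an illustrative kernel, not taken from Eigen:

#include <cassert>

// Illustrative kernel: the real work only compiles for compute capability 3.0+,
// older targets get a trapping stub so the function still parses and links,
// and an accidental launch fails loudly instead of silently misbehaving.
__global__ void guarded_scale_kernel(float* out, const float* in, int n) {
#if __CUDA_ARCH__ >= 300
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = 2.0f * in[i];
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}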
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 13
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index af5c71247..fa7364fd6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -23,6 +23,7 @@ namespace internal {
 // updated the content of the output address it will try again.
 template <typename T, typename R>
 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if __CUDA_ARCH__ >= 300
   if (sizeof(T) == 4)
   {
     unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -40,6 +41,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
         return;
       }
     }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
   }
   else if (sizeof(T) == 8) {
     unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
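The 4-byte branch above is a compare-and-swap retry loop: read the current output, fold the thread's partial result into it with the reducer, and let atomicCAS commit the new bit pattern only if no other thread touched the location in the meantime; otherwise retry against the fresher value. A self-contained sketch of the same technique for a float max-reduction (atomicMaxFloat is a hypothetical helper, not part of Eigen):

// Hypothetical helper showing the atomicCAS retry loop used for 4-byte types:
// reinterpret the float as its 32-bit pattern, compute the reduced value, and
// keep retrying until the CAS succeeds, i.e. nobody else updated *output.
__device__ void atomicMaxFloat(float* output, float value) {
  unsigned int* addr = reinterpret_cast<unsigned int*>(output);
  unsigned int oldbits = *addr;
  while (true) {
    float oldval = __uint_as_float(oldbits);
    float newval = fmaxf(oldval, value);
    if (newval == oldval) {
      return;  // nothing to publish, the stored value already dominates
    }
    unsigned int readback = atomicCAS(addr, oldbits, __float_as_uint(newval));
    if (readback == oldbits) {
      return;  // our update won the race
    }
    oldbits = readback;  // lost the race: retry against the fresher value
  }
}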
@@ -98,7 +102,11 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
 template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if __CUDA_ARCH__ >= 300
   atomicAdd(output, accum);
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
 }
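For SumReducer<float> the generic retry loop is bypassed in favour of the hardware atomicAdd above, which exists for float on all architectures this code targets. A comparable add for double, which only gained a native atomicAdd on compute capability 6.0, can be built from the 64-bit CAS pattern of the sizeof(T) == 8 branch; a sketch with a hypothetical helper name:

// Hypothetical double-precision atomic add built on 64-bit atomicCAS,
// mirroring the sizeof(T) == 8 branch of atomicReduce. A native
// atomicAdd(double*, double) only exists on compute capability 6.0+.
__device__ double atomicAddDouble(double* output, double value) {
  unsigned long long* addr = reinterpret_cast<unsigned long long*>(output);
  unsigned long long oldbits = *addr;
  unsigned long long assumed;
  do {
    assumed = oldbits;
    double newval = __longlong_as_double(assumed) + value;
    oldbits = atomicCAS(addr, assumed, __double_as_longlong(newval));
  } while (assumed != oldbits);
  return __longlong_as_double(oldbits);  // value seen just before our update
}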
@@ -116,6 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                     typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if __CUDA_ARCH__ >= 300
   // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
   if (gridDim.x == 1) {
@@ -170,6 +179,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     // Let the last block reset the semaphore
     atomicInc(semaphore, gridDim.x + 1);
   }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
 }
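The trailing atomicInc is what the "reset the semaphore" comment refers to: atomicInc(addr, bound) returns the old value and stores ((old >= bound) ? 0 : old + 1), so the final block's increment wraps the counter back to 0 and leaves the semaphore ready for the next launch. The same wrap-around makes for a compact "am I the last block?" test; a stripped-down sketch with illustrative names:

// Illustrative last-block detection via atomicInc's wrap-around: each block
// bumps a global counter once; the block whose increment wraps the counter
// back to 0 knows every other block has already retired.
__device__ unsigned int retirement_count = 0;

__global__ void notice_last_block(int* last_block_id) {
  // ... per-block work would go here ...
  __threadfence();  // make this block's global writes visible before retiring
  if (threadIdx.x == 0) {
    // Returns the previous count; stores 0 once the previous count reaches
    // gridDim.x - 1, i.e. when this block is the final one to retire.
    unsigned int ticket = atomicInc(&retirement_count, gridDim.x - 1);
    if (ticket == gridDim.x - 1) {
      *last_block_id = blockIdx.x;  // we were last; the counter is 0 again
    }
  }
}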
@@ -684,7 +696,6 @@ struct OuterReducer<Self, Op, GpuDevice> {
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                   internal::is_same<typename Self::CoeffReturnType, double>::value);
-
   template <typename Device, typename OutputType>
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
     assert(false && "Should only be called to reduce doubles or floats on a gpu device");
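This last hunk touches the dispatch side: HasOptimizedImplementation advertises at compile time whether the GPU path can handle the coefficient type, and the catch-all run() overload exists only to satisfy the interface, asserting if it is ever reached. A minimal sketch of that trait-gated pattern with illustrative names, not Eigen's actual classes:

#include <cassert>
#include <type_traits>

// Illustrative trait-gated dispatcher: the backend declares which scalar types
// it has a fast path for, and the generic overload is only a loud fallback
// that callers are expected to avoid by checking the trait first.
template <typename Scalar>
struct GpuSumLauncher {
  static const bool HasOptimizedImplementation =
      std::is_same<Scalar, float>::value || std::is_same<Scalar, double>::value;

  template <typename OutputType>
  static bool run(const Scalar*, OutputType*, int) {
    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;  // return value only matters on the specialized paths
  }
};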