author     Benoit Steiner <benoit.steiner.goog@gmail.com>  2017-08-31 02:49:39 +0000
committer  Benoit Steiner <benoit.steiner.goog@gmail.com>  2017-08-31 02:49:39 +0000
commit     a4089991eb6bdb9e8ebfef93d81ca7b5e67ea77d (patch)
tree       49a9b6c0c4ec6d006debe862cf209a8f252cfe78 /unsupported
parent     304ef2957134be386e50592ad7120177c5f3a7c0 (diff)
Added support for CUDA 9.0.
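CUDA 9.0 deprecates the implicit whole-warp shuffle intrinsics (__shfl, __shfl_up, __shfl_down, __shfl_xor) in favor of *_sync variants that take an explicit member mask, so every shuffle call site below is guarded on the toolkit version. A minimal sketch of that pattern, not taken from the patch and assuming EIGEN_CUDACC_VER is defined by the Eigen headers as in the hunks below:

// Illustrative sketch only: a butterfly warp-sum reduction that picks the
// pre- or post-CUDA-9 shuffle intrinsic, mirroring the EIGEN_CUDACC_VER
// guard used throughout this commit.
__device__ inline float warpSum(float val) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
  // CUDA < 9.0: the mask-less intrinsics are still available.
  for (int mask = warpSize / 2; mask > 0; mask /= 2)
    val += __shfl_xor(val, mask);
#else
  // CUDA >= 9.0: the *_sync variants require an explicit member mask
  // (0xFFFFFFFF means all 32 lanes of the warp participate).
  for (int mask = warpSize / 2; mask > 0; mask /= 2)
    val += __shfl_xor_sync(0xFFFFFFFF, val, mask);
#endif
  return val;
}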
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h |  9
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h   | 63
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_cuda.cu                |  6
-rw-r--r--  unsupported/test/cxx11_tensor_cast_float16_cuda.cu          |  6
-rw-r--r--  unsupported/test/cxx11_tensor_complex_cuda.cu               |  6
-rw-r--r--  unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu     |  6
-rw-r--r--  unsupported/test/cxx11_tensor_contract_cuda.cu              |  6
-rw-r--r--  unsupported/test/cxx11_tensor_cuda.cu                       |  6
-rw-r--r--  unsupported/test/cxx11_tensor_device.cu                     |  6
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_cuda.cu            |  6
-rw-r--r--  unsupported/test/cxx11_tensor_random_cuda.cu                |  6
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_cuda.cu             |  6
-rw-r--r--  unsupported/test/cxx11_tensor_scan_cuda.cu                  |  6
13 files changed, 118 insertions, 20 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 428b18499..903bc51cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
// the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation.
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
#define reduceRow(i, mask) \
shuffleInc(i, 0, mask); \
@@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
x1 = rhs_pf0.x;
x2 = rhs_pf0.z;
}
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4);
+ #else
+ x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+ x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+ #endif
if((threadIdx.x%8) < 4) {
rhs_pf0.y = x1;
rhs_pf0.w = x2;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 974eb7deb..ebcbd6f41 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
else {
assert(0 && "Wordsize not supported");
}
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
}
// We extend atomicExch to support extra data types
@@ -104,9 +104,9 @@ template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if EIGEN_CUDA_ARCH >= 300
atomicAdd(output, accum);
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
}
@@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+ #else
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+ #endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
// Let the last block reset the semaphore
atomicInc(semaphore, gridDim.x + 1);
}
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
}
@@ -223,12 +227,14 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
- if (gridDim.x == 1 && first_index == 0) {
- if (num_coeffs % 2 != 0) {
- half last = input.m_impl.coeff(num_coeffs-1);
- *scratch = __halves2half2(last, reducer.initialize());
- } else {
- *scratch = reducer.template initializePacket<half2>();
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
}
__syncthreads();
}
@@ -244,19 +250,25 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+ #else
+ int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
+ reducer.reducePacket(*(half2*)(&temp), &accum);
+ #endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(scratch, accum, reducer);
}
- __syncthreads();
-
- if (gridDim.x == 1 && first_index == 0) {
- half tmp = __low2half(*scratch);
- reducer.reduce(__high2half(*scratch), &tmp);
- *output = tmp;
+ if (gridDim.x == 1) {
+ __syncthreads();
+ if (first_index == 0) {
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+ }
}
}
@@ -425,7 +437,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+ #else
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+ #endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
}
}
}
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_CUDA_FP16
@@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+ #else
+ int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize);
+ int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize);
+ reducer.reducePacket(*(half2*)(&temp1), &reduced_val1);
+ reducer.reducePacket(*(half2*)(&temp2), &reduced_val2);
+ #endif
}
half val1 = __low2half(reduced_val1);
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 3d73d491a..0e8b8125d 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -15,6 +15,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
template <int Layout>
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
index 816e03220..dabf9e45f 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -16,6 +16,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
void test_cuda_conversion() {
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
index a52350f85..d25e1bee1 100644
--- a/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -14,6 +14,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
void test_cuda_nullary() {
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
index aac780905..4f0f621b4 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -14,6 +14,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
template<typename T>
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index e821ccf0c..c68287e34 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -17,6 +17,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 9584a539f..d9059a2dc 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -15,6 +15,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
void test_cuda_nullary() {
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index cbb43e210..d5bfeeb39 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -16,6 +16,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
using Eigen::RowMajor;
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index b3aab0b9d..c9f3ae1ae 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -16,6 +16,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
template<typename>
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index fa1a46732..9d08605fc 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -16,6 +16,12 @@
#include "main.h"
#include <Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
void test_cuda_random_uniform()
{
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index ec0669704..d6ce04f1c 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -15,6 +15,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
template<typename Type, int DataLayout>
static void test_full_reductions() {
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
index de1c0ac95..e99724b91 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -16,6 +16,12 @@
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
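A note on the half2 reductions in TensorReductionCuda.h above: the CUDA 9 branches shuffle a half2 by moving its 32-bit payload through the int overload of __shfl_down_sync and reinterpreting it back before reducing. A minimal standalone sketch of that pattern, assuming cuda_fp16.h; warpShflDownHalf2 is a hypothetical helper and not part of the patch:

#include <cuda_fp16.h>

// Illustrative sketch of the half2 shuffle pattern used in the CUDA 9 branches:
// shuffle the 32-bit payload as an int, then reinterpret it back as half2.
__device__ inline half2 warpShflDownHalf2(half2 v, int offset) {
  int bits = *(int*)(&v);  // reinterpret the two packed halves as one 32-bit int
  bits = __shfl_down_sync(0xFFFFFFFF, bits, (unsigned)offset, warpSize);
  return *(half2*)(&bits); // reinterpret the shuffled bits back to half2
}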