about | summary | refs | log | tree | commit | diff | homepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
diff options
context:
space:
mode:
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2017-08-31 02:49:39 +0000
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2017-08-31 02:49:39 +0000
commit a4089991eb6bdb9e8ebfef93d81ca7b5e67ea77d (patch)
tree 49a9b6c0c4ec6d006debe862cf209a8f252cfe78 /unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
parent 304ef2957134be386e50592ad7120177c5f3a7c0 (diff)
Added support for CUDA 9.0.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h')
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 9
1 files changed, 9 insertions, 0 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 428b18499..903bc51cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
// the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation.
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
#define reduceRow(i, mask) \
shuffleInc(i, 0, mask); \
@@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
x1 = rhs_pf0.x;
x2 = rhs_pf0.z;
}
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4);
+ #else
+ x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+ x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+ #endif
if((threadIdx.x%8) < 4) {
rhs_pf0.y = x1;
rhs_pf0.w = x2;