From 1bb6fa99a31d2dcf5431087d3f238e2dcca03084 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 20 Jun 2018 16:44:58 -0400 Subject: merging the CUDA and HIP implementation for the Tensor directory and the unit tests --- .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 190 +++++++++++++++------ 1 file changed, 136 insertions(+), 54 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index ebcbd6f41..ca854d670 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -7,23 +7,23 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H namespace Eigen { namespace internal { -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) // Full reducers for GPU, don't vectorize for now -// Reducer function that enables multiple cuda thread to safely accumulate at the same +// Reducer function that enables multiple gpu threads to safely accumulate at the same // output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another cuda thread +// attempts to update it with the new value. If in the meantime another gpu thread // updated the content of the output address it will try again. 
template __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) if (sizeof(T) == 4) { unsigned int oldval = *reinterpret_cast(output); @@ -79,7 +79,7 @@ __device__ inline double atomicExchCustom(double* address, double val) { return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); } -#ifdef EIGEN_HAS_CUDA_FP16 +#ifdef EIGEN_HAS_GPU_FP16 template