From 1bb6fa99a31d2dcf5431087d3f238e2dcca03084 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 20 Jun 2018 16:44:58 -0400 Subject: merging the CUDA and HIP implementation for the Tensor directory and the unit tests --- cmake/EigenTesting.cmake | 3 +- unsupported/Eigen/CXX11/Tensor | 24 +- .../Eigen/CXX11/src/Tensor/TensorContractionGpu.h | 189 +++++- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 136 ++-- .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 173 ++--- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 21 +- .../CXX11/src/Tensor/TensorGpuHipCudaDefines.h | 86 +++ .../CXX11/src/Tensor/TensorGpuHipCudaUndefines.h | 39 ++ .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 190 ++++-- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 8 +- unsupported/test/CMakeLists.txt | 49 +- unsupported/test/cxx11_tensor_argmax_gpu.cu | 90 +-- unsupported/test/cxx11_tensor_cast_float16_gpu.cu | 10 +- .../test/cxx11_tensor_complex_cwise_ops_gpu.cu | 2 +- unsupported/test/cxx11_tensor_complex_gpu.cu | 8 +- unsupported/test/cxx11_tensor_contract_gpu.cu | 92 +-- unsupported/test/cxx11_tensor_device.cu | 58 +- unsupported/test/cxx11_tensor_gpu.cu | 706 +++++++++++---------- unsupported/test/cxx11_tensor_of_float16_gpu.cu | 80 +-- unsupported/test/cxx11_tensor_random_gpu.cu | 29 +- unsupported/test/cxx11_tensor_reduction_gpu.cu | 10 +- unsupported/test/cxx11_tensor_scan_gpu.cu | 25 +- 22 files changed, 1216 insertions(+), 812 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index f818ae840..1d4486c05 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -20,7 +20,8 @@ macro(ei_add_test_internal testname testname_with_suffix) if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) if(EIGEN_TEST_HIP) - hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP") + hip_reset_flags() + hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}") elseif(EIGEN_TEST_CUDA_CLANG) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) if(CUDA_64_BIT_DEVICE_CODE) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 4b7c7d724..ddbbcfba2 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -99,11 +99,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" -#if defined(EIGEN_USE_HIP) - #include "src/Tensor/TensorDeviceHip.h" -#else - #include "src/Tensor/TensorDeviceCuda.h" -#endif +#include "src/Tensor/TensorDeviceGpu.h" #include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorDimensionList.h" @@ -120,28 +116,16 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" -#if defined(EIGEN_USE_HIP) - #include "src/Tensor/TensorReductionHip.h" -#else - #include "src/Tensor/TensorReductionCuda.h" -#endif +#include "src/Tensor/TensorReductionGpu.h" #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" -#if defined(EIGEN_USE_HIP) - #include 
"src/Tensor/TensorContractionHip.h" -#else - #include "src/Tensor/TensorContractionCuda.h" -#endif +#include "src/Tensor/TensorContractionGpu.h" #include "src/Tensor/TensorConversion.h" -#if defined(EIGEN_USE_HIP) - #include "src/Tensor/TensorConvolutionHip.h" -#else - #include "src/Tensor/TensorConvolution.h" -#endif +#include "src/Tensor/TensorConvolution.h" #include "src/Tensor/TensorFFT.h" #include "src/Tensor/TensorPatch.h" #include "src/Tensor/TensorImagePatch.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index 903bc51cc..238754424 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -9,10 +9,10 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) namespace Eigen { @@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, // the sum across all big k blocks of the product of little k block of index (x, y) // with block of index (y, z). To compute the final output, we need to reduce // the 8 threads over y by summation. -#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) #else #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) @@ -503,7 +503,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, template __global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(512, 1) +#else __launch_bounds__(512) +#endif EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, const Index n_size, const Index k_size) { @@ -542,7 +546,45 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh results[i].x = results[i].y = results[i].z = results[i].w = 0; } +#if defined(EIGEN_HIPCC) +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row + 3, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row + 3, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + +#define prefetch_rhs_hipcc(reg, row, col) \ + reg.x =rhs(row + 0, col); \ + reg.y =rhs(row + 1, col); \ + reg.z =rhs(row + 2, col); \ + reg.w =rhs(row + 3, col); \ + + +#else + #define prefetch_lhs(reg, row, col) \ if (!CHECK_LHS_BOUNDARY) { \ if (col < k_size) { \ @@ -563,14 +605,21 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh reg.x =lhs(row + 0, col); \ } \ } \ - } \ + } \ +#endif 
Index lhs_vert = base_m+threadIdx.x*4; for (Index k = 0; k < k_size; k += 16) { + +#if defined(EIGEN_HIPCC) + lhs_pf0 = make_float4(0, 0, 0, 0); + rhs_pf0 = make_float4(0, 0, 0, 0); +#else lhs_pf0 = internal::pset1(0); rhs_pf0 = internal::pset1(0); +#endif Index lhs_horiz = threadIdx.y+k; prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) @@ -581,7 +630,11 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY +#if defined(EIGEN_HIPCC) + prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0) +#else rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); +#endif } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -596,7 +649,11 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } else { if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0) +#else rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); +#endif } else if ((rhs_vert + 2) < k_size) { rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); @@ -618,7 +675,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh x1 = rhs_pf0.x; x2 = rhs_pf0.z; } - #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) x1 = __shfl_xor(x1, 4); x2 = __shfl_xor(x2, 4); #else @@ -695,7 +752,11 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh #undef prefetch_lhs #undef add_vals - + +#if defined(EIGEN_HIPCC) +#undef prefetch_rhs_hipcc +#endif + Index horiz_base = threadIdx.y*4+base_n; if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { for (int i = 0; i < 4; i++) { @@ -784,9 +845,33 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, results[i].x = results[i].y = results[i].z = results[i].w = 0; } +#if defined(EIGEN_HIPCC) + +#define prefetch_lhs_hipcc(reg, row, col) \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row + 3, col); + +#define prefetch_rhs_hipcc(reg, row, col) \ + reg.x =rhs(row + 0, col); \ + reg.y =rhs(row + 1, col); \ + reg.z =rhs(row + 2, col); \ + reg.w =rhs(row + 3, col); + +#endif Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; for (Index k = 0; k < k_size; k += 32) { +#if defined(EIGEN_HIPCC) + lhs_pf0 = make_float4(0, 0, 0, 0); + lhs_pf1 = make_float4(0, 0, 0, 0); + lhs_pf2 = make_float4(0, 0, 0, 0); + lhs_pf3 = make_float4(0, 0, 0, 0); + + rhs_pf0 = make_float4(0, 0, 0, 0); + rhs_pf1 = make_float4(0, 0, 0, 0); +#else lhs_pf0 = internal::pset1(0); lhs_pf1 = internal::pset1(0); lhs_pf2 = internal::pset1(0); @@ -794,40 +879,85 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, rhs_pf0 = internal::pset1(0); rhs_pf1 = internal::pset1(0); +#endif if (!CHECK_LHS_BOUNDARY) { if ((threadIdx.y/4+k+24) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k)) + prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8)) + prefetch_lhs_hipcc(lhs_pf2, lhs_vert, (threadIdx.y/4+k+16)) + prefetch_lhs_hipcc(lhs_pf3, lhs_vert, (threadIdx.y/4+k+24)) +#else lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); 
lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); +#endif } else if ((threadIdx.y/4+k+16) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k)) + prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8)) + prefetch_lhs_hipcc(lhs_pf2, lhs_vert, (threadIdx.y/4+k+16)) +#else lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); +#endif } else if ((threadIdx.y/4+k+8) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k)) + prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8)) +#else lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); +#endif } else if ((threadIdx.y/4+k) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k)) +#else lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); +#endif } } else { // just CHECK_LHS_BOUNDARY if (lhs_vert + 3 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k)) + prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8)) + prefetch_lhs_hipcc(lhs_pf2, lhs_vert, (threadIdx.y/4+k+16)) + prefetch_lhs_hipcc(lhs_pf3, lhs_vert, (threadIdx.y/4+k+24)) +#else lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); +#endif } else if ((threadIdx.y/4+k+16) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k)) + prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8)) + prefetch_lhs_hipcc(lhs_pf2, lhs_vert, (threadIdx.y/4+k+16)) +#else lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); +#endif } else if ((threadIdx.y/4+k+8) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k)) + prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8)) +#else lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); +#endif } else if ((threadIdx.y/4+k) < k_size) { +#if defined(EIGEN_HIPCC) + prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k)) +#else lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); +#endif } } else if (lhs_vert + 2 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { @@ -916,8 +1046,13 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY +#if defined(EIGEN_HIPCC) + prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0) + prefetch_rhs_hipcc(rhs_pf1, rhs_vert, rhs_horiz1) +#else rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); +#endif } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -939,8 +1074,13 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (rhs_horiz1 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY +#if defined(EIGEN_HIPCC) + 
prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0) + prefetch_rhs_hipcc(rhs_pf1, rhs_vert, rhs_horiz1) +#else rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); +#endif } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -961,7 +1101,11 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } else if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY +#if defined(EIGEN_HIPCC) + prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0) +#else rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); +#endif } else if ((rhs_vert + 2) < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -1069,7 +1213,11 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, __syncthreads(); } // end loop over k - +#if defined(EIGEN_HIPCC) +#undef prefetch_lhs_hipcc +#undef prefetch_rhs_hipcc +#endif + __syncthreads(); Index horiz_base = (threadIdx.y/4)*8+base_n; if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { @@ -1134,7 +1282,11 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, template __global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else __launch_bounds__(256) +#endif EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, const Index n_size, const Index k_size) { @@ -1177,7 +1329,11 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, template __global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else __launch_bounds__(256) +#endif EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, const Index n_size, const Index k_size) { @@ -1323,7 +1479,7 @@ struct TensorEvaluator), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + LAUNCH_GPU_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); } }; @@ -1334,13 +1490,13 @@ struct TensorEvaluator), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); } else { const Index m_blocks = (m + 127) / 128; const Index n_blocks = (n + 63) / 64; const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 block_size(8, 32, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); } } }; @@ -1384,12 +1540,17 @@ struct TensorEvaluator::Run(lhs, rhs, output, m, n, k, this->m_device); } }; } // end namespace Eigen -#endif // EIGEN_USE_GPU and EIGEN_CUDACC -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#endif // EIGEN_USE_GPU and EIGEN_GPUCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 84d5be173..3110887e1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -54,8 +54,8 @@ class IndexMapper { } } - array cudaInputDimensions; - array cudaOutputDimensions; + array gpuInputDimensions; + array gpuOutputDimensions; array tmp = dimensions; array ordering; const size_t offset = 
static_cast(Layout) == static_cast(ColMajor) @@ -65,8 +65,8 @@ class IndexMapper { const Index index = i + offset; ordering[index] = indices[i]; tmp[indices[i]] = -1; - cudaInputDimensions[index] = input_dims[indices[i]]; - cudaOutputDimensions[index] = dimensions[indices[i]]; + gpuInputDimensions[index] = input_dims[indices[i]]; + gpuOutputDimensions[index] = dimensions[indices[i]]; } int written = static_cast(Layout) == static_cast(ColMajor) @@ -75,8 +75,8 @@ class IndexMapper { for (int i = 0; i < NumDims; ++i) { if (tmp[i] >= 0) { ordering[written] = i; - cudaInputDimensions[written] = input_dims[i]; - cudaOutputDimensions[written] = dimensions[i]; + gpuInputDimensions[written] = input_dims[i]; + gpuOutputDimensions[written] = dimensions[i]; ++written; } } @@ -89,37 +89,37 @@ class IndexMapper { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { if (i > NumKernelDims) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; + m_gpuInputStrides[i] = + m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; + m_gpuOutputStrides[i] = + m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; } } } else { for (int i = NumDims - 1; i >= 0; --i) { if (static_cast(i + 1) < offset) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; + m_gpuInputStrides[i] = + m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; + m_gpuOutputStrides[i] = + m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; } } } } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaInputStrides[d]; + const Index idx = p / m_gpuInputStrides[d]; inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; + p -= idx * m_gpuInputStrides[d]; } inputIndex += p * m_inputStrides[NumKernelDims]; } else { @@ -128,22 +128,22 @@ class IndexMapper { limit = NumDims - NumKernelDims - 1; } for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaInputStrides[d]; + const Index idx = p / m_gpuInputStrides[d]; inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; + p -= idx * m_gpuInputStrides[d]; } inputIndex += p * m_inputStrides[limit]; } return inputIndex; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { Index outputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaOutputStrides[d]; + const Index idx = p / m_gpuOutputStrides[d]; outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; } outputIndex += p * m_outputStrides[NumKernelDims]; } else { @@ -152,44 +152,44 @@ class 
IndexMapper { limit = NumDims - NumKernelDims - 1; } for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaOutputStrides[d]; + const Index idx = p / m_gpuOutputStrides[d]; outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; } outputIndex += p * m_outputStrides[limit]; } return outputIndex; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_inputStrides[offset]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_outputStrides[offset]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; @@ -197,7 +197,7 @@ class IndexMapper { k * m_inputStrides[offset + 2]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; @@ -209,8 +209,8 @@ class IndexMapper { static const int NumDims = internal::array_size::value; array m_inputStrides; array m_outputStrides; - array m_cudaInputStrides; - array m_cudaOutputStrides; + array m_gpuInputStrides; + array m_gpuOutputStrides; }; @@ -553,7 +553,7 @@ struct TensorEvaluator struct GetKernelSize { @@ -576,8 +576,12 @@ __global__ void EigenConvolutionKernel1D( indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; - +#endif + const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); @@ -588,18 +592,18 @@ __global__ void EigenConvolutionKernel1D( for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = threadIdx.y * num_x_input; #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x); s[i + plane_kernel_offset] = eval.coeff(tensor_index); } __syncthreads(); // Compute the convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); #pragma unroll for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { @@ -609,7 +613,7 @@ __global__ void EigenConvolutionKernel1D( for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { result += s[k + kernel_offset] * kernel[k]; } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x); buffer[tensor_index] = result; } __syncthreads(); @@ -625,7 +629,11 @@ __global__ void EigenConvolutionKernel2D( const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; +#endif const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; @@ -642,7 +650,7 @@ __global__ void EigenConvolutionKernel2D( for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = threadIdx.z * num_y_input; // Load inputs to shared memory @@ -651,7 +659,7 @@ __global__ void EigenConvolutionKernel2D( const int input_offset = num_x_input * (j + plane_kernel_offset); #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y); s[i + input_offset] = eval.coeff(tensor_index); } } @@ -659,7 +667,7 @@ __global__ void EigenConvolutionKernel2D( __syncthreads(); // Convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); #pragma unroll for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { @@ -675,7 +683,7 @@ __global__ void EigenConvolutionKernel2D( result += s[k + input_offset] * kernel[k + kernel_offset]; } } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y); buffer[tensor_index] = result; } } @@ -693,7 +701,11 @@ __global__ void EigenConvolutionKernel3D( const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else extern __shared__ float s[]; +#endif // Load inputs to shared memory const int first_x = blockIdx.x * maxX; @@ -710,13 +722,13 @@ __global__ void EigenConvolutionKernel3D( for (int p = 0; p < numPlanes; ++p) { - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = 0; for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); } } @@ -728,7 +740,7 @@ __global__ void EigenConvolutionKernel3D( const int num_z_output = last_z - first_z + 1; const int num_y_output = last_y - first_y + 1; const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { @@ -741,7 +753,7 @@ __global__ void EigenConvolutionKernel3D( } } } - const int 
tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); buffer[tensor_index] = result; } } @@ -854,9 +866,9 @@ struct TensorEvaluator::Dimensions InputDims; const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); + const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); const int warpSize = 32; switch (NumKernelDims) { @@ -908,15 +920,15 @@ struct TensorEvaluator, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); break; } case 7: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); } } break; @@ -969,11 +981,11 @@ struct TensorEvaluator, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); break; } } @@ -982,18 +994,18 @@ struct TensorEvaluator, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, 
maxX, numY, maxY, 7, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); break; } } break; } default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); break; } } @@ -1039,7 +1051,7 @@ struct TensorEvaluator indexMapper( m_inputImpl.dimensions(), kernel_dims, indices); - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); break; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index ded7129da..64ef32793 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -7,21 +7,26 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H + +// This header file contains the defines for the gpu* macros, which will resolve to +// their equivalent hip* or cuda* versions depending on the compiler in use. +// A separate header (included at the end of this file) will undefine them all. +#include "TensorGpuHipCudaDefines.h" namespace Eigen { -static const int kCudaScratchSize = 1024; +static const int kGpuScratchSize = 1024; // This defines an interface that GPUDevice can take to use -// CUDA streams underneath. +// HIP / CUDA streams underneath. class StreamInterface { public: virtual ~StreamInterface() {} - virtual const cudaStream_t& stream() const = 0; - virtual const cudaDeviceProp& deviceProperties() const = 0; + virtual const gpuStream_t& stream() const = 0; + virtual const gpuDeviceProp_t& deviceProperties() const = 0; // Allocate memory on the actual device where the computation will run virtual void* allocate(size_t num_bytes) const = 0; @@ -37,7 +42,7 @@ class StreamInterface { virtual unsigned int* semaphore() const = 0; }; -static cudaDeviceProp* m_deviceProperties; +static gpuDeviceProp_t* m_deviceProperties; static bool m_devicePropInitialized = false; static void initializeDeviceProp() { @@ -58,23 +63,23 @@ static void initializeDeviceProp() { #endif // We're the first thread to reach this point.
int num_devices; - cudaError_t status = cudaGetDeviceCount(&num_devices); - if (status != cudaSuccess) { - std::cerr << "Failed to get the number of CUDA devices: " - << cudaGetErrorString(status) + gpuError_t status = gpuGetDeviceCount(&num_devices); + if (status != gpuSuccess) { + std::cerr << "Failed to get the number of GPU devices: " + << gpuGetErrorString(status) << std::endl; - assert(status == cudaSuccess); + assert(status == gpuSuccess); } - m_deviceProperties = new cudaDeviceProp[num_devices]; + m_deviceProperties = new gpuDeviceProp_t[num_devices]; for (int i = 0; i < num_devices; ++i) { - status = cudaGetDeviceProperties(&m_deviceProperties[i], i); - if (status != cudaSuccess) { - std::cerr << "Failed to initialize CUDA device #" + status = gpuGetDeviceProperties(&m_deviceProperties[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" << i << ": " - << cudaGetErrorString(status) + << gpuGetErrorString(status) << std::endl; - assert(status == cudaSuccess); + assert(status == gpuSuccess); } } @@ -94,87 +99,87 @@ static void initializeDeviceProp() { } } -static const cudaStream_t default_stream = cudaStreamDefault; +static const gpuStream_t default_stream = gpuStreamDefault; -class CudaStreamDevice : public StreamInterface { +class GpuStreamDevice : public StreamInterface { public: // Use the default stream on the current device - CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - cudaGetDevice(&device_); + GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + gpuGetDevice(&device_); initializeDeviceProp(); } // Use the default stream on the specified device - CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { initializeDeviceProp(); } // Use the specified stream. Note that it's the // caller responsibility to ensure that the stream can run on // the specified device. If no device is specified the code // assumes that the stream is associated to the current gpu device. 
- CudaStreamDevice(const cudaStream_t* stream, int device = -1) + GpuStreamDevice(const gpuStream_t* stream, int device = -1) : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { if (device < 0) { - cudaGetDevice(&device_); + gpuGetDevice(&device_); } else { int num_devices; - cudaError_t err = cudaGetDeviceCount(&num_devices); + gpuError_t err = gpuGetDeviceCount(&num_devices); EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + assert(err == gpuSuccess); assert(device < num_devices); device_ = device; } initializeDeviceProp(); } - virtual ~CudaStreamDevice() { + virtual ~GpuStreamDevice() { if (scratch_) { deallocate(scratch_); } } - const cudaStream_t& stream() const { return *stream_; } - const cudaDeviceProp& deviceProperties() const { + const gpuStream_t& stream() const { return *stream_; } + const gpuDeviceProp_t& deviceProperties() const { return m_deviceProperties[device_]; } virtual void* allocate(size_t num_bytes) const { - cudaError_t err = cudaSetDevice(device_); + gpuError_t err = gpuSetDevice(device_); EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + assert(err == gpuSuccess); void* result; - err = cudaMalloc(&result, num_bytes); - assert(err == cudaSuccess); + err = gpuMalloc(&result, num_bytes); + assert(err == gpuSuccess); assert(result != NULL); return result; } virtual void deallocate(void* buffer) const { - cudaError_t err = cudaSetDevice(device_); + gpuError_t err = gpuSetDevice(device_); EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + assert(err == gpuSuccess); assert(buffer != NULL); - err = cudaFree(buffer); - assert(err == cudaSuccess); + err = gpuFree(buffer); + assert(err == gpuSuccess); } virtual void* scratchpad() const { if (scratch_ == NULL) { - scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); + scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); } return scratch_; } virtual unsigned int* semaphore() const { if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + kCudaScratchSize; + char* scratch = static_cast(scratchpad()) + kGpuScratchSize; semaphore_ = reinterpret_cast(scratch); - cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + assert(err == gpuSuccess); } return semaphore_; } private: - const cudaStream_t* stream_; + const gpuStream_t* stream_; int device_; mutable void* scratch_; mutable unsigned int* semaphore_; @@ -190,7 +195,7 @@ struct GpuDevice { eigen_assert(stream); } // TODO(bsteiner): This is an internal API, we should not expose it. 
- EIGEN_STRONG_INLINE const cudaStream_t& stream() const { + EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); } @@ -211,11 +216,11 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef EIGEN_CUDA_ARCH - cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + assert(err == gpuSuccess); #else EIGEN_UNUSED_VARIABLE(dst); EIGEN_UNUSED_VARIABLE(src); @@ -225,24 +230,24 @@ struct GpuDevice { } EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); + gpuError_t err = + gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + assert(err == gpuSuccess); } EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); + gpuError_t err = + gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + assert(err == gpuSuccess); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef EIGEN_CUDA_ARCH - cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); + assert(err == gpuSuccess); #else eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif @@ -260,31 +265,31 @@ struct GpuDevice { EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. + // there is no l3 cache on hip/cuda devices. 
return firstLevelCacheSize(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH) - cudaError_t err = cudaStreamSynchronize(stream_->stream()); - if (err != cudaSuccess) { - std::cerr << "Error detected in CUDA stream: " - << cudaGetErrorString(err) +#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE) + gpuError_t err = gpuStreamSynchronize(stream_->stream()); + if (err != gpuSuccess) { + std::cerr << "Error detected in GPU stream: " + << gpuGetErrorString(err) << std::endl; - assert(err == cudaSuccess); + assert(err == gpuSuccess); } #else assert(false && "The default device should be used instead to generate kernel code"); #endif } - EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { + EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; } - EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { + EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; } - EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { + EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { return stream_->deviceProperties().maxThreadsPerMultiProcessor; } EIGEN_STRONG_INLINE int sharedMemPerBlock() const { @@ -301,12 +306,12 @@ struct GpuDevice { return max_blocks_; } - // This function checks if the CUDA runtime recorded an error for the + // This function checks if the GPU runtime recorded an error for the // underlying stream device. inline bool ok() const { -#ifdef EIGEN_CUDACC - cudaError_t error = cudaStreamQuery(stream_->stream()); - return (error == cudaSuccess) || (error == cudaErrorNotReady); +#ifdef EIGEN_GPUCC + gpuError_t error = gpuStreamQuery(stream_->stream()); + return (error == gpuSuccess) || (error == gpuErrorNotReady); #else return false; #endif @@ -317,18 +322,27 @@ struct GpuDevice { int max_blocks_; }; -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ +#if defined(EIGEN_HIPCC) + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ + assert(hipGetLastError() == hipSuccess); + +#else + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); - +#endif + // FIXME: Should be device and kernel specific. 
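Since LAUNCH_GPU_KERNEL now hides the backend-specific launch syntax (the triple-chevron form under nvcc, hipLaunchKernelGGL under hipcc), a host-side caller only needs a GpuDevice built on a GpuStreamDevice. A minimal sketch of such a call site (not part of this patch; the kernel and function names are hypothetical, and EIGEN_USE_GPU plus the Tensor headers are assumed to be in effect):

    __global__ void FillOnes(float* out, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = 1.0f;
    }

    void fill_ones(float* gpu_out, int n) {
      Eigen::GpuStreamDevice stream;    // default stream on the current device
      Eigen::GpuDevice device(&stream);
      const int block_size = device.maxGpuThreadsPerBlock();
      const int num_blocks = (n + block_size - 1) / block_size;
      // Expands to FillOnes<<<...>>> under nvcc and to hipLaunchKernelGGL under hipcc,
      // followed by an assert that the launch succeeded.
      LAUNCH_GPU_KERNEL(FillOnes, num_blocks, block_size, 0, device, gpu_out, n);
      device.synchronize();             // blocks until the device stream has drained
    }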
-#ifdef EIGEN_CUDACC -static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef EIGEN_CUDA_ARCH - cudaError_t status = cudaDeviceSetSharedMemConfig(config); +#ifdef EIGEN_GPUCC +static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t status = gpuDeviceSetSharedMemConfig(config); EIGEN_UNUSED_VARIABLE(status) - assert(status == cudaSuccess); + assert(status == gpuSuccess); #else EIGEN_UNUSED_VARIABLE(config) #endif @@ -337,4 +351,7 @@ static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig } // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H +// undefine all the gpu* macros we defined at the beginning of the file +#include "TensorGpuHipCudaUndefines.h" + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 8bbe449cc..1181c2753 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -250,28 +250,17 @@ inline void TensorExecutor::run( TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { -#if defined(EIGEN_HIPCC) - const int block_size = device.maxHipThreadsPerBlock(); - const int max_blocks = device.getNumHipMultiProcessors() * - device.maxHipThreadsPerMultiProcessor() / block_size; - const Index size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. - const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel, Index>), - dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size); -#else - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * - device.maxCudaThreadsPerMultiProcessor() / block_size; + + const int block_size = device.maxGpuThreadsPerBlock(); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / block_size; const Index size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); - LAUNCH_CUDA_KERNEL( + LAUNCH_GPU_KERNEL( (EigenMetaKernel, Index>), num_blocks, block_size, 0, device, evaluator, size); -#endif } evaluator.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h new file mode 100644 index 000000000..f009ae855 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -0,0 +1,86 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design +// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but +// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler +// When compiling such files, gcc will end up trying to pick up the CUDA headers by +// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) +// This will obviously not work when trying to compile tensorflow on a system with no CUDA +// To work around this issue for HIP systems (and leave the default behaviour intact), the +// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and +// "unsupported/Eigen/CXX11/Tensor" has been updated to use the HIP header when EIGEN_USE_HIP is +// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well + +#if defined(EIGEN_USE_HIP) + +#define gpuStream_t hipStream_t +#define gpuDeviceProp_t hipDeviceProp_t +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetErrorString hipGetErrorString +#define gpuGetDeviceProperties hipGetDeviceProperties +// FIXME : use hipStreamDefault instead of 0x00 +#define gpuStreamDefault 0x00 +#define gpuGetDevice hipGetDevice +#define gpuSetDevice hipSetDevice +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemsetAsync hipMemsetAsync +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuStreamQuery hipStreamQuery +#define gpuSharedMemConfig hipSharedMemConfig +#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig +#define gpuStreamSynchronize hipStreamSynchronize +#define gpuMemcpy hipMemcpy + +#else + +#define gpuStream_t cudaStream_t +#define gpuDeviceProp_t cudaDeviceProp +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetErrorString cudaGetErrorString +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuStreamDefault cudaStreamDefault +#define gpuGetDevice cudaGetDevice +#define gpuSetDevice cudaSetDevice +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemsetAsync cudaMemsetAsync +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuStreamQuery cudaStreamQuery +#define gpuSharedMemConfig cudaSharedMemConfig +#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig +#define gpuStreamSynchronize cudaStreamSynchronize +#define gpuMemcpy cudaMemcpy + +#endif + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +// HIPCC does not support the use of assert on the GPU side.
+#undef assert +#define assert(COND) +#endif + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h new file mode 100644 index 000000000..9bc0708ed --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -0,0 +1,39 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) + +#undef gpuStream_t +#undef gpuDeviceProp_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuGetDeviceCount +#undef gpuGetErrorString +#undef gpuGetDeviceProperties +#undef gpuStreamDefault +#undef gpuGetDevice +#undef gpuSetDevice +#undef gpuMalloc +#undef gpuFree +#undef gpuMemsetAsync +#undef gpuMemcpyAsync +#undef gpuMemcpyDeviceToDevice +#undef gpuMemcpyDeviceToHost +#undef gpuMemcpyHostToDevice +#undef gpuStreamQuery +#undef gpuSharedMemConfig +#undef gpuDeviceSetSharedMemConfig +#undef gpuStreamSynchronize +#undef gpuMemcpy + +#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index ebcbd6f41..ca854d670 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -7,23 +7,23 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H namespace Eigen { namespace internal { -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) // Full reducers for GPU, don't vectorize for now -// Reducer function that enables multiple cuda thread to safely accumulate at the same +// Reducer function that enables multiple gpu thread to safely accumulate at the same // output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another cuda thread +// attempts to update it with the new value. If in the meantime another gpu thread // updated the content of the output address it will try again. template __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) if (sizeof(T) == 4) { unsigned int oldval = *reinterpret_cast(output); @@ -79,7 +79,7 @@ __device__ inline double atomicExchCustom(double* address, double val) { return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); } -#ifdef EIGEN_HAS_CUDA_FP16 +#ifdef EIGEN_HAS_GPU_FP16 template