author     Deven Desai <deven.desai.amd@gmail.com>  2018-06-06 10:12:58 -0400
committer  Deven Desai <deven.desai.amd@gmail.com>  2018-06-06 10:12:58 -0400
commit     8fbd47052bcafea612b8ae2841c1de5db738f042 (patch)
tree       1e8d3f8ab0bc9e48e18b0502b7d51500e72a7266 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent     e206f8d4a401fe2060bada4d4b5d92e3bf3b561c (diff)
Adding support for using Eigen in HIP kernels.
This commit enables the use of Eigen in HIP kernels on AMD GPUs. Support has been added along the same lines as what already exists for using Eigen in CUDA kernels on NVidia GPUs.

Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers are picked up by default during an Eigen compile, irrespective of whether the underlying compiler is CUDACC/NVCC (e.g. Eigen/src/Core/arch/CUDA/Half.h). To maintain that behavior, the EIGEN_USE_HIP macro is used to switch to the HIP versions of those header files (see Eigen/Core and unsupported/Eigen/CXX11/Tensor).

Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP-specific unit tests.
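For reference, a minimal sketch of what application code opting into this support might look like. It is not part of this commit: the kernel, buffer names, and launch geometry are illustrative assumptions; only the EIGEN_USE_HIP macro and hipLaunchKernelGGL are taken from the commit itself.

    // Hypothetical example. EIGEN_USE_HIP must be defined before any Eigen
    // header is included so that the HIP variants of the GPU headers are used.
    #define EIGEN_USE_HIP
    #include <hip/hip_runtime.h>
    #include <unsupported/Eigen/CXX11/Tensor>

    // Illustrative device code: Eigen types such as Eigen::half become
    // usable inside HIP kernels once the HIP headers are in effect.
    __global__ void double_kernel(Eigen::half* out, const Eigen::half* in, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] + in[i];
    }

    void launch_double(Eigen::half* out, const Eigen::half* in, int n) {
      const int block_size = 256;  // assumed; real code should query the device
      const int num_blocks = (n + block_size - 1) / block_size;
      hipLaunchKernelGGL(double_kernel, dim3(num_blocks), dim3(block_size),
                         0 /*shared mem*/, 0 /*default stream*/, out, in, n);
    }

The HIP-specific unit tests are opt-in and are enabled by passing -DEIGEN_TEST_HIP to cmake when configuring the build.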
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  16
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 0ffe68ab3..24a57970a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
};
-#if defined(EIGEN_CUDACC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
@@ -250,6 +250,17 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
+#if defined(EIGEN_HIPCC)
+    const int block_size = device.maxHipThreadsPerBlock();
+    const int max_blocks = device.getNumHipMultiProcessors() *
+                           device.maxHipThreadsPerMultiProcessor() / block_size;
+    const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+                       dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size);
+#else
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
@@ -260,11 +271,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
+#endif
  }
  evaluator.cleanup();
}
-#endif // EIGEN_CUDACC
+#endif // EIGEN_CUDACC || EIGEN_HIPCC
#endif // EIGEN_USE_GPU
// SYCL Executor policy
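A note on the launch-geometry arithmetic added above: num_blocks is the element count divided by block_size (rounded up), capped at the device's occupancy limit and floored at one block. Below is a standalone sketch with made-up device limits; the divup helper mirrors divup<int> from the diff, and the numbers are assumptions rather than queried values.

    #include <algorithm>
    #include <cassert>

    // Integer division rounded up, as divup<int> does in the diff.
    static int divup(int x, int y) { return (x + y - 1) / y; }

    int main() {
      const int block_size = 256;   // stand-in for device.maxHipThreadsPerBlock()
      const int num_mp     = 64;    // stand-in for device.getNumHipMultiProcessors()
      const int threads_mp = 2048;  // stand-in for device.maxHipThreadsPerMultiProcessor()
      const int max_blocks = num_mp * threads_mp / block_size;  // 512

      // A zero-element tensor still gets one block, so the launch never has a
      // zero-sized grid (the "tensors of size 0" case in the code comment).
      assert(std::max(std::min(max_blocks, divup(0, block_size)), 1) == 1);

      // A large tensor is capped at max_blocks; EigenMetaKernel then strides
      // over the remaining elements within each block.
      assert(std::max(std::min(max_blocks, divup(1 << 20, block_size)), 1) == 512);
      return 0;
    }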