author     Deven Desai <deven.desai.amd@gmail.com>  2018-06-06 10:12:58 -0400
committer  Deven Desai <deven.desai.amd@gmail.com>  2018-06-06 10:12:58 -0400
commit     8fbd47052bcafea612b8ae2841c1de5db738f042 (patch)
tree       1e8d3f8ab0bc9e48e18b0502b7d51500e72a7266 /test
parent     e206f8d4a401fe2060bada4d4b5d92e3bf3b561c (diff)
Adding support for using Eigen in HIP kernels.
This commit enables the use of Eigen in HIP kernels / on AMD GPUs. Support has been added along the same lines as what already exists for using Eigen in CUDA kernels / on NVidia GPUs.

Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers are picked up by default during an Eigen compile, irrespective of whether or not the underlying compiler is CUDACC/NVCC (e.g. Eigen/src/Core/arch/CUDA/Half.h). In order to maintain this behavior, the EIGEN_USE_HIP macro is used to switch to the HIP version of those header files (see Eigen/Core and unsupported/Eigen/CXX11/Tensor).

Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP-specific unit tests.
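As an illustration only (not part of this commit), here is a minimal sketch of what a user-side HIP kernel translation unit could look like under these changes. The file contents, kernel name and sizes below are hypothetical; the essential point is that EIGEN_USE_HIP is defined before any Eigen header is included so that the HIP variants of the GPU headers are picked up.

// Hypothetical user code, not part of this commit.
#define EIGEN_USE_HIP

#include <hip/hip_runtime.h>
#include <Eigen/Dense>

// Scale a packed array of 3-vectors on the device using Eigen types.
__global__ void scale_vectors(float* data, int n)
{
  int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  if (i < n) {
    Eigen::Map<Eigen::Vector3f> v(data + 3 * i);
    v *= 2.0f;
  }
}

int main()
{
  const int n = 64;
  float* d_data = 0;
  hipMalloc((void**)(&d_data), n * 3 * sizeof(float));
  hipMemset(d_data, 0, n * 3 * sizeof(float));
  hipLaunchKernelGGL(scale_vectors, dim3((n + 63) / 64), dim3(64), 0, 0, d_data, n);
  hipDeviceSynchronize();
  hipFree(d_data);
  return 0;
}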
Diffstat (limited to 'test')
-rw-r--r--  test/CMakeLists.txt    42
-rw-r--r--  test/hip_basic.cu     172
-rw-r--r--  test/hip_common.h     103
-rw-r--r--  test/main.h            22
4 files changed, 331 insertions, 8 deletions
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e1eef086e..4a5c1d36d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -407,6 +407,48 @@ endif(CUDA_FOUND)
endif(EIGEN_TEST_CUDA)
+# HIP unit tests
+option(EIGEN_TEST_HIP "Add HIP support." OFF)
+if (EIGEN_TEST_HIP)
+
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+ if (EXISTS ${HIP_PATH})
+
+ list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+ if (${HIP_PLATFORM} STREQUAL "hcc")
+
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
+ include_directories(${HIP_PATH}/include)
+
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+ ei_add_test(hip_basic)
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+ elseif (${HIP_PLATFORM} STREQUAL "nvcc")
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
+
+ endif(HIP_FOUND)
+
+ else ()
+
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+ endif()
+
+endif(EIGEN_TEST_HIP)
+
+
+
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
diff --git a/test/hip_basic.cu b/test/hip_basic.cu
new file mode 100644
index 000000000..2e1bf94a4
--- /dev/null
+++ b/test/hip_basic.cu
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// workaround issue between gcc >= 4.7 and cuda 5.5
+#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
+ #undef _GLIBCXX_ATOMIC_BUILTINS
+ #undef _GLIBCXX_USE_INT128
+#endif
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC hip_basic
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include <hip/hip_runtime.h>
+
+#include "main.h"
+#include "hip_common.h"
+
+// Check that dense modules can be properly parsed by hipcc
+#include <Eigen/Dense>
+
+// struct Foo{
+// EIGEN_DEVICE_FUNC
+// void operator()(int i, const float* mats, float* vecs) const {
+// using namespace Eigen;
+// // Matrix3f M(data);
+// // Vector3f x(data+9);
+// // Map<Vector3f>(data+9) = M.inverse() * x;
+// Matrix3f M(mats+i/16);
+// Vector3f x(vecs+i*3);
+// // using std::min;
+// // using std::sqrt;
+// Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x();
+// //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
+// }
+// };
+
+template<typename T>
+struct coeff_wise {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ T x1(in+i);
+ T x2(in+i+1);
+ T x3(in+i+2);
+ Map<T> res(out+i*T::MaxSizeAtCompileTime);
+
+ res.array() += (in[0] * x1 + x2).array() * x3.array();
+ }
+};
+
+template<typename T>
+struct replicate {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ T x1(in+i);
+ int step = x1.size() * 4;
+ int stride = 3 * step;
+
+ typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
+ MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
+ MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
+ MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
+ }
+};
+
+template<typename T>
+struct redux {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ int N = 10;
+ T x1(in+i);
+ out[i*N+0] = x1.minCoeff();
+ out[i*N+1] = x1.maxCoeff();
+ out[i*N+2] = x1.sum();
+ out[i*N+3] = x1.prod();
+ out[i*N+4] = x1.matrix().squaredNorm();
+ out[i*N+5] = x1.matrix().norm();
+ out[i*N+6] = x1.colwise().sum().maxCoeff();
+ out[i*N+7] = x1.rowwise().maxCoeff().sum();
+ out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
+ }
+};
+
+template<typename T1, typename T2>
+struct prod_test {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+ {
+ using namespace Eigen;
+ typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
+ T1 x1(in+i);
+ T2 x2(in+i+1);
+ Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
+ res += in[i] * x1 * x2;
+ }
+};
+
+template<typename T1, typename T2>
+struct diagonal {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+ {
+ using namespace Eigen;
+ T1 x1(in+i);
+ Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
+ res += x1.diagonal();
+ }
+};
+
+template<typename T>
+struct eigenvalues {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
+ T M(in+i);
+ Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
+ T A = M*M.adjoint();
+ SelfAdjointEigenSolver<T> eig;
+ eig.computeDirect(M);
+ res = eig.eigenvalues();
+ }
+};
+
+void test_hip_basic()
+{
+ ei_test_init_hip();
+
+ int nthreads = 100;
+ Eigen::VectorXf in, out;
+
+ #ifndef __HIP_DEVICE_COMPILE__
+ int data_size = nthreads * 512;
+ in.setRandom(data_size);
+ out.setRandom(data_size);
+ #endif
+
+ CALL_SUBTEST( run_and_compare_to_hip(coeff_wise<Vector3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_hip(coeff_wise<Array44f>(), nthreads, in, out) );
+
+ // FIXME compile fails when we uncomment the following two tests
+ // CALL_SUBTEST( run_and_compare_to_hip(replicate<Array4f>(), nthreads, in, out) );
+ // CALL_SUBTEST( run_and_compare_to_hip(replicate<Array33f>(), nthreads, in, out) );
+
+ CALL_SUBTEST( run_and_compare_to_hip(redux<Array4f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_hip(redux<Matrix3f>(), nthreads, in, out) );
+
+ CALL_SUBTEST( run_and_compare_to_hip(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_hip(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
+
+ CALL_SUBTEST( run_and_compare_to_hip(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_hip(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
+
+ // FIXME : Runtime failure occurs when we uncomment the following two tests
+ // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues<Matrix3f>(), nthreads, in, out) );
+ // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues<Matrix2f>(), nthreads, in, out) );
+
+}
diff --git a/test/hip_common.h b/test/hip_common.h
new file mode 100644
index 000000000..251585c52
--- /dev/null
+++ b/test/hip_common.h
@@ -0,0 +1,103 @@
+
+#ifndef EIGEN_TEST_HIP_COMMON_H
+#define EIGEN_TEST_HIP_COMMON_H
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include <iostream>
+
+#ifndef __HIPCC__
+dim3 threadIdx, blockDim, blockIdx;
+#endif
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ for(int i=0; i<n; i++)
+ ker(i, in.data(), out.data());
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+__global__ __attribute__((used))
+void run_on_hip_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
+{
+ int i = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x;
+ if(i<n) {
+ ker(i, in, out);
+ }
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_hip(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ typename Input::Scalar* d_in;
+ typename Output::Scalar* d_out;
+ std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar);
+ std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
+
+ hipMalloc((void**)(&d_in), in_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_out, out.data(), out_bytes, hipMemcpyHostToDevice);
+
+ // Simple and non-optimal 1D mapping assuming n is not too large
+ // That's only for unit testing!
+ dim3 Blocks(128);
+ dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
+
+ hipDeviceSynchronize();
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_hip_meta_kernel<Kernel,
+ typename std::decay<decltype(*d_in)>::type,
+ typename std::decay<decltype(*d_out)>::type>),
+ dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
+ hipDeviceSynchronize();
+
+ // check inputs have not been modified
+ hipMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, hipMemcpyDeviceToHost);
+ hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost);
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_and_compare_to_hip(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ Input in_ref, in_hip;
+ Output out_ref, out_hip;
+ #ifndef __HIP_DEVICE_COMPILE__
+ in_ref = in_hip = in;
+ out_ref = out_hip = out;
+ #endif
+ run_on_cpu (ker, n, in_ref, out_ref);
+ run_on_hip(ker, n, in_hip, out_hip);
+ #ifndef __HIP_DEVICE_COMPILE__
+ VERIFY_IS_APPROX(in_ref, in_hip);
+ VERIFY_IS_APPROX(out_ref, out_hip);
+ #endif
+}
+
+
+void ei_test_init_hip()
+{
+ int device = 0;
+ hipDeviceProp_t deviceProp;
+ hipGetDeviceProperties(&deviceProp, device);
+ std::cout << "HIP device info:\n";
+ std::cout << " name: " << deviceProp.name << "\n";
+ std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n";
+ std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n";
+ std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
+ std::cout << " warpSize: " << deviceProp.warpSize << "\n";
+ std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n";
+ std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n";
+ std::cout << " clockRate: " << deviceProp.clockRate << "\n";
+ std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n";
+ std::cout << " computeMode: " << deviceProp.computeMode << "\n";
+}
+
+#endif // EIGEN_TEST_HIP_COMMON_H
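For context, a minimal sketch (hypothetical, not part of this commit) of how a new device-side test would plug into the helpers above: a device-callable functor is evaluated once through run_on_cpu and once through run_on_hip, and run_and_compare_to_hip then compares the two input/output buffers with VERIFY_IS_APPROX.

// Hypothetical example functor, following the pattern used in hip_basic.cu.
template<typename T>
struct abs_test {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T x(in + i);                                    // load a small fixed-size array
    Map<T> res(out + i * T::MaxSizeAtCompileTime);  // map the corresponding output slot
    res = x.abs();                                  // any coefficient-wise expression works here
  }
};

// Usage inside a test function such as test_hip_basic():
//   CALL_SUBTEST( run_and_compare_to_hip(abs_test<Eigen::Array4f>(), nthreads, in, out) );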
diff --git a/test/main.h b/test/main.h
index 0fcd6cb76..79717a532 100644
--- a/test/main.h
+++ b/test/main.h
@@ -67,11 +67,17 @@
// protected by parenthesis against macro expansion, the min()/max() macros
// are defined here and any not-parenthesized min/max call will cause a
// compiler error.
-#define min(A,B) please_protect_your_min_with_parentheses
-#define max(A,B) please_protect_your_max_with_parentheses
-#define isnan(X) please_protect_your_isnan_with_parentheses
-#define isinf(X) please_protect_your_isinf_with_parentheses
-#define isfinite(X) please_protect_your_isfinite_with_parentheses
+#if !defined(__HIPCC__)
+ // HIP headers include the <thread> header, which contains not-parenthesized
+ // calls to "max" that trigger the checks below and cause the compile to fail,
+ // so these checks are disabled for HIP.
+ #define min(A,B) please_protect_your_min_with_parentheses
+ #define max(A,B) please_protect_your_max_with_parentheses
+ #define isnan(X) please_protect_your_isnan_with_parentheses
+ #define isinf(X) please_protect_your_isinf_with_parentheses
+ #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#endif
+
#ifdef M_PI
#undef M_PI
#endif
@@ -154,7 +160,7 @@ namespace Eigen
#define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "")
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
#define EIGEN_EXCEPTIONS
#endif
@@ -233,7 +239,7 @@ namespace Eigen
}
#endif //EIGEN_EXCEPTIONS
- #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS
+ #elif !defined(__CUDACC__) && !defined(__HIPCC__) // EIGEN_DEBUG_ASSERTS
// see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
#define eigen_assert(a) \
if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
@@ -290,7 +296,7 @@ namespace Eigen
std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
#endif
- #if !defined(__CUDACC__)
+ #if !defined(__CUDACC__) && !defined(__HIPCC__)
#define EIGEN_USE_CUSTOM_ASSERT
#endif