author    Deven Desai <deven.desai.amd@gmail.com>  2018-06-06 10:12:58 -0400
committer Deven Desai <deven.desai.amd@gmail.com>  2018-06-06 10:12:58 -0400
commit    8fbd47052bcafea612b8ae2841c1de5db738f042 (patch)
tree      1e8d3f8ab0bc9e48e18b0502b7d51500e72a7266 /unsupported/test/cxx11_tensor_contract_hip.cu
parent    e206f8d4a401fe2060bada4d4b5d92e3bf3b561c (diff)
Adding support for using Eigen in HIP kernels.
This commit enables the use of Eigen in HIP kernels / on AMD GPUs. Support has been added along the same lines as what already exists for using Eigen in CUDA kernels / on NVidia GPUs.

Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers are picked up by default during an Eigen compile, irrespective of whether or not the underlying compiler is CUDACC/NVCC (e.g. Eigen/src/Core/arch/CUDA/Half.h). To preserve that behavior, the EIGEN_USE_HIP macro is used to switch to the HIP version of those header files (see Eigen/Core and unsupported/Eigen/CXX11/Tensor).

Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP-specific unit tests.
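As a hedged illustration of the opt-in described above (not part of this commit; the file name and kernel below are hypothetical), an application translation unit compiled with hipcc might look roughly like this, a minimal sketch assuming a ROCm/HIP toolchain and an Eigen checkout containing this change:

// hip_app.cu -- hypothetical application TU, not part of this commit
#define EIGEN_USE_HIP   // opt into the HIP versions of the GPU headers
#define EIGEN_USE_GPU
#include <hip/hip_runtime.h>
#include <unsupported/Eigen/CXX11/Tensor>

// Trivial device kernel calling Eigen's device-side math functions.
__global__ void sqrt_kernel(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = Eigen::numext::sqrt(data[i]);
}

The HIP-specific unit tests themselves, including the one added below, are enabled by configuring the Eigen test suite with the -DEIGEN_TEST_HIP cmake option mentioned above.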
Diffstat (limited to 'unsupported/test/cxx11_tensor_contract_hip.cu')
-rw-r--r--  unsupported/test/cxx11_tensor_contract_hip.cu  215
1 file changed, 215 insertions, 0 deletions
diff --git a/unsupported/test/cxx11_tensor_contract_hip.cu b/unsupported/test/cxx11_tensor_contract_hip.cu
new file mode 100644
index 000000000..652af0ab0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_hip.cu
@@ -0,0 +1,215 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_hip_contraction(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // The output has m_size * n_size elements (at least 32 * 128 for the
+  // callers below), which is more than a single GPU thread block can
+  // cover, so the contraction kernel is spread across multiple blocks.
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+ Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
+ Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_left), t_left_bytes);
+ hipMalloc((void**)(&d_t_right), t_right_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+ t_result = t_left.contract(t_right, dims);
+
+ hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+ for (DenseIndex i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
+ }
+
+ hipFree((void*)d_t_left);
+ hipFree((void*)d_t_right);
+ hipFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // This contracts over both dimensions, reducing the m_size x k_size and
+  // k_size x n_size inputs to a single scalar on the device.
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 0, DataLayout> t_result;
+ Tensor<float, 0, DataLayout> t_result_gpu;
+ Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_left), t_left_bytes);
+ hipMalloc((void**)(&d_t_right), t_right_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_left(d_t_left, m_size, k_size);
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_right(d_t_right, k_size, n_size);
+ Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+ gpu_t_result(d_t_result);
+
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+ t_result = t_left.contract(t_right, dims);
+
+ hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+ if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+ !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+ std::cout << "mismatch detected: " << t_result()
+ << " vs " << t_result_gpu() << std::endl;
+ assert(false);
+ }
+
+ hipFree((void*)d_t_left);
+ hipFree((void*)d_t_right);
+ hipFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_hip_contraction_m() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(k, 128, 128);
+ test_hip_contraction<RowMajor>(k, 128, 128);
+ }
+}
+
+template<int DataLayout>
+void test_hip_contraction_k() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(128, k, 128);
+ test_hip_contraction<RowMajor>(128, k, 128);
+ }
+}
+
+template<int DataLayout>
+void test_hip_contraction_n() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(128, 128, k);
+ test_hip_contraction<RowMajor>(128, 128, k);
+ }
+}
+
+
+template<int DataLayout>
+void test_hip_contraction_sizes() {
+ int m_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257 , 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int n_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257, 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int k_sizes[] = { 31, 39, 63, 64, 65,
+ 95, 96, 127, 129, 255,
+ 257, 511, 512, 513, 1023,
+ 1024, 1025};
+
+ for (int i = 0; i < 15; i++) {
+ for (int j = 0; j < 15; j++) {
+ for (int k = 0; k < 17; k++) {
+        // test_hip_contraction takes (m, k, n), so pass the sizes in that order
+        test_hip_contraction<DataLayout>(m_sizes[i], k_sizes[k], n_sizes[j]);
+ }
+ }
+ }
+}
+
+void test_cxx11_tensor_hip()
+{
+ CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
+ CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
+
+ CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
+ CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
+
+ CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
+
+ CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
+
+ CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
+
+ // Commenting out these tests due to long runtimes
+ // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
+ // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
+}