Diffstat (limited to 'unsupported/test')
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_hip.cu        |  251
-rw-r--r--  unsupported/test/cxx11_tensor_cast_float16_hip.cu  |   79
-rw-r--r--  unsupported/test/cxx11_tensor_contract_hip.cu      |  215
-rw-r--r--  unsupported/test/cxx11_tensor_device_hip.cu        |  389
-rw-r--r--  unsupported/test/cxx11_tensor_hip.cu               | 1295
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_hip.cu    |  498
-rw-r--r--  unsupported/test/cxx11_tensor_random_hip.cu        |   85
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_hip.cu     |  154
-rw-r--r--  unsupported/test/cxx11_tensor_scan_hip.cu          |   76
9 files changed, 0 insertions(+), 3042 deletions(-)
diff --git a/unsupported/test/cxx11_tensor_argmax_hip.cu b/unsupported/test/cxx11_tensor_argmax_hip.cu
deleted file mode 100644
index 57d6ca000..000000000
--- a/unsupported/test/cxx11_tensor_argmax_hip.cu
+++ /dev/null
@@ -1,251 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-template <int Layout>
-void test_hip_simple_argmax()
-{
- Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
- Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
- Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
- in.setRandom();
- in *= in.constant(100.0);
- in(0, 0, 0) = -1000.0;
- in(71, 52, 96) = 1000.0;
-
- std::size_t in_bytes = in.size() * sizeof(double);
- std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
-
- double* d_in;
- DenseIndex* d_out_max;
- DenseIndex* d_out_min;
- hipMalloc((void**)(&d_in), in_bytes);
- hipMalloc((void**)(&d_out_max), out_bytes);
- hipMalloc((void**)(&d_out_min), out_bytes);
-
- hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
-
- gpu_out_max.device(gpu_device) = gpu_in.argmax();
- gpu_out_min.device(gpu_device) = gpu_in.argmin();
-
- assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
- VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
-
- hipFree(d_in);
- hipFree(d_out_max);
- hipFree(d_out_min);
-}
-
-template <int DataLayout>
-void test_hip_argmax_dim()
-{
- Tensor<float, 4, DataLayout> tensor(2,3,5,7);
- std::vector<int> dims;
- dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
-
- for (int dim = 0; dim < 4; ++dim) {
- tensor.setRandom();
- tensor = (tensor + tensor.constant(0.5)).log();
-
- array<DenseIndex, 3> out_shape;
- for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
-
- Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
-
- array<DenseIndex, 4> ix;
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
- ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
- if (ix[dim] != 0) continue;
- // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
- tensor(ix) = 10.0;
- }
- }
- }
- }
-
- std::size_t in_bytes = tensor.size() * sizeof(float);
- std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
-
- float* d_in;
- DenseIndex* d_out;
- hipMalloc((void**)(&d_in), in_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
-
- gpu_out.device(gpu_device) = gpu_in.argmax(dim);
-
- assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- VERIFY_IS_EQUAL(tensor_arg.size(),
- size_t(2*3*5*7 / tensor.dimension(dim)));
-
- for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
- // Expect max to be in the first index of the reduced dimension
- VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
- }
-
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
- ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
- if (ix[dim] != tensor.dimension(dim) - 1) continue;
- // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
- tensor(ix) = 20.0;
- }
- }
- }
- }
-
- hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
- gpu_out.device(gpu_device) = gpu_in.argmax(dim);
-
- assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
- // Expect max to be in the last index of the reduced dimension
- VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
- }
-
- hipFree(d_in);
- hipFree(d_out);
- }
-}
-
-template <int DataLayout>
-void test_hip_argmin_dim()
-{
- Tensor<float, 4, DataLayout> tensor(2,3,5,7);
- std::vector<int> dims;
- dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
-
- for (int dim = 0; dim < 4; ++dim) {
- tensor.setRandom();
- tensor = (tensor + tensor.constant(0.5)).log();
-
- array<DenseIndex, 3> out_shape;
- for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
-
- Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
-
- array<DenseIndex, 4> ix;
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
- ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
- if (ix[dim] != 0) continue;
- // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
- tensor(ix) = -10.0;
- }
- }
- }
- }
-
- std::size_t in_bytes = tensor.size() * sizeof(float);
- std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
-
- float* d_in;
- DenseIndex* d_out;
- hipMalloc((void**)(&d_in), in_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
- Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
-
- gpu_out.device(gpu_device) = gpu_in.argmin(dim);
-
- assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- VERIFY_IS_EQUAL(tensor_arg.size(),
- 2*3*5*7 / tensor.dimension(dim));
-
- for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
- // Expect min to be in the first index of the reduced dimension
- VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
- }
-
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
- ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
- if (ix[dim] != tensor.dimension(dim) - 1) continue;
- // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
- tensor(ix) = -20.0;
- }
- }
- }
- }
-
- hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
-
- gpu_out.device(gpu_device) = gpu_in.argmin(dim);
-
- assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
- // Expect min to be in the last index of the reduced dimension
- VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
- }
-
- hipFree(d_in);
- hipFree(d_out);
- }
-}
-
-void test_cxx11_tensor_hip()
-{
- CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
- CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
- CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
- CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
- CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
- CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
-}
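
For reference, a minimal sketch of the allocate/copy/evaluate/copy-back pattern the deleted test above repeats, assuming the same Eigen::HipStreamDevice API it uses. HIP_CHECK and argmax_roundtrip_sketch are illustrative names, not part of Eigen or the tests; unlike the bare assert() calls in the tests (which compile away under NDEBUG and leave the hipMemcpyAsync results unchecked), this variant reports HIP errors explicitly:

#include <cstdio>
#include <cstdlib>
#include <hip/hip_runtime.h>
#include <unsupported/Eigen/CXX11/Tensor>

#define HIP_CHECK(expr)                                           \
  do {                                                            \
    hipError_t err = (expr);                                      \
    if (err != hipSuccess) {                                      \
      std::fprintf(stderr, "HIP error %s at %s:%d\n",             \
                   hipGetErrorString(err), __FILE__, __LINE__);   \
      std::abort();                                               \
    }                                                             \
  } while (0)

void argmax_roundtrip_sketch() {
  Eigen::Tensor<double, 3> in(72, 53, 97);
  in.setRandom();
  in(71, 52, 96) = 1000.0;  // plant a known maximum at the last element

  double* d_in;
  Eigen::DenseIndex* d_out;
  HIP_CHECK(hipMalloc((void**)&d_in, in.size() * sizeof(double)));
  HIP_CHECK(hipMalloc((void**)&d_out, sizeof(Eigen::DenseIndex)));
  HIP_CHECK(hipMemcpy(d_in, in.data(), in.size() * sizeof(double),
                      hipMemcpyHostToDevice));

  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice dev(&stream);
  Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, 72, 53, 97);
  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 1> > gpu_out(d_out, 1);

  gpu_out.device(dev) = gpu_in.argmax();  // flattened index of the maximum

  Eigen::DenseIndex result;
  HIP_CHECK(hipMemcpyAsync(&result, d_out, sizeof(result),
                           hipMemcpyDeviceToHost, dev.stream()));
  HIP_CHECK(hipStreamSynchronize(dev.stream()));
  // result should be 72*53*97 - 1, the flattened index of in(71, 52, 96).

  HIP_CHECK(hipFree(d_in));
  HIP_CHECK(hipFree(d_out));
}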
diff --git a/unsupported/test/cxx11_tensor_cast_float16_hip.cu b/unsupported/test/cxx11_tensor_cast_float16_hip.cu
deleted file mode 100644
index bf6a49df4..000000000
--- a/unsupported/test/cxx11_tensor_cast_float16_hip.cu
+++ /dev/null
@@ -1,79 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-void test_hip_conversion() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- Tensor<float, 1> floats(num_elem);
- floats.setRandom();
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
- d_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
- d_conv, num_elem);
-
- gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
-
- gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
- gpu_conv.device(gpu_device) = gpu_half.cast<float>();
-
- Tensor<float, 1> initial(num_elem);
- Tensor<float, 1> final(num_elem);
- gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- VERIFY_IS_APPROX(initial(i), final(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_half);
- gpu_device.deallocate(d_conv);
-}
-
-
-void test_fallback_conversion() {
- int num_elem = 101;
- Tensor<float, 1> floats(num_elem);
- floats.setRandom();
-
- Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
- Eigen::Tensor<float, 1> conv = halfs.cast<float>();
-
- for (int i = 0; i < num_elem; ++i) {
- VERIFY_IS_APPROX(floats(i), conv(i));
- }
-}
-
-
-void test_cxx11_tensor_cast_float16_hip()
-{
- CALL_SUBTEST(test_hip_conversion());
- CALL_SUBTEST(test_fallback_conversion());
-}
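
As a host-only illustration of the round-trip that test_fallback_conversion above checks, here is a minimal sketch; half_roundtrip_sketch is an illustrative name, and the error bound is an assumption derived from fp16's 10 mantissa bits, not something the test asserts:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>
#include <cmath>

void half_roundtrip_sketch() {
  float x = 0.3333333f;
  Eigen::half h = static_cast<Eigen::half>(x);  // narrow to fp16
  float back = static_cast<float>(h);           // widen back to fp32
  // fp16 keeps 10 mantissa bits, so the relative round-trip error near
  // 0.33 is at most about 2^-10.
  assert(std::fabs(back - x) <= std::fabs(x) * (1.0f / 1024.0f));
}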
diff --git a/unsupported/test/cxx11_tensor_contract_hip.cu b/unsupported/test/cxx11_tensor_contract_hip.cu
deleted file mode 100644
index 652af0ab0..000000000
--- a/unsupported/test/cxx11_tensor_contract_hip.cu
+++ /dev/null
@@ -1,215 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-using Eigen::Tensor;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-
-template<int DataLayout>
-void test_hip_contraction(int m_size, int k_size, int n_size)
-{
- std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
- // for large enough m_size and n_size, the output has more elements than
- // the 30 * 1024 threads launched in blocks on a 15 SM GK110 GPU, so each
- // thread has to process several output values
- Tensor<float, 2, DataLayout> t_left(m_size, k_size);
- Tensor<float, 2, DataLayout> t_right(k_size, n_size);
- Tensor<float, 2, DataLayout> t_result(m_size, n_size);
- Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
- Eigen::array<DimPair, 1> dims(DimPair(1, 0));
-
- t_left.setRandom();
- t_right.setRandom();
-
- std::size_t t_left_bytes = t_left.size() * sizeof(float);
- std::size_t t_right_bytes = t_right.size() * sizeof(float);
- std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
- float* d_t_left;
- float* d_t_right;
- float* d_t_result;
-
- hipMalloc((void**)(&d_t_left), t_left_bytes);
- hipMalloc((void**)(&d_t_right), t_right_bytes);
- hipMalloc((void**)(&d_t_result), t_result_bytes);
-
- hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
-
-
- gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
- t_result = t_left.contract(t_right, dims);
-
- hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
- for (DenseIndex i = 0; i < t_result.size(); i++) {
- if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
- continue;
- }
- if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
- continue;
- }
- std::cout << "mismatch detected at index " << i << ": " << t_result(i)
- << " vs " << t_result_gpu(i) << std::endl;
- assert(false);
- }
-
- hipFree((void*)d_t_left);
- hipFree((void*)d_t_right);
- hipFree((void*)d_t_result);
-}
-
-
-template<int DataLayout>
-void test_scalar(int m_size, int k_size, int n_size)
-{
- std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
- // for large enough m_size and n_size, the output has more elements than
- // the 30 * 1024 threads launched in blocks on a 15 SM GK110 GPU, so each
- // thread has to process several output values
- Tensor<float, 2, DataLayout> t_left(m_size, k_size);
- Tensor<float, 2, DataLayout> t_right(k_size, n_size);
- Tensor<float, 0, DataLayout> t_result;
- Tensor<float, 0, DataLayout> t_result_gpu;
- Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
-
- t_left.setRandom();
- t_right.setRandom();
-
- std::size_t t_left_bytes = t_left.size() * sizeof(float);
- std::size_t t_right_bytes = t_right.size() * sizeof(float);
- std::size_t t_result_bytes = sizeof(float);
-
- float* d_t_left;
- float* d_t_right;
- float* d_t_result;
-
- hipMalloc((void**)(&d_t_left), t_left_bytes);
- hipMalloc((void**)(&d_t_right), t_right_bytes);
- hipMalloc((void**)(&d_t_result), t_result_bytes);
-
- hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_left(d_t_left, m_size, k_size);
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
- gpu_t_right(d_t_right, k_size, n_size);
- Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
- gpu_t_result(d_t_result);
-
- gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
- t_result = t_left.contract(t_right, dims);
-
- hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
- if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
- !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
- std::cout << "mismatch detected: " << t_result()
- << " vs " << t_result_gpu() << std::endl;
- assert(false);
- }
-
- hipFree((void*)d_t_left);
- hipFree((void*)d_t_right);
- hipFree((void*)d_t_result);
-}
-
-
-template<int DataLayout>
-void test_hip_contraction_m() {
- for (int k = 32; k < 256; k++) {
- test_hip_contraction<ColMajor>(k, 128, 128);
- test_hip_contraction<RowMajor>(k, 128, 128);
- }
-}
-
-template<int DataLayout>
-void test_hip_contraction_k() {
- for (int k = 32; k < 256; k++) {
- test_hip_contraction<ColMajor>(128, k, 128);
- test_hip_contraction<RowMajor>(128, k, 128);
- }
-}
-
-template<int DataLayout>
-void test_hip_contraction_n() {
- for (int k = 32; k < 256; k++) {
- test_hip_contraction<ColMajor>(128, 128, k);
- test_hip_contraction<RowMajor>(128, 128, k);
- }
-}
-
-
-template<int DataLayout>
-void test_hip_contraction_sizes() {
- int m_sizes[] = { 31, 39, 63, 64, 65,
- 127, 129, 255, 257 , 511,
- 512, 513, 1023, 1024, 1025};
-
- int n_sizes[] = { 31, 39, 63, 64, 65,
- 127, 129, 255, 257, 511,
- 512, 513, 1023, 1024, 1025};
-
- int k_sizes[] = { 31, 39, 63, 64, 65,
- 95, 96, 127, 129, 255,
- 257, 511, 512, 513, 1023,
- 1024, 1025};
-
- for (int i = 0; i < 15; i++) {
- for (int j = 0; j < 15; j++) {
- for (int k = 0; k < 17; k++) {
- test_hip_contraction<DataLayout>(m_sizes[i], k_sizes[k], n_sizes[j]);
- }
- }
- }
-}
-
-void test_cxx11_tensor_hip()
-{
- CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
- CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
-
- CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
- CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
-
- CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
- CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
-
- CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
- CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
-
- CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
- CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
-
- // Commenting out these tests due to long runtimes
- // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
- // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
-}
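
A small host-side sketch (no GPU required) of what the DimPair(1, 0) contraction above computes: contracting dimension 1 of the left tensor against dimension 0 of the right is an ordinary matrix product. contraction_is_matmul_sketch is an illustrative name; the Eigen::array construction follows the style the deleted test itself uses:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>
#include <cmath>

void contraction_is_matmul_sketch() {
  typedef Eigen::Tensor<float, 1>::DimensionPair DimPair;
  Eigen::Tensor<float, 2> a(3, 4);
  Eigen::Tensor<float, 2> b(4, 5);
  a.setRandom();
  b.setRandom();

  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
  Eigen::Tensor<float, 2> c = a.contract(b, dims);  // result is 3 x 5

  // The same product through Eigen's matrix interface, mapped over the
  // (column-major) tensor storage.
  Eigen::Map<Eigen::MatrixXf> ma(a.data(), 3, 4);
  Eigen::Map<Eigen::MatrixXf> mb(b.data(), 4, 5);
  Eigen::MatrixXf mc = ma * mb;

  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 5; ++j)
      assert(std::fabs(c(i, j) - mc(i, j)) < 1e-4f);
}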
diff --git a/unsupported/test/cxx11_tensor_device_hip.cu b/unsupported/test/cxx11_tensor_device_hip.cu
deleted file mode 100644
index b98c481ff..000000000
--- a/unsupported/test/cxx11_tensor_device_hip.cu
+++ /dev/null
@@ -1,389 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-using Eigen::RowMajor;
-
-// Context for evaluation on cpu
-struct CPUContext {
- CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
- kernel_1d_(0) = 3.14f;
- kernel_1d_(1) = 2.7f;
-
- kernel_2d_(0,0) = 3.14f;
- kernel_2d_(1,0) = 2.7f;
- kernel_2d_(0,1) = 0.2f;
- kernel_2d_(1,1) = 7.0f;
-
- kernel_3d_(0,0,0) = 3.14f;
- kernel_3d_(0,1,0) = 2.7f;
- kernel_3d_(0,0,1) = 0.2f;
- kernel_3d_(0,1,1) = 7.0f;
- kernel_3d_(1,0,0) = -1.0f;
- kernel_3d_(1,1,0) = -0.3f;
- kernel_3d_(1,0,1) = -0.7f;
- kernel_3d_(1,1,1) = -0.5f;
- }
-
- const Eigen::DefaultDevice& device() const { return cpu_device_; }
-
- const Eigen::Tensor<float, 3>& in1() const { return in1_; }
- const Eigen::Tensor<float, 3>& in2() const { return in2_; }
- Eigen::Tensor<float, 3>& out() { return out_; }
- const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
- const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
- const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
-
- private:
- const Eigen::Tensor<float, 3>& in1_;
- const Eigen::Tensor<float, 3>& in2_;
- Eigen::Tensor<float, 3>& out_;
-
- Eigen::Tensor<float, 1> kernel_1d_;
- Eigen::Tensor<float, 2> kernel_2d_;
- Eigen::Tensor<float, 3> kernel_3d_;
-
- Eigen::DefaultDevice cpu_device_;
-};
-
-
-// Context for evaluation on GPU
-struct GPUContext {
- GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
- assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
- float kernel_1d_val[] = {3.14f, 2.7f};
- assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-
- assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
- float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
- assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
-
- assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
- float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
- assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
- }
- ~GPUContext() {
- assert(hipFree(kernel_1d_) == hipSuccess);
- assert(hipFree(kernel_2d_) == hipSuccess);
- assert(hipFree(kernel_3d_) == hipSuccess);
- }
-
- const Eigen::GpuDevice& device() const { return gpu_device_; }
-
- const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
- const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
- Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
- Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
- Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
- Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
-
- private:
- const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
- const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
- Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
-
- float* kernel_1d_;
- float* kernel_2d_;
- float* kernel_3d_;
-
- Eigen::HipStreamDevice stream_;
- Eigen::GpuDevice gpu_device_;
-};
-
-
-// The actual expression to evaluate
-template <typename Context>
-void test_contextual_eval(Context* context)
-{
- context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
-}
-
-template <typename Context>
-void test_forced_contextual_eval(Context* context)
-{
- context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
-}
-
-template <typename Context>
-void test_compound_assignment(Context* context)
-{
- context->out().device(context->device()) = context->in1().constant(2.718f);
- context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
-}
-
-
-template <typename Context>
-void test_contraction(Context* context)
-{
- Eigen::array<std::pair<int, int>, 2> dims;
- dims[0] = std::make_pair(1, 1);
- dims[1] = std::make_pair(2, 2);
-
- Eigen::array<int, 2> shape(40, 50*70);
-
- Eigen::DSizes<int, 2> indices(0,0);
- Eigen::DSizes<int, 2> sizes(40,40);
-
- context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
-}
-
-
-template <typename Context>
-void test_1d_convolution(Context* context)
-{
- Eigen::DSizes<int, 3> indices(0,0,0);
- Eigen::DSizes<int, 3> sizes(40,49,70);
-
- Eigen::array<int, 1> dims(1);
- context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
-}
-
-template <typename Context>
-void test_2d_convolution(Context* context)
-{
- Eigen::DSizes<int, 3> indices(0,0,0);
- Eigen::DSizes<int, 3> sizes(40,49,69);
-
- Eigen::array<int, 2> dims(1,2);
- context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
-}
-
-template <typename Context>
-void test_3d_convolution(Context* context)
-{
- Eigen::DSizes<int, 3> indices(0,0,0);
- Eigen::DSizes<int, 3> sizes(39,49,69);
-
- Eigen::array<int, 3> dims(0,1,2);
- context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
-}
-
-
-void test_cpu() {
- Eigen::Tensor<float, 3> in1(40,50,70);
- Eigen::Tensor<float, 3> in2(40,50,70);
- Eigen::Tensor<float, 3> out(40,50,70);
-
- in1 = in1.random() + in1.constant(10.0f);
- in2 = in2.random() + in2.constant(10.0f);
-
- CPUContext context(in1, in2, out);
- test_contextual_eval(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
- }
- }
- }
-
- test_forced_contextual_eval(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
- }
- }
- }
-
- test_compound_assignment(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
- }
- }
- }
-
- test_contraction(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 40; ++j) {
- const float result = out(i,j,0);
- float expected = 0;
- for (int k = 0; k < 50; ++k) {
- for (int l = 0; l < 70; ++l) {
- expected += in1(i, k, l) * in2(j, k, l);
- }
- }
- VERIFY_IS_APPROX(expected, result);
- }
- }
-
- test_1d_convolution(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
- }
- }
- }
-
- test_2d_convolution(&context);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 69; ++k) {
- const float result = out(i,j,k);
- const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
- (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
- if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
- continue;
- }
- VERIFY_IS_APPROX(expected, result);
- }
- }
- }
-
- test_3d_convolution(&context);
- for (int i = 0; i < 39; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 69; ++k) {
- const float result = out(i,j,k);
- const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
- in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
- (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
- in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
- if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
- continue;
- }
- VERIFY_IS_APPROX(expected, result);
- }
- }
- }
-}
-
-void test_gpu() {
- Eigen::Tensor<float, 3> in1(40,50,70);
- Eigen::Tensor<float, 3> in2(40,50,70);
- Eigen::Tensor<float, 3> out(40,50,70);
- in1 = in1.random() + in1.constant(10.0f);
- in2 = in2.random() + in2.constant(10.0f);
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t in2_bytes = in2.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_in1;
- float* d_in2;
- float* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_in2), in2_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
-
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
-
- GPUContext context(gpu_in1, gpu_in2, gpu_out);
- test_contextual_eval(&context);
- assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
- }
- }
- }
-
- test_forced_contextual_eval(&context);
- assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
- }
- }
- }
-
- test_compound_assignment(&context);
- assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 50; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
- }
- }
- }
-
- test_contraction(&context);
- assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 40; ++j) {
- const float result = out(i,j,0);
- float expected = 0;
- for (int k = 0; k < 50; ++k) {
- for (int l = 0; l < 70; ++l) {
- expected += in1(i, k, l) * in2(j, k, l);
- }
- }
- VERIFY_IS_APPROX(expected, result);
- }
- }
-
- test_1d_convolution(&context);
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
- assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 70; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
- }
- }
- }
-
- test_2d_convolution(&context);
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
- assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
- for (int i = 0; i < 40; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 69; ++k) {
- const float result = out(i,j,k);
- const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
- in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
- VERIFY_IS_APPROX(expected, result);
- }
- }
- }
-
- /*
- test_3d_convolution(&context);
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
- assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
- for (int i = 0; i < 39; ++i) {
- for (int j = 0; j < 49; ++j) {
- for (int k = 0; k < 69; ++k) {
- const float result = out(i,j,k);
- const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
- in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
- in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
- in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
- VERIFY_IS_APPROX(expected, result);
- }
- }
- }
- */
-}
-
-
-void test_cxx11_tensor_device()
-{
- CALL_SUBTEST(test_cpu());
- CALL_SUBTEST(test_gpu());
-}
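
A minimal sketch of the shape rule behind the slice sizes used above (40,49,70 for the 1-d kernel, 40,49,69 for the 2-d one, and so on): a "valid" convolution along one dimension with a length-k kernel shrinks that dimension from n to n - k + 1. convolve_shape_sketch is an illustrative name:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

void convolve_shape_sketch() {
  Eigen::Tensor<float, 3> in(40, 50, 70);
  Eigen::Tensor<float, 1> kernel(2);
  in.setRandom();
  kernel.setValues({3.14f, 2.7f});

  Eigen::array<int, 1> dims(1);  // convolve along dimension 1
  Eigen::Tensor<float, 3> out = in.convolve(kernel, dims);

  // Dimension 1 shrinks from 50 to 50 - 2 + 1 = 49; the other two stay.
  assert(out.dimension(0) == 40);
  assert(out.dimension(1) == 49);
  assert(out.dimension(2) == 70);
  // Each entry is in(i, j, k) * kernel(0) + in(i, j + 1, k) * kernel(1),
  // matching the expected values checked in test_1d_convolution above.
}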
diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu
deleted file mode 100644
index b28840267..000000000
--- a/unsupported/test/cxx11_tensor_hip.cu
+++ /dev/null
@@ -1,1295 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_hip
-#define EIGEN_USE_GPU
-
-#ifdef __NVCC__
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
-#endif
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-void test_hip_nullary() {
- Tensor<float, 1, 0, int> in1(2);
- Tensor<float, 1, 0, int> in2(2);
- in1.setRandom();
- in2.setRandom();
-
- std::size_t tensor_bytes = in1.size() * sizeof(float);
-
- float* d_in1;
- float* d_in2;
- hipMalloc((void**)(&d_in1), tensor_bytes);
- hipMalloc((void**)(&d_in2), tensor_bytes);
- hipMemcpy(d_in1, in1.data(), tensor_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in2, in2.data(), tensor_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
- d_in1, 2);
- Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
- d_in2, 2);
-
- gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
- gpu_in2.device(gpu_device) = gpu_in2.random();
-
- Tensor<float, 1, 0, int> new1(2);
- Tensor<float, 1, 0, int> new2(2);
-
- assert(hipMemcpyAsync(new1.data(), d_in1, tensor_bytes, hipMemcpyDeviceToHost,
- gpu_device.stream()) == hipSuccess);
- assert(hipMemcpyAsync(new2.data(), d_in2, tensor_bytes, hipMemcpyDeviceToHost,
- gpu_device.stream()) == hipSuccess);
-
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 2; ++i) {
- VERIFY_IS_APPROX(new1(i), 3.14f);
- VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
- }
-
- hipFree(d_in1);
- hipFree(d_in2);
-}
-
-void test_hip_elementwise_small() {
- Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>{2});
- Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>{2});
- Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>{2});
- in1.setRandom();
- in2.setRandom();
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t in2_bytes = in2.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_in1;
- float* d_in2;
- float* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_in2), in2_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
- d_in1, Eigen::array<Eigen::DenseIndex, 1>{2});
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
- d_in2, Eigen::array<Eigen::DenseIndex, 1>{2});
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
- d_out, Eigen::array<Eigen::DenseIndex, 1>{2});
-
- gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
- gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 2; ++i) {
- VERIFY_IS_APPROX(
- out(Eigen::array<Eigen::DenseIndex, 1>{i}),
- in1(Eigen::array<Eigen::DenseIndex, 1>{i}) + in2(Eigen::array<Eigen::DenseIndex, 1>{i}));
- }
-
- hipFree(d_in1);
- hipFree(d_in2);
- hipFree(d_out);
-}
-
-void test_hip_elementwise()
-{
- Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- in1.setRandom();
- in2.setRandom();
- in3.setRandom();
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t in2_bytes = in2.size() * sizeof(float);
- std::size_t in3_bytes = in3.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_in1;
- float* d_in2;
- float* d_in3;
- float* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_in2), in2_bytes);
- hipMalloc((void**)(&d_in3), in3_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in3, in3.data(), in3_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
-
- gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 53; ++j) {
- for (int k = 0; k < 97; ++k) {
- VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}), in1(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) + in2(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) * in3(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}));
- }
- }
- }
-
- hipFree(d_in1);
- hipFree(d_in2);
- hipFree(d_in3);
- hipFree(d_out);
-}
-
-void test_hip_props() {
- Tensor<float, 1> in1(200);
- Tensor<bool, 1> out(200);
- in1.setRandom();
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(bool);
-
- float* d_in1;
- bool* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
- d_in1, 200);
- Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
- d_out, 200);
-
- gpu_out.device(gpu_device) = (gpu_in1.isnan)();
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
- gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 200; ++i) {
- VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
- }
-
- hipFree(d_in1);
- hipFree(d_out);
-}
-
-void test_hip_reduction()
-{
- Tensor<float, 4> in1(72,53,97,113);
- Tensor<float, 2> out(72,97);
- in1.setRandom();
-
- std::size_t in1_bytes = in1.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_in1;
- float* d_out;
- hipMalloc((void**)(&d_in1), in1_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
- Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
- array<Eigen::DenseIndex, 2> reduction_axis;
- reduction_axis[0] = 1;
- reduction_axis[1] = 3;
-
- gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 97; ++j) {
- float expected = 0;
- for (int k = 0; k < 53; ++k) {
- for (int l = 0; l < 113; ++l) {
- expected =
- std::max<float>(expected, in1(i, k, j, l));
- }
- }
- VERIFY_IS_APPROX(out(i,j), expected);
- }
- }
-
- hipFree(d_in1);
- hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_contraction()
-{
- // with these dimensions, the output has 300 * 140 elements, which is
- // more than 30 * 1024, which is the number of threads in blocks on
- // a 15 SM GK110 GPU
- Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
- Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>{3, 31, 7, 20, 1});
- Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>{6, 50, 7, 20, 1});
-
- t_left.setRandom();
- t_right.setRandom();
-
- std::size_t t_left_bytes = t_left.size() * sizeof(float);
- std::size_t t_right_bytes = t_right.size() * sizeof(float);
- std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
- float* d_t_left;
- float* d_t_right;
- float* d_t_result;
-
- hipMalloc((void**)(&d_t_left), t_left_bytes);
- hipMalloc((void**)(&d_t_right), t_right_bytes);
- hipMalloc((void**)(&d_t_result), t_result_bytes);
-
- hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
- Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
- Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
-
- typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
- MapXf m_left(t_left.data(), 300, 93);
- MapXf m_right(t_right.data(), 93, 140);
- Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
-
- typedef Tensor<float, 1>::DimensionPair DimPair;
- Eigen::array<DimPair, 2> dims;
- dims[0] = DimPair(2, 0);
- dims[1] = DimPair(3, 1);
-
- m_result = m_left * m_right;
- gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
-
- hipMemcpy(t_result.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
-
- for (DenseIndex i = 0; i < t_result.size(); i++) {
- if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
- std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
- assert(false);
- }
- }
-
- hipFree(d_t_left);
- hipFree(d_t_right);
- hipFree(d_t_result);
-}
-
-
-template<int DataLayout>
-void test_hip_convolution_1d()
-{
- Tensor<float, 4, DataLayout> input(74,37,11,137);
- Tensor<float, 1, DataLayout> kernel(4);
- Tensor<float, 4, DataLayout> out(74,34,11,137);
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
- Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
-
- Eigen::array<Eigen::DenseIndex, 1> dims{1};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 74; ++i) {
- for (int j = 0; j < 34; ++j) {
- for (int k = 0; k < 11; ++k) {
- for (int l = 0; l < 137; ++l) {
- const float result = out(i,j,k,l);
- const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
- input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-void test_hip_convolution_inner_dim_col_major_1d()
-{
- Tensor<float, 4, ColMajor> input(74,9,11,7);
- Tensor<float, 1, ColMajor> kernel(4);
- Tensor<float, 4, ColMajor> out(71,9,11,7);
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
- Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
- Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
-
- Eigen::array<Eigen::DenseIndex, 1> dims{0};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 71; ++i) {
- for (int j = 0; j < 9; ++j) {
- for (int k = 0; k < 11; ++k) {
- for (int l = 0; l < 7; ++l) {
- const float result = out(i,j,k,l);
- const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
- input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-void test_hip_convolution_inner_dim_row_major_1d()
-{
- Tensor<float, 4, RowMajor> input(7,9,11,74);
- Tensor<float, 1, RowMajor> kernel(4);
- Tensor<float, 4, RowMajor> out(7,9,11,71);
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
- Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
- Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
-
- Eigen::array<Eigen::DenseIndex, 1> dims{3};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 7; ++i) {
- for (int j = 0; j < 9; ++j) {
- for (int k = 0; k < 11; ++k) {
- for (int l = 0; l < 71; ++l) {
- const float result = out(i,j,k,l);
- const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
- input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_convolution_2d()
-{
- Tensor<float, 4, DataLayout> input(74,37,11,137);
- Tensor<float, 2, DataLayout> kernel(3,4);
- Tensor<float, 4, DataLayout> out(74,35,8,137);
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
- Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
- Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
-
- Eigen::array<Eigen::DenseIndex, 2> dims{1,2};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 74; ++i) {
- for (int j = 0; j < 35; ++j) {
- for (int k = 0; k < 8; ++k) {
- for (int l = 0; l < 137; ++l) {
- const float result = out(i,j,k,l);
- const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
- input(i,j+1,k+0,l) * kernel(1,0) +
- input(i,j+2,k+0,l) * kernel(2,0) +
- input(i,j+0,k+1,l) * kernel(0,1) +
- input(i,j+1,k+1,l) * kernel(1,1) +
- input(i,j+2,k+1,l) * kernel(2,1) +
- input(i,j+0,k+2,l) * kernel(0,2) +
- input(i,j+1,k+2,l) * kernel(1,2) +
- input(i,j+2,k+2,l) * kernel(2,2) +
- input(i,j+0,k+3,l) * kernel(0,3) +
- input(i,j+1,k+3,l) * kernel(1,3) +
- input(i,j+2,k+3,l) * kernel(2,3);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-template<int DataLayout>
-void test_hip_convolution_3d()
-{
- Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>{74,37,11,137,17});
- Tensor<float, 3, DataLayout> kernel(3,4,2);
- Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>{74,35,8,136,17});
- input = input.constant(10.0f) + input.random();
- kernel = kernel.constant(7.0f) + kernel.random();
-
- std::size_t input_bytes = input.size() * sizeof(float);
- std::size_t kernel_bytes = kernel.size() * sizeof(float);
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_input;
- float* d_kernel;
- float* d_out;
- hipMalloc((void**)(&d_input), input_bytes);
- hipMalloc((void**)(&d_kernel), kernel_bytes);
- hipMalloc((void**)(&d_out), out_bytes);
-
- hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
- Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
- Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
-
- Eigen::array<Eigen::DenseIndex, 3> dims{1,2,3};
- gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 74; ++i) {
- for (int j = 0; j < 35; ++j) {
- for (int k = 0; k < 8; ++k) {
- for (int l = 0; l < 136; ++l) {
- for (int m = 0; m < 17; ++m) {
- const float result = out(i,j,k,l,m);
- const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
- input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
- input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
- input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
- input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
- input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
- input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
- input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
- input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
- input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
- input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
- input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
- input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
- input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
- input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
- input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
- input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
- input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
- input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
- input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
- input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
- input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
- input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
- input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
- }
-
- hipFree(d_input);
- hipFree(d_kernel);
- hipFree(d_out);
-}
-
-
-template <typename Scalar>
-void test_hip_lgamma(const Scalar stddev)
-{
- Tensor<Scalar, 2> in(72,97);
- in.setRandom();
- in *= in.constant(stddev);
- Tensor<Scalar, 2> out(72,97);
- out.setZero();
-
- std::size_t bytes = in.size() * sizeof(Scalar);
-
- Scalar* d_in;
- Scalar* d_out;
- hipMalloc((void**)(&d_in), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
- gpu_out.device(gpu_device) = gpu_in.lgamma();
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 97; ++j) {
- VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
- }
- }
-
- hipFree(d_in);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_digamma()
-{
- Tensor<Scalar, 1> in(7);
- Tensor<Scalar, 1> out(7);
- Tensor<Scalar, 1> expected_out(7);
- out.setZero();
-
- in(0) = Scalar(1);
- in(1) = Scalar(1.5);
- in(2) = Scalar(4);
- in(3) = Scalar(-10.5);
- in(4) = Scalar(10000.5);
- in(5) = Scalar(0);
- in(6) = Scalar(-1);
-
- expected_out(0) = Scalar(-0.5772156649015329);
- expected_out(1) = Scalar(0.03648997397857645);
- expected_out(2) = Scalar(1.2561176684318);
- expected_out(3) = Scalar(2.398239129535781);
- expected_out(4) = Scalar(9.210340372392849);
- expected_out(5) = std::numeric_limits<Scalar>::infinity();
- expected_out(6) = std::numeric_limits<Scalar>::infinity();
-
- std::size_t bytes = in.size() * sizeof(Scalar);
-
- Scalar* d_in;
- Scalar* d_out;
- hipMalloc((void**)(&d_in), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
-
- gpu_out.device(gpu_device) = gpu_in.digamma();
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 5; ++i) {
- VERIFY_IS_APPROX(out(i), expected_out(i));
- }
- for (int i = 5; i < 7; ++i) {
- VERIFY_IS_EQUAL(out(i), expected_out(i));
- }
-
- hipFree(d_in);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_zeta()
-{
- Tensor<Scalar, 1> in_x(6);
- Tensor<Scalar, 1> in_q(6);
- Tensor<Scalar, 1> out(6);
- Tensor<Scalar, 1> expected_out(6);
- out.setZero();
-
- in_x(0) = Scalar(1);
- in_x(1) = Scalar(1.5);
- in_x(2) = Scalar(4);
- in_x(3) = Scalar(-10.5);
- in_x(4) = Scalar(10000.5);
- in_x(5) = Scalar(3);
-
- in_q(0) = Scalar(1.2345);
- in_q(1) = Scalar(2);
- in_q(2) = Scalar(1.5);
- in_q(3) = Scalar(3);
- in_q(4) = Scalar(1.0001);
- in_q(5) = Scalar(-2.5);
-
- expected_out(0) = std::numeric_limits<Scalar>::infinity();
- expected_out(1) = Scalar(1.61237534869);
- expected_out(2) = Scalar(0.234848505667);
- expected_out(3) = Scalar(1.03086757337e-5);
- expected_out(4) = Scalar(0.367879440865);
- expected_out(5) = Scalar(0.054102025820864097);
-
- std::size_t bytes = in_x.size() * sizeof(Scalar);
-
- Scalar* d_in_x;
- Scalar* d_in_q;
- Scalar* d_out;
- hipMalloc((void**)(&d_in_x), bytes);
- hipMalloc((void**)(&d_in_q), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in_q, in_q.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
-
- gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
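- // zeta diverges at x = 1, so out(0) must be exactly +inf. For x = -10.5 the
- // implementation is expected to return NaN, so element 3 is excluded from the
- // approximate comparison below.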
- VERIFY_IS_EQUAL(out(0), expected_out(0));
- VERIFY((std::isnan)(out(3)));
-
- for (int i = 1; i < 6; ++i) {
- if (i != 3) {
- VERIFY_IS_APPROX(out(i), expected_out(i));
- }
- }
-
- hipFree(d_in_x);
- hipFree(d_in_q);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_polygamma()
-{
- Tensor<Scalar, 1> in_x(7);
- Tensor<Scalar, 1> in_n(7);
- Tensor<Scalar, 1> out(7);
- Tensor<Scalar, 1> expected_out(7);
- out.setZero();
-
- in_n(0) = Scalar(1);
- in_n(1) = Scalar(1);
- in_n(2) = Scalar(1);
- in_n(3) = Scalar(17);
- in_n(4) = Scalar(31);
- in_n(5) = Scalar(28);
- in_n(6) = Scalar(8);
-
- in_x(0) = Scalar(2);
- in_x(1) = Scalar(3);
- in_x(2) = Scalar(25.5);
- in_x(3) = Scalar(4.7);
- in_x(4) = Scalar(11.8);
- in_x(5) = Scalar(17.7);
- in_x(6) = Scalar(30.2);
-
- expected_out(0) = Scalar(0.644934066848);
- expected_out(1) = Scalar(0.394934066848);
- expected_out(2) = Scalar(0.0399946696496);
- expected_out(3) = Scalar(293.334565435);
- expected_out(4) = Scalar(0.445487887616);
- expected_out(5) = Scalar(-2.47810300902e-07);
- expected_out(6) = Scalar(-8.29668781082e-09);
-
- std::size_t bytes = in_x.size() * sizeof(Scalar);
-
- Scalar* d_in_x;
- Scalar* d_in_n;
- Scalar* d_out;
- hipMalloc((void**)(&d_in_x), bytes);
- hipMalloc((void**)(&d_in_n), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in_n, in_n.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
-
- gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 7; ++i) {
- VERIFY_IS_APPROX(out(i), expected_out(i));
- }
-
- hipFree(d_in_x);
- hipFree(d_in_n);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_igamma()
-{
- Tensor<Scalar, 2> a(6, 6);
- Tensor<Scalar, 2> x(6, 6);
- Tensor<Scalar, 2> out(6, 6);
- out.setZero();
-
- Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
- Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-
- for (int i = 0; i < 6; ++i) {
- for (int j = 0; j < 6; ++j) {
- a(i, j) = a_s[i];
- x(i, j) = x_s[j];
- }
- }
-
- Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
- Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
- {0.0, 0.6321205588285578, 0.7768698398515702,
- 0.9816843611112658, 9.999500016666262e-05, 1.0},
- {0.0, 0.4275932955291202, 0.608374823728911,
- 0.9539882943107686, 7.522076445089201e-07, 1.0},
- {0.0, 0.01898815687615381, 0.06564245437845008,
- 0.5665298796332909, 4.166333347221828e-18, 1.0},
- {0.0, 0.9999780593618628, 0.9999899967080838,
- 0.9999996219837988, 0.9991370418689945, 1.0},
- {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
-
- std::size_t bytes = a.size() * sizeof(Scalar);
-
- Scalar* d_a;
- Scalar* d_x;
- Scalar* d_out;
- assert(hipMalloc((void**)(&d_a), bytes) == hipSuccess);
- assert(hipMalloc((void**)(&d_x), bytes) == hipSuccess);
- assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
-
- hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
-
- gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 6; ++i) {
- for (int j = 0; j < 6; ++j) {
- if ((std::isnan)(igamma_s[i][j])) {
- VERIFY((std::isnan)(out(i, j)));
- } else {
- VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
- }
- }
- }
-
- hipFree(d_a);
- hipFree(d_x);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_igammac()
-{
- Tensor<Scalar, 2> a(6, 6);
- Tensor<Scalar, 2> x(6, 6);
- Tensor<Scalar, 2> out(6, 6);
- out.setZero();
-
- Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
- Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
-
- for (int i = 0; i < 6; ++i) {
- for (int j = 0; j < 6; ++j) {
- a(i, j) = a_s[i];
- x(i, j) = x_s[j];
- }
- }
-
- Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
- Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
- {1.0, 0.36787944117144233, 0.22313016014842982,
- 0.018315638888734182, 0.9999000049998333, 0.0},
- {1.0, 0.5724067044708798, 0.3916251762710878,
- 0.04601170568923136, 0.9999992477923555, 0.0},
- {1.0, 0.9810118431238462, 0.9343575456215499,
- 0.4334701203667089, 1.0, 0.0},
- {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
- 3.7801620118431334e-07, 0.0008629581310054535,
- 0.0},
- {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
-
- std::size_t bytes = a.size() * sizeof(Scalar);
-
- Scalar* d_a;
- Scalar* d_x;
- Scalar* d_out;
- hipMalloc((void**)(&d_a), bytes);
- hipMalloc((void**)(&d_x), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
-
- gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 6; ++i) {
- for (int j = 0; j < 6; ++j) {
- if ((std::isnan)(igammac_s[i][j])) {
- VERIFY((std::isnan)(out(i, j)));
- } else {
- VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
- }
- }
- }
-
- hipFree(d_a);
- hipFree(d_x);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_erf(const Scalar stddev)
-{
- Tensor<Scalar, 2> in(72,97);
- in.setRandom();
- in *= in.constant(stddev);
- Tensor<Scalar, 2> out(72,97);
- out.setZero();
-
- std::size_t bytes = in.size() * sizeof(Scalar);
-
- Scalar* d_in;
- Scalar* d_out;
- assert(hipMalloc((void**)(&d_in), bytes) == hipSuccess);
- assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
-
- hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
- gpu_out.device(gpu_device) = gpu_in.erf();
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 97; ++j) {
- VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
- }
- }
-
- hipFree(d_in);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_erfc(const Scalar stddev)
-{
- Tensor<Scalar, 2> in(72,97);
- in.setRandom();
- in *= in.constant(stddev);
- Tensor<Scalar, 2> out(72,97);
- out.setZero();
-
- std::size_t bytes = in.size() * sizeof(Scalar);
-
- Scalar* d_in;
- Scalar* d_out;
- hipMalloc((void**)(&d_in), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
-
- gpu_out.device(gpu_device) = gpu_in.erfc();
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 0; i < 72; ++i) {
- for (int j = 0; j < 97; ++j) {
- VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
- }
- }
-
- hipFree(d_in);
- hipFree(d_out);
-}
-
-template <typename Scalar>
-void test_hip_betainc()
-{
- Tensor<Scalar, 1> in_x(125);
- Tensor<Scalar, 1> in_a(125);
- Tensor<Scalar, 1> in_b(125);
- Tensor<Scalar, 1> out(125);
- Tensor<Scalar, 1> expected_out(125);
- out.setZero();
-
- Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-
- Array<Scalar, 1, Dynamic> x(125);
- Array<Scalar, 1, Dynamic> a(125);
- Array<Scalar, 1, Dynamic> b(125);
- Array<Scalar, 1, Dynamic> v(125);
-
- a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
- 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
- 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
- 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
- 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
- 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
-
- b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
- 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
- 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
- 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
- 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
- 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
- 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
- 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
- 999.999, 999.999, 999.999;
-
- x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
- 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
- 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
- 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
- 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
- -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
- 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
- 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
- 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
-
- v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
- nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
- nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
- 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
- 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
- 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
- nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
- 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
- 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
- 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
- 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
- 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
- nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
- nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
- nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
- nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
- 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
- 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
- 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
- 0.5000000000004947, 0.9999999999999999, nan;
-
- for (int i = 0; i < 125; ++i) {
- in_x(i) = x(i);
- in_a(i) = a(i);
- in_b(i) = b(i);
- expected_out(i) = v(i);
- }
-
- std::size_t bytes = in_x.size() * sizeof(Scalar);
-
- Scalar* d_in_x;
- Scalar* d_in_a;
- Scalar* d_in_b;
- Scalar* d_out;
- hipMalloc((void**)(&d_in_x), bytes);
- hipMalloc((void**)(&d_in_a), bytes);
- hipMalloc((void**)(&d_in_b), bytes);
- hipMalloc((void**)(&d_out), bytes);
-
- hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in_a, in_a.data(), bytes, hipMemcpyHostToDevice);
- hipMemcpy(d_in_b, in_b.data(), bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
- Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
-
- gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
-
- assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- for (int i = 1; i < 125; ++i) {
- if ((std::isnan)(expected_out(i))) {
- VERIFY((std::isnan)(out(i)));
- } else {
- VERIFY_IS_APPROX(out(i), expected_out(i));
- }
- }
-
- hipFree(d_in_x);
- hipFree(d_in_a);
- hipFree(d_in_b);
- hipFree(d_out);
-}
-
-
-void test_cxx11_tensor_hip()
-{
- CALL_SUBTEST(test_hip_nullary());
- CALL_SUBTEST(test_hip_elementwise_small());
- CALL_SUBTEST(test_hip_elementwise());
- CALL_SUBTEST(test_hip_props());
- CALL_SUBTEST(test_hip_reduction());
- CALL_SUBTEST(test_hip_contraction<ColMajor>());
- CALL_SUBTEST(test_hip_contraction<RowMajor>());
- CALL_SUBTEST(test_hip_convolution_1d<ColMajor>());
- CALL_SUBTEST(test_hip_convolution_1d<RowMajor>());
- CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
- CALL_SUBTEST(test_hip_convolution_inner_dim_row_major_1d());
- CALL_SUBTEST(test_hip_convolution_2d<ColMajor>());
- CALL_SUBTEST(test_hip_convolution_2d<RowMajor>());
- // The following two tests are commented out due to their long runtime.
- // CALL_SUBTEST(test_hip_convolution_3d<ColMajor>());
- // CALL_SUBTEST(test_hip_convolution_3d<RowMajor>());
-
-#if __cplusplus > 199711L
- // std::erf, std::erfc, and so on were only added in C++11. We use them
- // as a golden reference to validate the results produced by Eigen, so
- // these tests can only run under a C++11 compiler.
-
- CALL_SUBTEST(test_hip_lgamma<float>(1.0f));
- CALL_SUBTEST(test_hip_lgamma<float>(100.0f));
- CALL_SUBTEST(test_hip_lgamma<float>(0.01f));
- CALL_SUBTEST(test_hip_lgamma<float>(0.001f));
-
- CALL_SUBTEST(test_hip_lgamma<double>(1.0));
- CALL_SUBTEST(test_hip_lgamma<double>(100.0));
- CALL_SUBTEST(test_hip_lgamma<double>(0.01));
- CALL_SUBTEST(test_hip_lgamma<double>(0.001));
-
- CALL_SUBTEST(test_hip_erf<float>(1.0f));
- CALL_SUBTEST(test_hip_erf<float>(100.0f));
- CALL_SUBTEST(test_hip_erf<float>(0.01f));
- CALL_SUBTEST(test_hip_erf<float>(0.001f));
-
- CALL_SUBTEST(test_hip_erfc<float>(1.0f));
- // CALL_SUBTEST(test_hip_erfc<float>(100.0f)); // HIP erfc lacks precision for large inputs
- CALL_SUBTEST(test_hip_erfc<float>(5.0f));
- CALL_SUBTEST(test_hip_erfc<float>(0.01f));
- CALL_SUBTEST(test_hip_erfc<float>(0.001f));
-
- CALL_SUBTEST(test_hip_erf<double>(1.0));
- CALL_SUBTEST(test_hip_erf<double>(100.0));
- CALL_SUBTEST(test_hip_erf<double>(0.01));
- CALL_SUBTEST(test_hip_erf<double>(0.001));
-
- CALL_SUBTEST(test_hip_erfc<double>(1.0));
- // CALL_SUBTEST(test_hip_erfc<double>(100.0)); // HIP erfc lacks precision for large inputs
- CALL_SUBTEST(test_hip_erfc<double>(5.0));
- CALL_SUBTEST(test_hip_erfc<double>(0.01));
- CALL_SUBTEST(test_hip_erfc<double>(0.001));
-
- // The following tests have functional failures for some seeds
- // CALL_SUBTEST(test_hip_digamma<float>());
- // CALL_SUBTEST(test_hip_digamma<double>());
-
- // The following tests have functional failures for some seeds
- // CALL_SUBTEST(test_hip_polygamma<float>());
- // CALL_SUBTEST(test_hip_polygamma<double>());
-
- // The following tests have functional failures for some seeds
- // CALL_SUBTEST(test_hip_zeta<float>());
- // CALL_SUBTEST(test_hip_zeta<double>());
-
- CALL_SUBTEST(test_hip_igamma<float>());
- CALL_SUBTEST(test_hip_igammac<float>());
-
- CALL_SUBTEST(test_hip_igamma<double>());
- CALL_SUBTEST(test_hip_igammac<double>());
-
- CALL_SUBTEST(test_hip_betainc<float>());
- CALL_SUBTEST(test_hip_betainc<double>());
-#endif
-}
diff --git a/unsupported/test/cxx11_tensor_of_float16_hip.cu b/unsupported/test/cxx11_tensor_of_float16_hip.cu
deleted file mode 100644
index 6c1a401bf..000000000
--- a/unsupported/test/cxx11_tensor_of_float16_hip.cu
+++ /dev/null
@@ -1,498 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-using Eigen::Tensor;
-
-template<typename>
-void test_hip_numext() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
- bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
- d_res_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, num_elem);
-
- gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
- gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
- gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
-
- Tensor<bool, 1> half_prec(num_elem);
- Tensor<bool, 1> full_prec(num_elem);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
- gpu_device.synchronize();
-
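- // The inputs contain no NaNs, so both predicates should be false everywhere;
- // what matters is that the float and half paths agree element-wise.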
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking numext " << i << std::endl;
- VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-
-#ifdef EIGEN_HAS_HIP_FP16
-
-template<typename>
-void test_hip_conversion() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
- d_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
- d_conv, num_elem);
-
- gpu_float.device(gpu_device) = gpu_float.random();
- gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
- gpu_conv.device(gpu_device) = gpu_half.cast<float>();
-
- Tensor<float, 1> initial(num_elem);
- Tensor<float, 1> final(num_elem);
- gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
-
- for (int i = 0; i < num_elem; ++i) {
- VERIFY_IS_APPROX(initial(i), final(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_half);
- gpu_device.deallocate(d_conv);
-}
-
-template<typename>
-void test_hip_unary() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
- d_res_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, num_elem);
-
- gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
- gpu_res_float.device(gpu_device) = gpu_float.abs();
- gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
-
- Tensor<float, 1> half_prec(num_elem);
- Tensor<float, 1> full_prec(num_elem);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking unary " << i << std::endl;
- VERIFY_IS_APPROX(full_prec(i), half_prec(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_elementwise() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
- d_float1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
- d_float2, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
- d_res_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, num_elem);
-
- gpu_float1.device(gpu_device) = gpu_float1.random();
- gpu_float2.device(gpu_device) = gpu_float2.random();
- gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
- gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
-
- Tensor<float, 1> half_prec(num_elem);
- Tensor<float, 1> full_prec(num_elem);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
- gpu_device.synchronize();
-
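- // The half pipeline accumulates more rounding error, so both results are
- // rounded down to half precision before being compared.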
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
- VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
- }
-
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_transcendental() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- // Separate buffers for the expm1 results so the log1p results are not overwritten.
- Eigen::half* d_res4_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res4_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res4_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res4_float, num_elem);
-
- gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
- gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
- gpu_float3.device(gpu_device) = gpu_float3.random();
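- // float1 is centered on zero to exercise exp() on both signs; float2 is shifted
- // into (0.5, 1.5) so log() stays finite; float3 lies in [0, 1), which suits
- // log1p() and expm1().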
- gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
- gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
- gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
- gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
-
- gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
- gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
-
- gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
- gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
-
- gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
- gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
-
- gpu_res4_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
- gpu_res4_half.device(gpu_device) = gpu_res4_half.expm1();
-
- Tensor<float, 1> input1(num_elem);
- Tensor<Eigen::half, 1> half_prec1(num_elem);
- Tensor<Eigen::half, 1> full_prec1(num_elem);
- Tensor<float, 1> input2(num_elem);
- Tensor<Eigen::half, 1> half_prec2(num_elem);
- Tensor<Eigen::half, 1> full_prec2(num_elem);
- Tensor<float, 1> input3(num_elem);
- Tensor<Eigen::half, 1> half_prec3(num_elem);
- Tensor<Eigen::half, 1> full_prec3(num_elem);
- Tensor<Eigen::half, 1> half_prec4(num_elem);
- Tensor<Eigen::half, 1> full_prec4(num_elem);
- gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(half_prec4.data(), d_res4_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec4.data(), d_res4_float, num_elem*sizeof(Eigen::half));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
- VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
- }
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
- if (std::abs(input2(i) - 1.f) < 0.05f) // log lacks accuracy near 1
- VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
- else
- VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
- }
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
- VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
- }
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking elemwise expm1 " << i << " input = " << input3(i) << " full = " << full_prec4(i) << " half = " << half_prec4(i) << std::endl;
- VERIFY_IS_APPROX(full_prec4(i), half_prec4(i));
- }
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_float3);
- gpu_device.deallocate(d_res1_half);
- gpu_device.deallocate(d_res1_float);
- gpu_device.deallocate(d_res2_half);
- gpu_device.deallocate(d_res2_float);
- gpu_device.deallocate(d_res3_float);
- gpu_device.deallocate(d_res3_half);
- gpu_device.deallocate(d_res4_float);
- gpu_device.deallocate(d_res4_half);
-}
-
-template<typename>
-void test_hip_contractions() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int rows = 23;
- int cols = 23;
- int num_elem = rows*cols;
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
- Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
- d_float1, rows, cols);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
- d_float2, rows, cols);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
- d_res_half, rows, cols);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
- d_res_float, rows, cols);
-
- gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
- gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
-
- typedef Tensor<float, 2>::DimensionPair DimPair;
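- // DimPair(1, 0) contracts the second dimension of gpu_float1 with the first
- // dimension of gpu_float2, i.e. an ordinary matrix product.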
- Eigen::array<DimPair, 1> dims(DimPair(1, 0));
- gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
-
- Tensor<Eigen::half, 2> half_prec(rows, cols);
- Tensor<Eigen::half, 2> full_prec(rows, cols);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
- gpu_device.synchronize();
-
- for (int i = 0; i < rows; ++i) {
- for (int j = 0; j < cols; ++j) {
- std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
- if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
- VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
- }
- }
- }
-
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_reductions(int size1, int size2, int redux) {
-
- std::cout << "Reducing " << size1 << " by " << size2
- << " tensor along dim " << redux << std::endl;
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = size1*size2;
- int result_size = (redux == 1 ? size1 : size2);
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
- Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
- d_float1, size1, size2);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
- d_float2, size1, size2);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
- d_res_half, result_size);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, result_size);
-
- gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
- gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
-
- Eigen::array<int, 1> redux_dim(redux);
- gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
-
- Tensor<Eigen::half, 1> half_prec(result_size);
- Tensor<Eigen::half, 1> full_prec(result_size);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
- gpu_device.synchronize();
-
- for (int i = 0; i < result_size; ++i) {
- std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
- VERIFY_IS_APPROX(full_prec(i), half_prec(i));
- }
-
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_reductions() {
- test_hip_reductions<void>(13, 13, 0);
- test_hip_reductions<void>(13, 13, 1);
-
- test_hip_reductions<void>(35, 36, 0);
- test_hip_reductions<void>(35, 36, 1);
-
- test_hip_reductions<void>(36, 35, 0);
- test_hip_reductions<void>(36, 35, 1);
-}
-
-template<typename>
-void test_hip_full_reductions() {
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int size = 13;
- int num_elem = size*size;
-
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
- Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
- d_float1, size, size);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
- d_float2, size, size);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
- d_res_half);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
- d_res_float);
-
- gpu_float1.device(gpu_device) = gpu_float1.random();
- gpu_float2.device(gpu_device) = gpu_float2.random();
-
- gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
-
- Tensor<Eigen::half, 0> half_prec;
- Tensor<Eigen::half, 0> full_prec;
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
- gpu_device.synchronize();
-
- VERIFY_IS_APPROX(full_prec(), half_prec());
-
- gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
- gpu_device.synchronize();
-
- VERIFY_IS_APPROX(full_prec(), half_prec());
-
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
- gpu_device.deallocate(d_res_half);
- gpu_device.deallocate(d_res_float);
-}
-
-template<typename>
-void test_hip_forced_evals() {
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
- int num_elem = 101;
-
- float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
- d_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
- d_res_half1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
- d_res_half2, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
- d_res_float, num_elem);
-
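- // A broadcast by a factor of 1 leaves the values unchanged but still forces
- // the expression through the broadcast/eval code path; together with .eval()
- // below, this exercises forced evaluation into aligned and unaligned outputs.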
- Eigen::array<int, 1> no_bcast;
- no_bcast[0] = 1;
-
- gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
- gpu_res_float.device(gpu_device) = gpu_float.abs();
- gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
- gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
-
- Tensor<float, 1> half_prec1(num_elem);
- Tensor<float, 1> half_prec2(num_elem);
- Tensor<float, 1> full_prec(num_elem);
- gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
- gpu_device.synchronize();
-
- for (int i = 0; i < num_elem; ++i) {
- std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
- VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
- VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
- }
-
- gpu_device.deallocate(d_float);
- gpu_device.deallocate(d_res_half1);
- gpu_device.deallocate(d_res_half2);
- gpu_device.deallocate(d_res_float);
-}
-#endif
-
-
-void test_cxx11_tensor_of_float16_hip()
-{
- CALL_SUBTEST(test_hip_numext<void>());
-
-#ifdef EIGEN_HAS_HIP_FP16
- CALL_SUBTEST(test_hip_conversion<void>());
- CALL_SUBTEST(test_hip_unary<void>());
- CALL_SUBTEST(test_hip_elementwise<void>());
- CALL_SUBTEST(test_hip_transcendental<void>());
- CALL_SUBTEST(test_hip_contractions<void>());
- CALL_SUBTEST(test_hip_reductions<void>());
- CALL_SUBTEST(test_hip_full_reductions<void>());
- CALL_SUBTEST(test_hip_forced_evals<void>());
-#else
- std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
-#endif
-}
diff --git a/unsupported/test/cxx11_tensor_random_hip.cu b/unsupported/test/cxx11_tensor_random_hip.cu
deleted file mode 100644
index 7d7e72ca2..000000000
--- a/unsupported/test/cxx11_tensor_random_hip.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <Eigen/CXX11/Tensor>
-
-
-void test_hip_random_uniform()
-{
- Tensor<float, 2> out(72,97);
- out.setZero();
-
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_out;
- hipMalloc((void**)(&d_out), out_bytes);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
- gpu_out.device(gpu_device) = gpu_out.random();
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-
- // For now we just check that the code doesn't crash.
- // TODO: come up with a valid test of randomness
-}
-
-
-void test_hip_random_normal()
-{
- Tensor<float, 2> out(72,97);
- out.setZero();
-
- std::size_t out_bytes = out.size() * sizeof(float);
-
- float* d_out;
- hipMalloc((void**)(&d_out), out_bytes);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
-
- Eigen::internal::NormalRandomGenerator<float> gen(true);
- gpu_out.device(gpu_device) = gpu_out.random(gen);
-
- assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
- assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
-}
-
-static void test_complex()
-{
- Tensor<std::complex<float>, 1> vec(6);
- vec.setRandom();
-
- // Fixme: we should check that the generated numbers follow a uniform
- // distribution instead.
- for (int i = 1; i < 6; ++i) {
- VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
- }
-}
-
-
-void test_cxx11_tensor_random_hip()
-{
- CALL_SUBTEST(test_hip_random_uniform());
- CALL_SUBTEST(test_hip_random_normal());
- CALL_SUBTEST(test_complex());
-}
diff --git a/unsupported/test/cxx11_tensor_reduction_hip.cu b/unsupported/test/cxx11_tensor_reduction_hip.cu
deleted file mode 100644
index c5aad05be..000000000
--- a/unsupported/test/cxx11_tensor_reduction_hip.cu
+++ /dev/null
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-template<typename Type, int DataLayout>
-static void test_full_reductions() {
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- const int num_rows = internal::random<int>(1024, 5*1024);
- const int num_cols = internal::random<int>(1024, 5*1024);
-
- Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
- in.setRandom();
-
- Tensor<Type, 0, DataLayout> full_redux;
- full_redux = in.sum();
-
- std::size_t in_bytes = in.size() * sizeof(Type);
- std::size_t out_bytes = full_redux.size() * sizeof(Type);
- Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
- Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
- gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
-
- TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
- TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
-
- out_gpu.device(gpu_device) = in_gpu.sum();
-
- Tensor<Type, 0, DataLayout> full_redux_gpu;
- gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
- gpu_device.synchronize();
-
- // Check that the CPU and GPU reductions return the same result.
- VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
-
- gpu_device.deallocate(gpu_in_ptr);
- gpu_device.deallocate(gpu_out_ptr);
-}
-
-template<typename Type, int DataLayout>
-static void test_first_dim_reductions() {
- int dim_x = 33;
- int dim_y = 1;
- int dim_z = 128;
-
- Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
- in.setRandom();
-
- Eigen::array<int, 1> red_axis;
- red_axis[0] = 0;
- Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
-
- // Create device
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice dev(&stream);
-
- // Create data(T)
- Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
- Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
- Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
- Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
-
- // Perform operation
- dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
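- // The reduction is assigned once and then accumulated once more, so the
- // device result should equal twice the CPU reduction.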
- gpu_out.device(dev) = gpu_in.sum(red_axis);
- gpu_out.device(dev) += gpu_in.sum(red_axis);
- Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
- dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
- dev.synchronize();
-
- // Check that the CPU and GPU reductions return the same result.
- for (int i = 0; i < gpu_out.size(); ++i) {
- VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
- }
-
- dev.deallocate(in_data);
- dev.deallocate(out_data);
-}
-
-template<typename Type, int DataLayout>
-static void test_last_dim_reductions() {
- int dim_x = 128;
- int dim_y = 1;
- int dim_z = 33;
-
- Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
- in.setRandom();
-
- Eigen::array<int, 1> red_axis;
- red_axis[0] = 2;
- Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
-
- // Create device
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice dev(&stream);
-
- // Create data
- Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
- Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
- Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
- Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
-
- // Perform operation
- dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
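- // As above, assign-then-accumulate: the device result should be 2 * redux.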
- gpu_out.device(dev) = gpu_in.sum(red_axis);
- gpu_out.device(dev) += gpu_in.sum(red_axis);
- Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
- dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
- dev.synchronize();
-
- // Check that the CPU and GPU reductions return the same result.
- for (int i = 0; i < gpu_out.size(); ++i) {
- VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
- }
-
- dev.deallocate(in_data);
- dev.deallocate(out_data);
-}
-
-
-void test_cxx11_tensor_reduction_hip() {
- CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
- CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
- CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
- CALL_SUBTEST((test_full_reductions<double, RowMajor>()));
-
- CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
- CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
- CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
-// Outer reductions of doubles aren't supported just yet.
-// CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()))
-
- CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
-// Outer reductions of doubles aren't supported just yet.
-// CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
- CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
- CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
-}
diff --git a/unsupported/test/cxx11_tensor_scan_hip.cu b/unsupported/test/cxx11_tensor_scan_hip.cu
deleted file mode 100644
index f4c4f59b9..000000000
--- a/unsupported/test/cxx11_tensor_scan_hip.cu
+++ /dev/null
@@ -1,76 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#define EIGEN_USE_GPU
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-typedef Tensor<float, 1>::DimensionPair DimPair;
-
-template<int DataLayout>
-void test_hip_cumsum(int m_size, int k_size, int n_size)
-{
- std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
- Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
- Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
- Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
-
- t_input.setRandom();
-
- std::size_t t_input_bytes = t_input.size() * sizeof(float);
- std::size_t t_result_bytes = t_result.size() * sizeof(float);
-
- float* d_t_input;
- float* d_t_result;
-
- hipMalloc((void**)(&d_t_input), t_input_bytes);
- hipMalloc((void**)(&d_t_result), t_result_bytes);
-
- hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);
-
- Eigen::HipStreamDevice stream;
- Eigen::GpuDevice gpu_device(&stream);
-
- Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
- gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
- Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
- gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
-
- gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
- t_result = t_input.cumsum(1);
-
- hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
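- // Accept a match under either a small absolute tolerance or Eigen's
- // approximate (relative) comparison before flagging a mismatch.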
- for (DenseIndex i = 0; i < t_result.size(); i++) {
- if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
- continue;
- }
- if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
- continue;
- }
- std::cout << "mismatch detected at index " << i << ": " << t_result(i)
- << " vs " << t_result_gpu(i) << std::endl;
- assert(false);
- }
-
- hipFree((void*)d_t_input);
- hipFree((void*)d_t_result);
-}
-
-
-void test_cxx11_tensor_scan_hip()
-{
- CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
- CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
-}