diff options
Diffstat (limited to 'unsupported/test')
-rw-r--r-- | unsupported/test/cxx11_tensor_argmax_hip.cu | 251 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_cast_float16_hip.cu | 79 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_contract_hip.cu | 215 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_device_hip.cu | 389 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_hip.cu | 1295 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_of_float16_hip.cu | 498 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_random_hip.cu | 85 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_reduction_hip.cu | 154 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_scan_hip.cu | 76 |
9 files changed, 0 insertions, 3042 deletions
diff --git a/unsupported/test/cxx11_tensor_argmax_hip.cu b/unsupported/test/cxx11_tensor_argmax_hip.cu deleted file mode 100644 index 57d6ca000..000000000 --- a/unsupported/test/cxx11_tensor_argmax_hip.cu +++ /dev/null @@ -1,251 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_hip -#define EIGEN_USE_GPU - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::Tensor; - -template <int Layout> -void test_hip_simple_argmax() -{ - Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97)); - Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1)); - Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1)); - in.setRandom(); - in *= in.constant(100.0); - in(0, 0, 0) = -1000.0; - in(71, 52, 96) = 1000.0; - - std::size_t in_bytes = in.size() * sizeof(double); - std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); - - double* d_in; - DenseIndex* d_out_max; - DenseIndex* d_out_min; - hipMalloc((void**)(&d_in), in_bytes); - hipMalloc((void**)(&d_out_max), out_bytes); - hipMalloc((void**)(&d_out_min), out_bytes); - - hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97)); - Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1)); - Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1)); - - gpu_out_max.device(gpu_device) = gpu_in.argmax(); - gpu_out_min.device(gpu_device) = gpu_in.argmin(); - - assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); - VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); - - hipFree(d_in); - hipFree(d_out_max); - hipFree(d_out_min); -} - -template <int DataLayout> -void test_hip_argmax_dim() -{ - Tensor<float, 4, DataLayout> tensor(2,3,5,7); - std::vector<int> dims; - dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7); - - for (int dim = 0; dim < 4; ++dim) { - tensor.setRandom(); - tensor = (tensor + tensor.constant(0.5)).log(); - - array<DenseIndex, 3> out_shape; - for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; - - Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape); - - array<DenseIndex, 4> ix; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - if (ix[dim] != 0) continue; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix) = 10.0; - } - } - } - } - - std::size_t in_bytes = tensor.size() * sizeof(float); - std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); - - float* d_in; - DenseIndex* d_out; - hipMalloc((void**)(&d_in), in_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); - Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape); - - gpu_out.device(gpu_device) = gpu_in.argmax(dim); - - assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - VERIFY_IS_EQUAL(tensor_arg.size(), - size_t(2*3*5*7 / tensor.dimension(dim))); - - for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { - // Expect max to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); - } - - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - if (ix[dim] != tensor.dimension(dim) - 1) continue; - // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix) = 20.0; - } - } - } - } - - hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice); - - gpu_out.device(gpu_device) = gpu_in.argmax(dim); - - assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { - // Expect max to be in the last index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); - } - - hipFree(d_in); - hipFree(d_out); - } -} - -template <int DataLayout> -void test_hip_argmin_dim() -{ - Tensor<float, 4, DataLayout> tensor(2,3,5,7); - std::vector<int> dims; - dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7); - - for (int dim = 0; dim < 4; ++dim) { - tensor.setRandom(); - tensor = (tensor + tensor.constant(0.5)).log(); - - array<DenseIndex, 3> out_shape; - for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; - - Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape); - - array<DenseIndex, 4> ix; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - if (ix[dim] != 0) continue; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix) = -10.0; - } - } - } - } - - std::size_t in_bytes = tensor.size() * sizeof(float); - std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); - - float* d_in; - DenseIndex* d_out; - hipMalloc((void**)(&d_in), in_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); - Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape); - - gpu_out.device(gpu_device) = gpu_in.argmin(dim); - - assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - VERIFY_IS_EQUAL(tensor_arg.size(), - 2*3*5*7 / tensor.dimension(dim)); - - for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { - // Expect min to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); - } - - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - if (ix[dim] != tensor.dimension(dim) - 1) continue; - // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix) = -20.0; - } - } - } - } - - hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice); - - gpu_out.device(gpu_device) = gpu_in.argmin(dim); - - assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { - // Expect max to be in the last index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); - } - - hipFree(d_in); - hipFree(d_out); - } -} - -void test_cxx11_tensor_hip() -{ - CALL_SUBTEST(test_hip_simple_argmax<RowMajor>()); - CALL_SUBTEST(test_hip_simple_argmax<ColMajor>()); - CALL_SUBTEST(test_hip_argmax_dim<RowMajor>()); - CALL_SUBTEST(test_hip_argmax_dim<ColMajor>()); - CALL_SUBTEST(test_hip_argmin_dim<RowMajor>()); - CALL_SUBTEST(test_hip_argmin_dim<ColMajor>()); -} diff --git a/unsupported/test/cxx11_tensor_cast_float16_hip.cu b/unsupported/test/cxx11_tensor_cast_float16_hip.cu deleted file mode 100644 index bf6a49df4..000000000 --- a/unsupported/test/cxx11_tensor_cast_float16_hip.cu +++ /dev/null @@ -1,79 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::Tensor; - -void test_hip_conversion() { - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int num_elem = 101; - - Tensor<float, 1> floats(num_elem); - floats.setRandom(); - - float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( - d_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half( - d_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv( - d_conv, num_elem); - - gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float)); - - gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>(); - gpu_conv.device(gpu_device) = gpu_half.cast<float>(); - - Tensor<float, 1> initial(num_elem); - Tensor<float, 1> final(num_elem); - gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); - gpu_device.synchronize(); - - for (int i = 0; i < num_elem; ++i) { - VERIFY_IS_APPROX(initial(i), final(i)); - } - - gpu_device.deallocate(d_float); - gpu_device.deallocate(d_half); - gpu_device.deallocate(d_conv); -} - - -void test_fallback_conversion() { - int num_elem = 101; - Tensor<float, 1> floats(num_elem); - floats.setRandom(); - - Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>(); - Eigen::Tensor<float, 1> conv = halfs.cast<float>(); - - for (int i = 0; i < num_elem; ++i) { - VERIFY_IS_APPROX(floats(i), conv(i)); - } -} - - -void test_cxx11_tensor_cast_float16_hip() -{ - CALL_SUBTEST(test_hip_conversion()); - CALL_SUBTEST(test_fallback_conversion()); -} diff --git a/unsupported/test/cxx11_tensor_contract_hip.cu b/unsupported/test/cxx11_tensor_contract_hip.cu deleted file mode 100644 index 652af0ab0..000000000 --- a/unsupported/test/cxx11_tensor_contract_hip.cu +++ /dev/null @@ -1,215 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_hip -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - - -using Eigen::Tensor; -typedef Tensor<float, 1>::DimensionPair DimPair; - -template<int DataLayout> -void test_hip_contraction(int m_size, int k_size, int n_size) -{ - std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(m_size, k_size); - Tensor<float, 2, DataLayout> t_right(k_size, n_size); - Tensor<float, 2, DataLayout> t_result(m_size, n_size); - Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); - Eigen::array<DimPair, 1> dims(DimPair(1, 0)); - - t_left.setRandom(); - t_right.setRandom(); - - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size() * sizeof(float); - - float* d_t_left; - float* d_t_right; - float* d_t_result; - - hipMalloc((void**)(&d_t_left), t_left_bytes); - hipMalloc((void**)(&d_t_right), t_right_bytes); - hipMalloc((void**)(&d_t_result), t_result_bytes); - - hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > - gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size)); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > - gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size)); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > - gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size)); - - - gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); - t_result = t_left.contract(t_right, dims); - - hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost); - for (DenseIndex i = 0; i < t_result.size(); i++) { - if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { - continue; - } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { - continue; - } - std::cout << "mismatch detected at index " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); - } - - hipFree((void*)d_t_left); - hipFree((void*)d_t_right); - hipFree((void*)d_t_result); -} - - -template<int DataLayout> -void test_scalar(int m_size, int k_size, int n_size) -{ - std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(m_size, k_size); - Tensor<float, 2, DataLayout> t_right(k_size, n_size); - Tensor<float, 0, DataLayout> t_result; - Tensor<float, 0, DataLayout> t_result_gpu; - Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1)); - - t_left.setRandom(); - t_right.setRandom(); - - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = sizeof(float); - - float* d_t_left; - float* d_t_right; - float* d_t_result; - - hipMalloc((void**)(&d_t_left), t_left_bytes); - hipMalloc((void**)(&d_t_right), t_right_bytes); - hipMalloc((void**)(&d_t_result), t_result_bytes); - - hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > - gpu_t_left(d_t_left, m_size, k_size); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > - gpu_t_right(d_t_right, k_size, n_size); - Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > - gpu_t_result(d_t_result); - - gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); - t_result = t_left.contract(t_right, dims); - - hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost); - if (fabs(t_result() - t_result_gpu()) > 1e-4f && - !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { - std::cout << "mismatch detected: " << t_result() - << " vs " << t_result_gpu() << std::endl; - assert(false); - } - - hipFree((void*)d_t_left); - hipFree((void*)d_t_right); - hipFree((void*)d_t_result); -} - - -template<int DataLayout> -void test_hip_contraction_m() { - for (int k = 32; k < 256; k++) { - test_hip_contraction<ColMajor>(k, 128, 128); - test_hip_contraction<RowMajor>(k, 128, 128); - } -} - -template<int DataLayout> -void test_hip_contraction_k() { - for (int k = 32; k < 256; k++) { - test_hip_contraction<ColMajor>(128, k, 128); - test_hip_contraction<RowMajor>(128, k, 128); - } -} - -template<int DataLayout> -void test_hip_contraction_n() { - for (int k = 32; k < 256; k++) { - test_hip_contraction<ColMajor>(128, 128, k); - test_hip_contraction<RowMajor>(128, 128, k); - } -} - - -template<int DataLayout> -void test_hip_contraction_sizes() { - int m_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257 , 511, - 512, 513, 1023, 1024, 1025}; - - int n_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - - for (int i = 0; i < 15; i++) { - for (int j = 0; j < 15; j++) { - for (int k = 0; k < 17; k++) { - test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); - } - } - } -} - -void test_cxx11_tensor_hip() -{ - CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128)); - CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128)); - - CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128)); - CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128)); - - CALL_SUBTEST(test_hip_contraction_m<ColMajor>()); - CALL_SUBTEST(test_hip_contraction_m<RowMajor>()); - - CALL_SUBTEST(test_hip_contraction_k<ColMajor>()); - CALL_SUBTEST(test_hip_contraction_k<RowMajor>()); - - CALL_SUBTEST(test_hip_contraction_n<ColMajor>()); - CALL_SUBTEST(test_hip_contraction_n<RowMajor>()); - - // Commenting out these tests due to long runtimes - // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>()); - // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>()); -} diff --git a/unsupported/test/cxx11_tensor_device_hip.cu b/unsupported/test/cxx11_tensor_device_hip.cu deleted file mode 100644 index b98c481ff..000000000 --- a/unsupported/test/cxx11_tensor_device_hip.cu +++ /dev/null @@ -1,389 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_device -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::Tensor; -using Eigen::RowMajor; - -// Context for evaluation on cpu -struct CPUContext { - CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { - kernel_1d_(0) = 3.14f; - kernel_1d_(1) = 2.7f; - - kernel_2d_(0,0) = 3.14f; - kernel_2d_(1,0) = 2.7f; - kernel_2d_(0,1) = 0.2f; - kernel_2d_(1,1) = 7.0f; - - kernel_3d_(0,0,0) = 3.14f; - kernel_3d_(0,1,0) = 2.7f; - kernel_3d_(0,0,1) = 0.2f; - kernel_3d_(0,1,1) = 7.0f; - kernel_3d_(1,0,0) = -1.0f; - kernel_3d_(1,1,0) = -0.3f; - kernel_3d_(1,0,1) = -0.7f; - kernel_3d_(1,1,1) = -0.5f; - } - - const Eigen::DefaultDevice& device() const { return cpu_device_; } - - const Eigen::Tensor<float, 3>& in1() const { return in1_; } - const Eigen::Tensor<float, 3>& in2() const { return in2_; } - Eigen::Tensor<float, 3>& out() { return out_; } - const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; } - const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; } - const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; } - - private: - const Eigen::Tensor<float, 3>& in1_; - const Eigen::Tensor<float, 3>& in2_; - Eigen::Tensor<float, 3>& out_; - - Eigen::Tensor<float, 1> kernel_1d_; - Eigen::Tensor<float, 2> kernel_2d_; - Eigen::Tensor<float, 3> kernel_3d_; - - Eigen::DefaultDevice cpu_device_; -}; - - -// Context for evaluation on GPU -struct GPUContext { - GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { - assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess); - float kernel_1d_val[] = {3.14f, 2.7f}; - assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess); - - assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess); - float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; - assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess); - - assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess); - float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; - assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess); - } - ~GPUContext() { - assert(hipFree(kernel_1d_) == hipSuccess); - assert(hipFree(kernel_2d_) == hipSuccess); - assert(hipFree(kernel_3d_) == hipSuccess); - } - - const Eigen::GpuDevice& device() const { return gpu_device_; } - - const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; } - const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; } - Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; } - Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); } - Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); } - Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); } - - private: - const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_; - const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_; - Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_; - - float* kernel_1d_; - float* kernel_2d_; - float* kernel_3d_; - - Eigen::HipStreamDevice stream_; - Eigen::GpuDevice gpu_device_; -}; - - -// The actual expression to evaluate -template <typename Context> -void test_contextual_eval(Context* context) -{ - context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); -} - -template <typename Context> -void test_forced_contextual_eval(Context* context) -{ - context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); -} - -template <typename Context> -void test_compound_assignment(Context* context) -{ - context->out().device(context->device()) = context->in1().constant(2.718f); - context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; -} - - -template <typename Context> -void test_contraction(Context* context) -{ - Eigen::array<std::pair<int, int>, 2> dims; - dims[0] = std::make_pair(1, 1); - dims[1] = std::make_pair(2, 2); - - Eigen::array<int, 2> shape(40, 50*70); - - Eigen::DSizes<int, 2> indices(0,0); - Eigen::DSizes<int, 2> sizes(40,40); - - context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); -} - - -template <typename Context> -void test_1d_convolution(Context* context) -{ - Eigen::DSizes<int, 3> indices(0,0,0); - Eigen::DSizes<int, 3> sizes(40,49,70); - - Eigen::array<int, 1> dims(1); - context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); -} - -template <typename Context> -void test_2d_convolution(Context* context) -{ - Eigen::DSizes<int, 3> indices(0,0,0); - Eigen::DSizes<int, 3> sizes(40,49,69); - - Eigen::array<int, 2> dims(1,2); - context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); -} - -template <typename Context> -void test_3d_convolution(Context* context) -{ - Eigen::DSizes<int, 3> indices(0,0,0); - Eigen::DSizes<int, 3> sizes(39,49,69); - - Eigen::array<int, 3> dims(0,1,2); - context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); -} - - -void test_cpu() { - Eigen::Tensor<float, 3> in1(40,50,70); - Eigen::Tensor<float, 3> in2(40,50,70); - Eigen::Tensor<float, 3> out(40,50,70); - - in1 = in1.random() + in1.constant(10.0f); - in2 = in2.random() + in2.constant(10.0f); - - CPUContext context(in1, in2, out); - test_contextual_eval(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); - } - } - } - - test_forced_contextual_eval(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); - } - } - } - - test_compound_assignment(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); - } - } - } - - test_contraction(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 40; ++j) { - const float result = out(i,j,0); - float expected = 0; - for (int k = 0; k < 50; ++k) { - for (int l = 0; l < 70; ++l) { - expected += in1(i, k, l) * in2(j, k, l); - } - } - VERIFY_IS_APPROX(expected, result); - } - } - - test_1d_convolution(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); - } - } - } - - test_2d_convolution(&context); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 69; ++k) { - const float result = out(i,j,k); - const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + - (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); - if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { - continue; - } - VERIFY_IS_APPROX(expected, result); - } - } - } - - test_3d_convolution(&context); - for (int i = 0; i < 39; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 69; ++k) { - const float result = out(i,j,k); - const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + - in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + - (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + - in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); - if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { - continue; - } - VERIFY_IS_APPROX(expected, result); - } - } - } -} - -void test_gpu() { - Eigen::Tensor<float, 3> in1(40,50,70); - Eigen::Tensor<float, 3> in2(40,50,70); - Eigen::Tensor<float, 3> out(40,50,70); - in1 = in1.random() + in1.constant(10.0f); - in2 = in2.random() + in2.constant(10.0f); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t in2_bytes = in2.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_in1; - float* d_in2; - float* d_out; - hipMalloc((void**)(&d_in1), in1_bytes); - hipMalloc((void**)(&d_in2), in2_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice); - - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70); - - GPUContext context(gpu_in1, gpu_in2, gpu_out); - test_contextual_eval(&context); - assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); - } - } - } - - test_forced_contextual_eval(&context); - assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); - } - } - } - - test_compound_assignment(&context); - assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 50; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); - } - } - } - - test_contraction(&context); - assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 40; ++j) { - const float result = out(i,j,0); - float expected = 0; - for (int k = 0; k < 50; ++k) { - for (int l = 0; l < 70; ++l) { - expected += in1(i, k, l) * in2(j, k, l); - } - } - VERIFY_IS_APPROX(expected, result); - } - } - - test_1d_convolution(&context); - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess); - assert(hipStreamSynchronize(context.device().stream()) == hipSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); - } - } - } - - test_2d_convolution(&context); - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess); - assert(hipStreamSynchronize(context.device().stream()) == hipSuccess); - for (int i = 0; i < 40; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 69; ++k) { - const float result = out(i,j,k); - const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + - in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); - VERIFY_IS_APPROX(expected, result); - } - } - } - - /* - test_3d_convolution(&context); - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess); - assert(hipStreamSynchronize(context.device().stream()) == hipSuccess); - for (int i = 0; i < 39; ++i) { - for (int j = 0; j < 49; ++j) { - for (int k = 0; k < 69; ++k) { - const float result = out(i,j,k); - const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + - in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f + - in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + - in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); - VERIFY_IS_APPROX(expected, result); - } - } - } - */ -} - - -void test_cxx11_tensor_device() -{ - CALL_SUBTEST(test_cpu()); - CALL_SUBTEST(test_gpu()); -} diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu deleted file mode 100644 index b28840267..000000000 --- a/unsupported/test/cxx11_tensor_hip.cu +++ /dev/null @@ -1,1295 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_hip -#define EIGEN_USE_GPU - -#ifdef __NVCC__ -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif -#endif -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::Tensor; - -void test_hip_nullary() { - Tensor<float, 1, 0, int> in1(2); - Tensor<float, 1, 0, int> in2(2); - in1.setRandom(); - in2.setRandom(); - - std::size_t tensor_bytes = in1.size() * sizeof(float); - - float* d_in1; - float* d_in2; - hipMalloc((void**)(&d_in1), tensor_bytes); - hipMalloc((void**)(&d_in2), tensor_bytes); - hipMemcpy(d_in1, in1.data(), tensor_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in2, in2.data(), tensor_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1( - d_in1, 2); - Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2( - d_in2, 2); - - gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f); - gpu_in2.device(gpu_device) = gpu_in2.random(); - - Tensor<float, 1, 0, int> new1(2); - Tensor<float, 1, 0, int> new2(2); - - assert(hipMemcpyAsync(new1.data(), d_in1, tensor_bytes, hipMemcpyDeviceToHost, - gpu_device.stream()) == hipSuccess); - assert(hipMemcpyAsync(new2.data(), d_in2, tensor_bytes, hipMemcpyDeviceToHost, - gpu_device.stream()) == hipSuccess); - - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 2; ++i) { - VERIFY_IS_APPROX(new1(i), 3.14f); - VERIFY_IS_NOT_EQUAL(new2(i), in2(i)); - } - - hipFree(d_in1); - hipFree(d_in2); -} - -void test_hip_elementwise_small() { - Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>{2}); - Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>{2}); - Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>{2}); - in1.setRandom(); - in2.setRandom(); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t in2_bytes = in2.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_in1; - float* d_in2; - float* d_out; - hipMalloc((void**)(&d_in1), in1_bytes); - hipMalloc((void**)(&d_in2), in2_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( - d_in1, Eigen::array<Eigen::DenseIndex, 1>{2}); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2( - d_in2, Eigen::array<Eigen::DenseIndex, 1>{2}); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out( - d_out, Eigen::array<Eigen::DenseIndex, 1>{2}); - - gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, - gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 2; ++i) { - VERIFY_IS_APPROX( - out(Eigen::array<Eigen::DenseIndex, 1>{i}), - in1(Eigen::array<Eigen::DenseIndex, 1>{i}) + in2(Eigen::array<Eigen::DenseIndex, 1>{i})); - } - - hipFree(d_in1); - hipFree(d_in2); - hipFree(d_out); -} - -void test_hip_elementwise() -{ - Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); - Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); - Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); - Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); - in1.setRandom(); - in2.setRandom(); - in3.setRandom(); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t in2_bytes = in2.size() * sizeof(float); - std::size_t in3_bytes = in3.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_in1; - float* d_in2; - float* d_in3; - float* d_out; - hipMalloc((void**)(&d_in1), in1_bytes); - hipMalloc((void**)(&d_in2), in2_bytes); - hipMalloc((void**)(&d_in3), in3_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in3, in3.data(), in3_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); - - gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 53; ++j) { - for (int k = 0; k < 97; ++k) { - VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}), in1(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) + in2(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) * in3(Eigen::array<Eigen::DenseIndex, 3>{i,j,k})); - } - } - } - - hipFree(d_in1); - hipFree(d_in2); - hipFree(d_in3); - hipFree(d_out); -} - -void test_hip_props() { - Tensor<float, 1> in1(200); - Tensor<bool, 1> out(200); - in1.setRandom(); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(bool); - - float* d_in1; - bool* d_out; - hipMalloc((void**)(&d_in1), in1_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( - d_in1, 200); - Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out( - d_out, 200); - - gpu_out.device(gpu_device) = (gpu_in1.isnan)(); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, - gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 200; ++i) { - VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); - } - - hipFree(d_in1); - hipFree(d_out); -} - -void test_hip_reduction() -{ - Tensor<float, 4> in1(72,53,97,113); - Tensor<float, 2> out(72,97); - in1.setRandom(); - - std::size_t in1_bytes = in1.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_in1; - float* d_out; - hipMalloc((void**)(&d_in1), in1_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); - - array<Eigen::DenseIndex, 2> reduction_axis; - reduction_axis[0] = 1; - reduction_axis[1] = 3; - - gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 97; ++j) { - float expected = 0; - for (int k = 0; k < 53; ++k) { - for (int l = 0; l < 113; ++l) { - expected = - std::max<float>(expected, in1(i, k, j, l)); - } - } - VERIFY_IS_APPROX(out(i,j), expected); - } - } - - hipFree(d_in1); - hipFree(d_out); -} - -template<int DataLayout> -void test_hip_contraction() -{ - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU - Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31); - Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>{3, 31, 7, 20, 1}); - Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>{6, 50, 7, 20, 1}); - - t_left.setRandom(); - t_right.setRandom(); - - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size() * sizeof(float); - - float* d_t_left; - float* d_t_right; - float* d_t_result; - - hipMalloc((void**)(&d_t_left), t_left_bytes); - hipMalloc((void**)(&d_t_right), t_right_bytes); - hipMalloc((void**)(&d_t_result), t_result_bytes); - - hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); - - typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf; - MapXf m_left(t_left.data(), 300, 93); - MapXf m_right(t_right.data(), 93, 140); - Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140); - - typedef Tensor<float, 1>::DimensionPair DimPair; - Eigen::array<DimPair, 2> dims; - dims[0] = DimPair(2, 0); - dims[1] = DimPair(3, 1); - - m_result = m_left * m_right; - gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); - - hipMemcpy(t_result.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost); - - for (DenseIndex i = 0; i < t_result.size(); i++) { - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { - std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; - assert(false); - } - } - - hipFree(d_t_left); - hipFree(d_t_right); - hipFree(d_t_result); -} - - -template<int DataLayout> -void test_hip_convolution_1d() -{ - Tensor<float, 4, DataLayout> input(74,37,11,137); - Tensor<float, 1, DataLayout> kernel(4); - Tensor<float, 4, DataLayout> out(74,34,11,137); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - hipMalloc((void**)(&d_input), input_bytes); - hipMalloc((void**)(&d_kernel), kernel_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); - Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4); - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137); - - Eigen::array<Eigen::DenseIndex, 1> dims{1}; - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 34; ++j) { - for (int k = 0; k < 11; ++k) { - for (int l = 0; l < 137; ++l) { - const float result = out(i,j,k,l); - const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + - input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - - hipFree(d_input); - hipFree(d_kernel); - hipFree(d_out); -} - -void test_hip_convolution_inner_dim_col_major_1d() -{ - Tensor<float, 4, ColMajor> input(74,9,11,7); - Tensor<float, 1, ColMajor> kernel(4); - Tensor<float, 4, ColMajor> out(71,9,11,7); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - hipMalloc((void**)(&d_input), input_bytes); - hipMalloc((void**)(&d_kernel), kernel_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); - Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4); - Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7); - - Eigen::array<Eigen::DenseIndex, 1> dims{0}; - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 71; ++i) { - for (int j = 0; j < 9; ++j) { - for (int k = 0; k < 11; ++k) { - for (int l = 0; l < 7; ++l) { - const float result = out(i,j,k,l); - const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + - input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - - hipFree(d_input); - hipFree(d_kernel); - hipFree(d_out); -} - -void test_hip_convolution_inner_dim_row_major_1d() -{ - Tensor<float, 4, RowMajor> input(7,9,11,74); - Tensor<float, 1, RowMajor> kernel(4); - Tensor<float, 4, RowMajor> out(7,9,11,71); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - hipMalloc((void**)(&d_input), input_bytes); - hipMalloc((void**)(&d_kernel), kernel_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); - Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4); - Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71); - - Eigen::array<Eigen::DenseIndex, 1> dims{3}; - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 7; ++i) { - for (int j = 0; j < 9; ++j) { - for (int k = 0; k < 11; ++k) { - for (int l = 0; l < 71; ++l) { - const float result = out(i,j,k,l); - const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + - input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - - hipFree(d_input); - hipFree(d_kernel); - hipFree(d_out); -} - -template<int DataLayout> -void test_hip_convolution_2d() -{ - Tensor<float, 4, DataLayout> input(74,37,11,137); - Tensor<float, 2, DataLayout> kernel(3,4); - Tensor<float, 4, DataLayout> out(74,35,8,137); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - hipMalloc((void**)(&d_input), input_bytes); - hipMalloc((void**)(&d_kernel), kernel_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); - Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4); - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137); - - Eigen::array<Eigen::DenseIndex, 2> dims{1,2}; - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 137; ++l) { - const float result = out(i,j,k,l); - const float expected = input(i,j+0,k+0,l) * kernel(0,0) + - input(i,j+1,k+0,l) * kernel(1,0) + - input(i,j+2,k+0,l) * kernel(2,0) + - input(i,j+0,k+1,l) * kernel(0,1) + - input(i,j+1,k+1,l) * kernel(1,1) + - input(i,j+2,k+1,l) * kernel(2,1) + - input(i,j+0,k+2,l) * kernel(0,2) + - input(i,j+1,k+2,l) * kernel(1,2) + - input(i,j+2,k+2,l) * kernel(2,2) + - input(i,j+0,k+3,l) * kernel(0,3) + - input(i,j+1,k+3,l) * kernel(1,3) + - input(i,j+2,k+3,l) * kernel(2,3); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - - hipFree(d_input); - hipFree(d_kernel); - hipFree(d_out); -} - -template<int DataLayout> -void test_hip_convolution_3d() -{ - Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>{74,37,11,137,17}); - Tensor<float, 3, DataLayout> kernel(3,4,2); - Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>{74,35,8,136,17}); - input = input.constant(10.0f) + input.random(); - kernel = kernel.constant(7.0f) + kernel.random(); - - std::size_t input_bytes = input.size() * sizeof(float); - std::size_t kernel_bytes = kernel.size() * sizeof(float); - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_input; - float* d_kernel; - float* d_out; - hipMalloc((void**)(&d_input), input_bytes); - hipMalloc((void**)(&d_kernel), kernel_bytes); - hipMalloc((void**)(&d_out), out_bytes); - - hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); - hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); - Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17); - - Eigen::array<Eigen::DenseIndex, 3> dims{1,2,3}; - gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 136; ++l) { - for (int m = 0; m < 17; ++m) { - const float result = out(i,j,k,l,m); - const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + - input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + - input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + - input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + - input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + - input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + - input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + - input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + - input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + - input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + - input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + - input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + - input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + - input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + - input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + - input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + - input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + - input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + - input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + - input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + - input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + - input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + - input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + - input(i,j+2,k+3,l+1,m) * kernel(2,3,1); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - } - - hipFree(d_input); - hipFree(d_kernel); - hipFree(d_out); -} - - -template <typename Scalar> -void test_hip_lgamma(const Scalar stddev) -{ - Tensor<Scalar, 2> in(72,97); - in.setRandom(); - in *= in.constant(stddev); - Tensor<Scalar, 2> out(72,97); - out.setZero(); - - std::size_t bytes = in.size() * sizeof(Scalar); - - Scalar* d_in; - Scalar* d_out; - hipMalloc((void**)(&d_in), bytes); - hipMalloc((void**)(&d_out), bytes); - - hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); - - gpu_out.device(gpu_device) = gpu_in.lgamma(); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 97; ++j) { - VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); - } - } - - hipFree(d_in); - hipFree(d_out); -} - -template <typename Scalar> -void test_hip_digamma() -{ - Tensor<Scalar, 1> in(7); - Tensor<Scalar, 1> out(7); - Tensor<Scalar, 1> expected_out(7); - out.setZero(); - - in(0) = Scalar(1); - in(1) = Scalar(1.5); - in(2) = Scalar(4); - in(3) = Scalar(-10.5); - in(4) = Scalar(10000.5); - in(5) = Scalar(0); - in(6) = Scalar(-1); - - expected_out(0) = Scalar(-0.5772156649015329); - expected_out(1) = Scalar(0.03648997397857645); - expected_out(2) = Scalar(1.2561176684318); - expected_out(3) = Scalar(2.398239129535781); - expected_out(4) = Scalar(9.210340372392849); - expected_out(5) = std::numeric_limits<Scalar>::infinity(); - expected_out(6) = std::numeric_limits<Scalar>::infinity(); - - std::size_t bytes = in.size() * sizeof(Scalar); - - Scalar* d_in; - Scalar* d_out; - hipMalloc((void**)(&d_in), bytes); - hipMalloc((void**)(&d_out), bytes); - - hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7); - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); - - gpu_out.device(gpu_device) = gpu_in.digamma(); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 5; ++i) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } - for (int i = 5; i < 7; ++i) { - VERIFY_IS_EQUAL(out(i), expected_out(i)); - } - - hipFree(d_in); - hipFree(d_out); -} - -template <typename Scalar> -void test_hip_zeta() -{ - Tensor<Scalar, 1> in_x(6); - Tensor<Scalar, 1> in_q(6); - Tensor<Scalar, 1> out(6); - Tensor<Scalar, 1> expected_out(6); - out.setZero(); - - in_x(0) = Scalar(1); - in_x(1) = Scalar(1.5); - in_x(2) = Scalar(4); - in_x(3) = Scalar(-10.5); - in_x(4) = Scalar(10000.5); - in_x(5) = Scalar(3); - - in_q(0) = Scalar(1.2345); - in_q(1) = Scalar(2); - in_q(2) = Scalar(1.5); - in_q(3) = Scalar(3); - in_q(4) = Scalar(1.0001); - in_q(5) = Scalar(-2.5); - - expected_out(0) = std::numeric_limits<Scalar>::infinity(); - expected_out(1) = Scalar(1.61237534869); - expected_out(2) = Scalar(0.234848505667); - expected_out(3) = Scalar(1.03086757337e-5); - expected_out(4) = Scalar(0.367879440865); - expected_out(5) = Scalar(0.054102025820864097); - - std::size_t bytes = in_x.size() * sizeof(Scalar); - - Scalar* d_in_x; - Scalar* d_in_q; - Scalar* d_out; - hipMalloc((void**)(&d_in_x), bytes); - hipMalloc((void**)(&d_in_q), bytes); - hipMalloc((void**)(&d_out), bytes); - - hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in_q, in_q.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6); - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6); - - gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY((std::isnan)(out(3))); - - for (int i = 1; i < 6; ++i) { - if (i != 3) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } - } - - hipFree(d_in_x); - hipFree(d_in_q); - hipFree(d_out); -} - -template <typename Scalar> -void test_hip_polygamma() -{ - Tensor<Scalar, 1> in_x(7); - Tensor<Scalar, 1> in_n(7); - Tensor<Scalar, 1> out(7); - Tensor<Scalar, 1> expected_out(7); - out.setZero(); - - in_n(0) = Scalar(1); - in_n(1) = Scalar(1); - in_n(2) = Scalar(1); - in_n(3) = Scalar(17); - in_n(4) = Scalar(31); - in_n(5) = Scalar(28); - in_n(6) = Scalar(8); - - in_x(0) = Scalar(2); - in_x(1) = Scalar(3); - in_x(2) = Scalar(25.5); - in_x(3) = Scalar(4.7); - in_x(4) = Scalar(11.8); - in_x(5) = Scalar(17.7); - in_x(6) = Scalar(30.2); - - expected_out(0) = Scalar(0.644934066848); - expected_out(1) = Scalar(0.394934066848); - expected_out(2) = Scalar(0.0399946696496); - expected_out(3) = Scalar(293.334565435); - expected_out(4) = Scalar(0.445487887616); - expected_out(5) = Scalar(-2.47810300902e-07); - expected_out(6) = Scalar(-8.29668781082e-09); - - std::size_t bytes = in_x.size() * sizeof(Scalar); - - Scalar* d_in_x; - Scalar* d_in_n; - Scalar* d_out; - hipMalloc((void**)(&d_in_x), bytes); - hipMalloc((void**)(&d_in_n), bytes); - hipMalloc((void**)(&d_out), bytes); - - hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in_n, in_n.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7); - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); - - gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 7; ++i) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } - - hipFree(d_in_x); - hipFree(d_in_n); - hipFree(d_out); -} - -template <typename Scalar> -void test_hip_igamma() -{ - Tensor<Scalar, 2> a(6, 6); - Tensor<Scalar, 2> x(6, 6); - Tensor<Scalar, 2> out(6, 6); - out.setZero(); - - Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; - Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; - - for (int i = 0; i < 6; ++i) { - for (int j = 0; j < 6; ++j) { - a(i, j) = a_s[i]; - x(i, j) = x_s[j]; - } - } - - Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); - Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, - {0.0, 0.6321205588285578, 0.7768698398515702, - 0.9816843611112658, 9.999500016666262e-05, 1.0}, - {0.0, 0.4275932955291202, 0.608374823728911, - 0.9539882943107686, 7.522076445089201e-07, 1.0}, - {0.0, 0.01898815687615381, 0.06564245437845008, - 0.5665298796332909, 4.166333347221828e-18, 1.0}, - {0.0, 0.9999780593618628, 0.9999899967080838, - 0.9999996219837988, 0.9991370418689945, 1.0}, - {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; - - - - std::size_t bytes = a.size() * sizeof(Scalar); - - Scalar* d_a; - Scalar* d_x; - Scalar* d_out; - assert(hipMalloc((void**)(&d_a), bytes) == hipSuccess); - assert(hipMalloc((void**)(&d_x), bytes) == hipSuccess); - assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess); - - hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice); - hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); - - gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 6; ++i) { - for (int j = 0; j < 6; ++j) { - if ((std::isnan)(igamma_s[i][j])) { - VERIFY((std::isnan)(out(i, j))); - } else { - VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); - } - } - } - - hipFree(d_a); - hipFree(d_x); - hipFree(d_out); -} - -template <typename Scalar> -void test_hip_igammac() -{ - Tensor<Scalar, 2> a(6, 6); - Tensor<Scalar, 2> x(6, 6); - Tensor<Scalar, 2> out(6, 6); - out.setZero(); - - Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; - Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; - - for (int i = 0; i < 6; ++i) { - for (int j = 0; j < 6; ++j) { - a(i, j) = a_s[i]; - x(i, j) = x_s[j]; - } - } - - Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); - Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, - {1.0, 0.36787944117144233, 0.22313016014842982, - 0.018315638888734182, 0.9999000049998333, 0.0}, - {1.0, 0.5724067044708798, 0.3916251762710878, - 0.04601170568923136, 0.9999992477923555, 0.0}, - {1.0, 0.9810118431238462, 0.9343575456215499, - 0.4334701203667089, 1.0, 0.0}, - {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, - 3.7801620118431334e-07, 0.0008629581310054535, - 0.0}, - {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; - - std::size_t bytes = a.size() * sizeof(Scalar); - - Scalar* d_a; - Scalar* d_x; - Scalar* d_out; - hipMalloc((void**)(&d_a), bytes); - hipMalloc((void**)(&d_x), bytes); - hipMalloc((void**)(&d_out), bytes); - - hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice); - hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); - - gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 6; ++i) { - for (int j = 0; j < 6; ++j) { - if ((std::isnan)(igammac_s[i][j])) { - VERIFY((std::isnan)(out(i, j))); - } else { - VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); - } - } - } - - hipFree(d_a); - hipFree(d_x); - hipFree(d_out); -} - -template <typename Scalar> -void test_hip_erf(const Scalar stddev) -{ - Tensor<Scalar, 2> in(72,97); - in.setRandom(); - in *= in.constant(stddev); - Tensor<Scalar, 2> out(72,97); - out.setZero(); - - std::size_t bytes = in.size() * sizeof(Scalar); - - Scalar* d_in; - Scalar* d_out; - assert(hipMalloc((void**)(&d_in), bytes) == hipSuccess); - assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess); - - hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); - - gpu_out.device(gpu_device) = gpu_in.erf(); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 97; ++j) { - VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); - } - } - - hipFree(d_in); - hipFree(d_out); -} - -template <typename Scalar> -void test_hip_erfc(const Scalar stddev) -{ - Tensor<Scalar, 2> in(72,97); - in.setRandom(); - in *= in.constant(stddev); - Tensor<Scalar, 2> out(72,97); - out.setZero(); - - std::size_t bytes = in.size() * sizeof(Scalar); - - Scalar* d_in; - Scalar* d_out; - hipMalloc((void**)(&d_in), bytes); - hipMalloc((void**)(&d_out), bytes); - - hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); - Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); - - gpu_out.device(gpu_device) = gpu_in.erfc(); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 97; ++j) { - VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); - } - } - - hipFree(d_in); - hipFree(d_out); -} - -template <typename Scalar> -void test_hip_betainc() -{ - Tensor<Scalar, 1> in_x(125); - Tensor<Scalar, 1> in_a(125); - Tensor<Scalar, 1> in_b(125); - Tensor<Scalar, 1> out(125); - Tensor<Scalar, 1> expected_out(125); - out.setZero(); - - Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); - - Array<Scalar, 1, Dynamic> x(125); - Array<Scalar, 1, Dynamic> a(125); - Array<Scalar, 1, Dynamic> b(125); - Array<Scalar, 1, Dynamic> v(125); - - a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, - 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, - 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, - 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, - 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, - 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999; - - b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, - 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, - 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, - 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, - 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, - 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, - 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, - 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, - 999.999, 999.999, 999.999; - - x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, - 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, - 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, - 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, - 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, - -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, - 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, - 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, - 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1; - - v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, - nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, - nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, - 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, - 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, - 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, - nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, - 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, - 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, - 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, - 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, - 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, - nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444, - nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229, - nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, - nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306, - 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302, - 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, - 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196, - 0.5000000000004947, 0.9999999999999999, nan; - - for (int i = 0; i < 125; ++i) { - in_x(i) = x(i); - in_a(i) = a(i); - in_b(i) = b(i); - expected_out(i) = v(i); - } - - std::size_t bytes = in_x.size() * sizeof(Scalar); - - Scalar* d_in_x; - Scalar* d_in_a; - Scalar* d_in_b; - Scalar* d_out; - hipMalloc((void**)(&d_in_x), bytes); - hipMalloc((void**)(&d_in_a), bytes); - hipMalloc((void**)(&d_in_b), bytes); - hipMalloc((void**)(&d_out), bytes); - - hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in_a, in_a.data(), bytes, hipMemcpyHostToDevice); - hipMemcpy(d_in_b, in_b.data(), bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125); - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125); - Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125); - - gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); - - assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - for (int i = 1; i < 125; ++i) { - if ((std::isnan)(expected_out(i))) { - VERIFY((std::isnan)(out(i))); - } else { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } - } - - hipFree(d_in_x); - hipFree(d_in_a); - hipFree(d_in_b); - hipFree(d_out); -} - - -void test_cxx11_tensor_hip() -{ - CALL_SUBTEST(test_hip_nullary()); - CALL_SUBTEST(test_hip_elementwise_small()); - CALL_SUBTEST(test_hip_elementwise()); - CALL_SUBTEST(test_hip_props()); - CALL_SUBTEST(test_hip_reduction()); - CALL_SUBTEST(test_hip_contraction<ColMajor>()); - CALL_SUBTEST(test_hip_contraction<RowMajor>()); - CALL_SUBTEST(test_hip_convolution_1d<ColMajor>()); - CALL_SUBTEST(test_hip_convolution_1d<RowMajor>()); - CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST(test_hip_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST(test_hip_convolution_2d<ColMajor>()); - CALL_SUBTEST(test_hip_convolution_2d<RowMajor>()); - // The following two tests commented out due to long runtime - // CALL_SUBTEST(test_hip_convolution_3d<ColMajor>()); - // CALL_SUBTEST(test_hip_convolution_3d<RowMajor>()); - -#if __cplusplus > 199711L - // std::erf, std::erfc, and so on where only added in c++11. We use them - // as a golden reference to validate the results produced by Eigen. Therefore - // we can only run these tests if we use a c++11 compiler. - - CALL_SUBTEST(test_hip_lgamma<float>(1.0f)); - CALL_SUBTEST(test_hip_lgamma<float>(100.0f)); - CALL_SUBTEST(test_hip_lgamma<float>(0.01f)); - CALL_SUBTEST(test_hip_lgamma<float>(0.001f)); - - CALL_SUBTEST(test_hip_lgamma<double>(1.0)); - CALL_SUBTEST(test_hip_lgamma<double>(100.0)); - CALL_SUBTEST(test_hip_lgamma<double>(0.01)); - CALL_SUBTEST(test_hip_lgamma<double>(0.001)); - - CALL_SUBTEST(test_hip_erf<float>(1.0f)); - CALL_SUBTEST(test_hip_erf<float>(100.0f)); - CALL_SUBTEST(test_hip_erf<float>(0.01f)); - CALL_SUBTEST(test_hip_erf<float>(0.001f)); - - CALL_SUBTEST(test_hip_erfc<float>(1.0f)); - // CALL_SUBTEST(test_hip_erfc<float>(100.0f)); // HIP erfc lacks precision for large inputs - CALL_SUBTEST(test_hip_erfc<float>(5.0f)); - CALL_SUBTEST(test_hip_erfc<float>(0.01f)); - CALL_SUBTEST(test_hip_erfc<float>(0.001f)); - - CALL_SUBTEST(test_hip_erf<double>(1.0)); - CALL_SUBTEST(test_hip_erf<double>(100.0)); - CALL_SUBTEST(test_hip_erf<double>(0.01)); - CALL_SUBTEST(test_hip_erf<double>(0.001)); - - CALL_SUBTEST(test_hip_erfc<double>(1.0)); - // CALL_SUBTEST(test_hip_erfc<double>(100.0)); // HIP erfc lacks precision for large inputs - CALL_SUBTEST(test_hip_erfc<double>(5.0)); - CALL_SUBTEST(test_hip_erfc<double>(0.01)); - CALL_SUBTEST(test_hip_erfc<double>(0.001)); - - // Following tests have functional failures on some seeds - // CALL_SUBTEST(test_hip_digamma<float>()); - // CALL_SUBTEST(test_hip_digamma<double>()); - - // Following tests have functional failures on some seeds - // CALL_SUBTEST(test_hip_polygamma<float>()); - // CALL_SUBTEST(test_hip_polygamma<double>()); - - // Following tests have functional failures on some seeds - // CALL_SUBTEST(test_hip_zeta<float>()); - // CALL_SUBTEST(test_hip_zeta<double>()); - - CALL_SUBTEST(test_hip_igamma<float>()); - CALL_SUBTEST(test_hip_igammac<float>()); - - CALL_SUBTEST(test_hip_igamma<double>()); - CALL_SUBTEST(test_hip_igammac<double>()); - - CALL_SUBTEST(test_hip_betainc<float>()); - CALL_SUBTEST(test_hip_betainc<double>()); -#endif -} diff --git a/unsupported/test/cxx11_tensor_of_float16_hip.cu b/unsupported/test/cxx11_tensor_of_float16_hip.cu deleted file mode 100644 index 6c1a401bf..000000000 --- a/unsupported/test/cxx11_tensor_of_float16_hip.cu +++ /dev/null @@ -1,498 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - - -using Eigen::Tensor; - -template<typename> -void test_hip_numext() { - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int num_elem = 101; - - float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); - bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( - d_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half( - d_res_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float( - d_res_float, num_elem); - - gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); - gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>()); - gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>()); - - Tensor<bool, 1> half_prec(num_elem); - Tensor<bool, 1> full_prec(num_elem); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool)); - gpu_device.synchronize(); - - for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking numext " << i << std::endl; - VERIFY_IS_EQUAL(full_prec(i), half_prec(i)); - } - - gpu_device.deallocate(d_float); - gpu_device.deallocate(d_res_half); - gpu_device.deallocate(d_res_float); -} - - -#ifdef EIGEN_HAS_HIP_FP16 - -template<typename> -void test_hip_conversion() { - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int num_elem = 101; - - float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( - d_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half( - d_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv( - d_conv, num_elem); - - gpu_float.device(gpu_device) = gpu_float.random(); - gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>(); - gpu_conv.device(gpu_device) = gpu_half.cast<float>(); - - Tensor<float, 1> initial(num_elem); - Tensor<float, 1> final(num_elem); - gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); - - for (int i = 0; i < num_elem; ++i) { - VERIFY_IS_APPROX(initial(i), final(i)); - } - - gpu_device.deallocate(d_float); - gpu_device.deallocate(d_half); - gpu_device.deallocate(d_conv); -} - -template<typename> -void test_hip_unary() { - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int num_elem = 101; - - float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( - d_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( - d_res_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( - d_res_float, num_elem); - - gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); - gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>(); - - Tensor<float, 1> half_prec(num_elem); - Tensor<float, 1> full_prec(num_elem); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); - gpu_device.synchronize(); - - for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking unary " << i << std::endl; - VERIFY_IS_APPROX(full_prec(i), half_prec(i)); - } - - gpu_device.deallocate(d_float); - gpu_device.deallocate(d_res_half); - gpu_device.deallocate(d_res_float); -} - -template<typename> -void test_hip_elementwise() { - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int num_elem = 101; - - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1( - d_float1, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2( - d_float2, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( - d_res_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( - d_res_float, num_elem); - - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); - gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; - gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>(); - - Tensor<float, 1> half_prec(num_elem); - Tensor<float, 1> full_prec(num_elem); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); - gpu_device.synchronize(); - - for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl; - VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i))); - } - - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); - gpu_device.deallocate(d_res_half); - gpu_device.deallocate(d_res_float); -} - -template<typename> -void test_hip_trancendental() { - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int num_elem = 101; - - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem); - - gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); - gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); - gpu_float3.device(gpu_device) = gpu_float3.random(); - gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>(); - gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>(); - gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>(); - gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>(); - - gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>(); - gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); - - gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>(); - gpu_res2_half.device(gpu_device) = gpu_res2_half.log(); - - gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); - gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); - - gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); - gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1(); - - Tensor<float, 1> input1(num_elem); - Tensor<Eigen::half, 1> half_prec1(num_elem); - Tensor<Eigen::half, 1> full_prec1(num_elem); - Tensor<float, 1> input2(num_elem); - Tensor<Eigen::half, 1> half_prec2(num_elem); - Tensor<Eigen::half, 1> full_prec2(num_elem); - Tensor<float, 1> input3(num_elem); - Tensor<Eigen::half, 1> half_prec3(num_elem); - Tensor<Eigen::half, 1> full_prec3(num_elem); - gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half)); - gpu_device.synchronize(); - - for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl; - VERIFY_IS_APPROX(full_prec1(i), half_prec1(i)); - } - for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl; - if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1 - VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f)); - else - VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); - } - for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl; - VERIFY_IS_APPROX(full_prec3(i), half_prec3(i)); - } - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); - gpu_device.deallocate(d_float3); - gpu_device.deallocate(d_res1_half); - gpu_device.deallocate(d_res1_float); - gpu_device.deallocate(d_res2_half); - gpu_device.deallocate(d_res2_float); - gpu_device.deallocate(d_res3_float); - gpu_device.deallocate(d_res3_half); -} - -template<typename> -void test_hip_contractions() { - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int rows = 23; - int cols = 23; - int num_elem = rows*cols; - - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( - d_float1, rows, cols); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( - d_float2, rows, cols); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half( - d_res_half, rows, cols); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float( - d_res_float, rows, cols); - - gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); - gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f); - - typedef Tensor<float, 2>::DimensionPair DimPair; - Eigen::array<DimPair, 1> dims(DimPair(1, 0)); - gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims); - - Tensor<Eigen::half, 2> half_prec(rows, cols); - Tensor<Eigen::half, 2> full_prec(rows, cols); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half)); - gpu_device.synchronize(); - - for (int i = 0; i < rows; ++i) { - for (int j = 0; j < cols; ++j) { - std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl; - if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) { - VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); - } - } - } - - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); - gpu_device.deallocate(d_res_half); - gpu_device.deallocate(d_res_float); -} - -template<typename> -void test_hip_reductions(int size1, int size2, int redux) { - - std::cout << "Reducing " << size1 << " by " << size2 - << " tensor along dim " << redux << std::endl; - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int num_elem = size1*size2; - int result_size = (redux == 1 ? size1 : size2); - - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); - Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); - - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( - d_float1, size1, size2); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( - d_float2, size1, size2); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half( - d_res_half, result_size); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float( - d_res_float, result_size); - - gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f; - gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f; - - Eigen::array<int, 1> redux_dim(redux); - gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim); - - Tensor<Eigen::half, 1> half_prec(result_size); - Tensor<Eigen::half, 1> full_prec(result_size); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half)); - gpu_device.synchronize(); - - for (int i = 0; i < result_size; ++i) { - std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl; - VERIFY_IS_APPROX(full_prec(i), half_prec(i)); - } - - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); - gpu_device.deallocate(d_res_half); - gpu_device.deallocate(d_res_float); -} - -template<typename> -void test_hip_reductions() { - test_hip_reductions<void>(13, 13, 0); - test_hip_reductions<void>(13, 13, 1); - - test_hip_reductions<void>(35, 36, 0); - test_hip_reductions<void>(35, 36, 1); - - test_hip_reductions<void>(36, 35, 0); - test_hip_reductions<void>(36, 35, 1); -} - -template<typename> -void test_hip_full_reductions() { - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int size = 13; - int num_elem = size*size; - - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); - Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); - - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( - d_float1, size, size); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( - d_float2, size, size); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half( - d_res_half); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float( - d_res_float); - - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); - - gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(); - - Tensor<Eigen::half, 0> half_prec; - Tensor<Eigen::half, 0> full_prec; - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); - gpu_device.synchronize(); - - VERIFY_IS_APPROX(full_prec(), half_prec()); - - gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum(); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); - gpu_device.synchronize(); - - VERIFY_IS_APPROX(full_prec(), half_prec()); - - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); - gpu_device.deallocate(d_res_half); - gpu_device.deallocate(d_res_float); -} - -template<typename> -void test_hip_forced_evals() { - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - int num_elem = 101; - - float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( - d_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1( - d_res_half1, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2( - d_res_half2, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( - d_res_float, num_elem); - - Eigen::array<int, 1> no_bcast; - no_bcast[0] = 1; - - gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); - gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>(); - gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>(); - - Tensor<float, 1> half_prec1(num_elem); - Tensor<float, 1> half_prec2(num_elem); - Tensor<float, 1> full_prec(num_elem); - gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); - gpu_device.synchronize(); - - for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl; - VERIFY_IS_APPROX(full_prec(i), half_prec1(i)); - VERIFY_IS_APPROX(full_prec(i), half_prec2(i)); - } - - gpu_device.deallocate(d_float); - gpu_device.deallocate(d_res_half1); - gpu_device.deallocate(d_res_half2); - gpu_device.deallocate(d_res_float); -} -#endif - - -void test_cxx11_tensor_of_float16_hip() -{ - CALL_SUBTEST(test_hip_numext<void>()); - -#ifdef EIGEN_HAS_HIP_FP16 - CALL_SUBTEST(test_hip_conversion<void>()); - CALL_SUBTEST(test_hip_unary<void>()); - CALL_SUBTEST(test_hip_elementwise<void>()); - CALL_SUBTEST(test_hip_trancendental<void>()); - CALL_SUBTEST(test_hip_contractions<void>()); - CALL_SUBTEST(test_hip_reductions<void>()); - CALL_SUBTEST(test_hip_full_reductions<void>()); - CALL_SUBTEST(test_hip_forced_evals<void>()); -#else - std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl; -#endif -} diff --git a/unsupported/test/cxx11_tensor_random_hip.cu b/unsupported/test/cxx11_tensor_random_hip.cu deleted file mode 100644 index 7d7e72ca2..000000000 --- a/unsupported/test/cxx11_tensor_random_hip.cu +++ /dev/null @@ -1,85 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_random_hip -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - -#include "main.h" -#include <Eigen/CXX11/Tensor> - - -void test_hip_random_uniform() -{ - Tensor<float, 2> out(72,97); - out.setZero(); - - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_out; - hipMalloc((void**)(&d_out), out_bytes); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); - - gpu_out.device(gpu_device) = gpu_out.random(); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); - - // For now we just check thes code doesn't crash. - // TODO: come up with a valid test of randomness -} - - -void test_hip_random_normal() -{ - Tensor<float, 2> out(72,97); - out.setZero(); - - std::size_t out_bytes = out.size() * sizeof(float); - - float* d_out; - hipMalloc((void**)(&d_out), out_bytes); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); - - Eigen::internal::NormalRandomGenerator<float> gen(true); - gpu_out.device(gpu_device) = gpu_out.random(gen); - - assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); - assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); -} - -static void test_complex() -{ - Tensor<std::complex<float>, 1> vec(6); - vec.setRandom(); - - // Fixme: we should check that the generated numbers follow a uniform - // distribution instead. - for (int i = 1; i < 6; ++i) { - VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); - } -} - - -void test_cxx11_tensor_random_hip() -{ - CALL_SUBTEST(test_hip_random_uniform()); - CALL_SUBTEST(test_hip_random_normal()); - CALL_SUBTEST(test_complex()); -} diff --git a/unsupported/test/cxx11_tensor_reduction_hip.cu b/unsupported/test/cxx11_tensor_reduction_hip.cu deleted file mode 100644 index c5aad05be..000000000 --- a/unsupported/test/cxx11_tensor_reduction_hip.cu +++ /dev/null @@ -1,154 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip -#define EIGEN_USE_GPU - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - - -template<typename Type, int DataLayout> -static void test_full_reductions() { - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - const int num_rows = internal::random<int>(1024, 5*1024); - const int num_cols = internal::random<int>(1024, 5*1024); - - Tensor<Type, 2, DataLayout> in(num_rows, num_cols); - in.setRandom(); - - Tensor<Type, 0, DataLayout> full_redux; - full_redux = in.sum(); - - std::size_t in_bytes = in.size() * sizeof(Type); - std::size_t out_bytes = full_redux.size() * sizeof(Type); - Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes)); - Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes)); - gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); - - TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols); - TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr); - - out_gpu.device(gpu_device) = in_gpu.sum(); - - Tensor<Type, 0, DataLayout> full_redux_gpu; - gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); - gpu_device.synchronize(); - - // Check that the CPU and GPU reductions return the same result. - VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); - - gpu_device.deallocate(gpu_in_ptr); - gpu_device.deallocate(gpu_out_ptr); -} - -template<typename Type, int DataLayout> -static void test_first_dim_reductions() { - int dim_x = 33; - int dim_y = 1; - int dim_z = 128; - - Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z); - in.setRandom(); - - Eigen::array<int, 1> red_axis; - red_axis[0] = 0; - Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); - - // Create device - Eigen::HipStreamDevice stream; - Eigen::GpuDevice dev(&stream); - - // Create data(T) - Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type)); - Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type)); - Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z); - Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z); - - // Perform operation - dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type)); - gpu_out.device(dev) = gpu_in.sum(red_axis); - gpu_out.device(dev) += gpu_in.sum(red_axis); - Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z); - dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type)); - dev.synchronize(); - - // Check that the CPU and GPU reductions return the same result. - for (int i = 0; i < gpu_out.size(); ++i) { - VERIFY_IS_APPROX(2*redux(i), redux_gpu(i)); - } - - dev.deallocate(in_data); - dev.deallocate(out_data); -} - -template<typename Type, int DataLayout> -static void test_last_dim_reductions() { - int dim_x = 128; - int dim_y = 1; - int dim_z = 33; - - Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z); - in.setRandom(); - - Eigen::array<int, 1> red_axis; - red_axis[0] = 2; - Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); - - // Create device - Eigen::HipStreamDevice stream; - Eigen::GpuDevice dev(&stream); - - // Create data - Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type)); - Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type)); - Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z); - Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y); - - // Perform operation - dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type)); - gpu_out.device(dev) = gpu_in.sum(red_axis); - gpu_out.device(dev) += gpu_in.sum(red_axis); - Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y); - dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type)); - dev.synchronize(); - - // Check that the CPU and GPU reductions return the same result. - for (int i = 0; i < gpu_out.size(); ++i) { - VERIFY_IS_APPROX(2*redux(i), redux_gpu(i)); - } - - dev.deallocate(in_data); - dev.deallocate(out_data); -} - - -void test_cxx11_tensor_reduction_hip() { - CALL_SUBTEST((test_full_reductions<float, ColMajor>())); - CALL_SUBTEST((test_full_reductions<double, ColMajor>())); - CALL_SUBTEST((test_full_reductions<float, RowMajor>())); - CALL_SUBTEST((test_full_reductions<double, RowMajor>())); - - CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>())); - CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>())); - CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>())); -// Outer reductions of doubles aren't supported just yet. -// CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>())) - - CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>())); -// Outer reductions of doubles aren't supported just yet. -// CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>())); - CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>())); - CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>())); -} diff --git a/unsupported/test/cxx11_tensor_scan_hip.cu b/unsupported/test/cxx11_tensor_scan_hip.cu deleted file mode 100644 index f4c4f59b9..000000000 --- a/unsupported/test/cxx11_tensor_scan_hip.cu +++ /dev/null @@ -1,76 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#define EIGEN_USE_GPU - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::Tensor; -typedef Tensor<float, 1>::DimensionPair DimPair; - -template<int DataLayout> -void test_hip_cumsum(int m_size, int k_size, int n_size) -{ - std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; - Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); - Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size); - Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size); - - t_input.setRandom(); - - std::size_t t_input_bytes = t_input.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size() * sizeof(float); - - float* d_t_input; - float* d_t_result; - - hipMalloc((void**)(&d_t_input), t_input_bytes); - hipMalloc((void**)(&d_t_result), t_result_bytes); - - hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice); - - Eigen::HipStreamDevice stream; - Eigen::GpuDevice gpu_device(&stream); - - Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > - gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size)); - Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > - gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size)); - - gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); - t_result = t_input.cumsum(1); - - hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost); - for (DenseIndex i = 0; i < t_result.size(); i++) { - if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { - continue; - } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { - continue; - } - std::cout << "mismatch detected at index " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); - } - - hipFree((void*)d_t_input); - hipFree((void*)d_t_result); -} - - -void test_cxx11_tensor_scan_hip() -{ - CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128)); - CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128)); -} |