From 7e41c8f1a98c2a3beed667dca416ea8d20ad373e Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 20 Jun 2018 12:52:30 -0400
Subject: renaming *Cuda files to *Gpu in the unsupported/Eigen/CXX11/src/Tensor
 and unsupported/test directories
---
 unsupported/test/cxx11_tensor_gpu.cu | 1557 ++++++++++++++++++++++++++++++++++
 1 file changed, 1557 insertions(+)
 create mode 100644 unsupported/test/cxx11_tensor_gpu.cu

diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
new file mode 100644
index 000000000..f238ed5be
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -0,0 +1,1557 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_nullary() {
+  Tensor<float, 1, 0, int> in1(2);
+  Tensor<float, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  cudaMalloc((void**)(&d_in1), tensor_bytes);
+  cudaMalloc((void**)(&d_in2), tensor_bytes);
+  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+  gpu_in2.device(gpu_device) = gpu_in2.random();
+
+  Tensor<float, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), 3.14f);
+    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+void test_cuda_elementwise_small() {
+  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
+      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
+      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream())
== cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX( + out(Eigen::array(i)), + in1(Eigen::array(i)) + in2(Eigen::array(i))); + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); +} + +void test_cuda_elementwise() +{ + Tensor in1(Eigen::array(72,53,97)); + Tensor in2(Eigen::array(72,53,97)); + Tensor in3(Eigen::array(72,53,97)); + Tensor out(Eigen::array(72,53,97)); + in1.setRandom(); + in2.setRandom(); + in3.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t in3_bytes = in3.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_in3; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_in3), in3_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_in3(d_in3, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(72,53,97)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * in3(Eigen::array(i,j,k))); + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); +} + +void test_cuda_props() { + Tensor in1(200); + Tensor out(200); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(bool); + + float* d_in1; + bool* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap, Eigen::Aligned> gpu_in1( + d_in1, 200); + Eigen::TensorMap, Eigen::Aligned> gpu_out( + d_out, 200); + + gpu_out.device(gpu_device) = (gpu_in1.isnan)(); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 200; ++i) { + VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); + } + + cudaFree(d_in1); + cudaFree(d_out); +} + +void test_cuda_reduction() +{ + Tensor in1(72,53,97,113); + Tensor out(72,97); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in1(d_in1, 
72,53,97,113); + Eigen::TensorMap > gpu_out(d_out, 72,97); + + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + float expected = 0; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 113; ++l) { + expected = + std::max(expected, in1(i, k, j, l)); + } + } + VERIFY_IS_APPROX(out(i,j), expected); + } + } + + cudaFree(d_in1); + cudaFree(d_out); +} + +template +void test_cuda_contraction() +{ + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor t_left(6, 50, 3, 31); + Tensor t_right(Eigen::array(3, 31, 7, 20, 1)); + Tensor t_result(Eigen::array(6, 50, 7, 20, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_t_left(d_t_left, 6, 50, 3, 31); + Eigen::TensorMap > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); + Eigen::TensorMap > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); + + typedef Eigen::Map > MapXf; + MapXf m_left(t_left.data(), 300, 93); + MapXf m_right(t_right.data(), 93, 140); + Eigen::Matrix m_result(300, 140); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); + + m_result = m_left * m_right; + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + + cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + + for (DenseIndex i = 0; i < t_result.size(); i++) { + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + cudaFree(d_t_left); + cudaFree(d_t_right); + cudaFree(d_t_result); +} + +template +void test_cuda_convolution_1d() +{ + Tensor input(74,37,11,137); + Tensor kernel(4); + Tensor out(74,34,11,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + 
Eigen::TensorMap > gpu_input(d_input, 74,37,11,137); + Eigen::TensorMap > gpu_kernel(d_kernel, 4); + Eigen::TensorMap > gpu_out(d_out, 74,34,11,137); + + Eigen::array dims(1); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 34; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + + input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); +} + +void test_cuda_convolution_inner_dim_col_major_1d() +{ + Tensor input(74,9,11,7); + Tensor kernel(4); + Tensor out(71,9,11,7); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input,74,9,11,7); + Eigen::TensorMap > gpu_kernel(d_kernel,4); + Eigen::TensorMap > gpu_out(d_out,71,9,11,7); + + Eigen::array dims(0); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 71; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 7; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + + input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); +} + +void test_cuda_convolution_inner_dim_row_major_1d() +{ + Tensor input(7,9,11,74); + Tensor kernel(4); + Tensor out(7,9,11,71); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input, 7,9,11,74); + Eigen::TensorMap > gpu_kernel(d_kernel, 4); + Eigen::TensorMap > gpu_out(d_out, 7,9,11,71); + + 
Eigen::array dims(3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 71; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + + input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); +} + +template +void test_cuda_convolution_2d() +{ + Tensor input(74,37,11,137); + Tensor kernel(3,4); + Tensor out(74,35,8,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input,74,37,11,137); + Eigen::TensorMap > gpu_kernel(d_kernel,3,4); + Eigen::TensorMap > gpu_out(d_out,74,35,8,137); + + Eigen::array dims(1,2); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+2,k+0,l) * kernel(2,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1) + + input(i,j+2,k+1,l) * kernel(2,1) + + input(i,j+0,k+2,l) * kernel(0,2) + + input(i,j+1,k+2,l) * kernel(1,2) + + input(i,j+2,k+2,l) * kernel(2,2) + + input(i,j+0,k+3,l) * kernel(0,3) + + input(i,j+1,k+3,l) * kernel(1,3) + + input(i,j+2,k+3,l) * kernel(2,3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); +} + +template +void test_cuda_convolution_3d() +{ + Tensor input(Eigen::array(74,37,11,137,17)); + Tensor kernel(3,4,2); + Tensor out(Eigen::array(74,35,8,136,17)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice 
stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input,74,37,11,137,17); + Eigen::TensorMap > gpu_kernel(d_kernel,3,4,2); + Eigen::TensorMap > gpu_out(d_out,74,35,8,136,17); + + Eigen::array dims(1,2,3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 17; ++m) { + const float result = out(i,j,k,l,m); + const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + + input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + + input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + + input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + + input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + + input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + + input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + + input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + + input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + + input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + + input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + + input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + + input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + + input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + + input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + + input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + + input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + + input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + + input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + + input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + + input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + + input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + + input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + + input(i,j+2,k+3,l+1,m) * kernel(2,3,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); +} + + +template +void test_cuda_lgamma(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.lgamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template +void test_cuda_digamma() +{ + Tensor in(7); + Tensor out(7); + Tensor expected_out(7); + out.setZero(); + + in(0) = Scalar(1); + in(1) = Scalar(1.5); + in(2) = Scalar(4); + in(3) = Scalar(-10.5); + in(4) = Scalar(10000.5); + in(5) = Scalar(0); + in(6) = Scalar(-1); + + expected_out(0) = Scalar(-0.5772156649015329); + expected_out(1) = Scalar(0.03648997397857645); + expected_out(2) = Scalar(1.2561176684318); + expected_out(3) = Scalar(2.398239129535781); + expected_out(4) = Scalar(9.210340372392849); + expected_out(5) = std::numeric_limits::infinity(); + expected_out(6) = std::numeric_limits::infinity(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* 
d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 7); + Eigen::TensorMap > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in.digamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 5; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + for (int i = 5; i < 7; ++i) { + VERIFY_IS_EQUAL(out(i), expected_out(i)); + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template +void test_cuda_zeta() +{ + Tensor in_x(6); + Tensor in_q(6); + Tensor out(6); + Tensor expected_out(6); + out.setZero(); + + in_x(0) = Scalar(1); + in_x(1) = Scalar(1.5); + in_x(2) = Scalar(4); + in_x(3) = Scalar(-10.5); + in_x(4) = Scalar(10000.5); + in_x(5) = Scalar(3); + + in_q(0) = Scalar(1.2345); + in_q(1) = Scalar(2); + in_q(2) = Scalar(1.5); + in_q(3) = Scalar(3); + in_q(4) = Scalar(1.0001); + in_q(5) = Scalar(-2.5); + + expected_out(0) = std::numeric_limits::infinity(); + expected_out(1) = Scalar(1.61237534869); + expected_out(2) = Scalar(0.234848505667); + expected_out(3) = Scalar(1.03086757337e-5); + expected_out(4) = Scalar(0.367879440865); + expected_out(5) = Scalar(0.054102025820864097); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_q; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_q), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in_x(d_in_x, 6); + Eigen::TensorMap > gpu_in_q(d_in_q, 6); + Eigen::TensorMap > gpu_out(d_out, 6); + + gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + VERIFY_IS_EQUAL(out(0), expected_out(0)); + VERIFY((std::isnan)(out(3))); + + for (int i = 1; i < 6; ++i) { + if (i != 3) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } + + cudaFree(d_in_x); + cudaFree(d_in_q); + cudaFree(d_out); +} + +template +void test_cuda_polygamma() +{ + Tensor in_x(7); + Tensor in_n(7); + Tensor out(7); + Tensor expected_out(7); + out.setZero(); + + in_n(0) = Scalar(1); + in_n(1) = Scalar(1); + in_n(2) = Scalar(1); + in_n(3) = Scalar(17); + in_n(4) = Scalar(31); + in_n(5) = Scalar(28); + in_n(6) = Scalar(8); + + in_x(0) = Scalar(2); + in_x(1) = Scalar(3); + in_x(2) = Scalar(25.5); + in_x(3) = Scalar(4.7); + in_x(4) = Scalar(11.8); + in_x(5) = Scalar(17.7); + in_x(6) = Scalar(30.2); + + expected_out(0) = Scalar(0.644934066848); + expected_out(1) = Scalar(0.394934066848); + expected_out(2) = Scalar(0.0399946696496); + expected_out(3) = Scalar(293.334565435); + expected_out(4) = Scalar(0.445487887616); + expected_out(5) = Scalar(-2.47810300902e-07); + expected_out(6) = Scalar(-8.29668781082e-09); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_n; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_n), bytes); + cudaMalloc((void**)(&d_out), bytes); + + 
cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in_x(d_in_x, 7); + Eigen::TensorMap > gpu_in_n(d_in_n, 7); + Eigen::TensorMap > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 7; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + cudaFree(d_in_x); + cudaFree(d_in_n); + cudaFree(d_out); +} + +template +void test_cuda_igamma() +{ + Tensor a(6, 6); + Tensor x(6, 6); + Tensor out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + + + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 6, 6); + Eigen::TensorMap > gpu_x(d_x, 6, 6); + Eigen::TensorMap > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); + } + } + } + + cudaFree(d_a); + cudaFree(d_x); + cudaFree(d_out); +} + +template +void test_cuda_igammac() +{ + Tensor a(6, 6); + Tensor x(6, 6); + Tensor out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = std::numeric_limits::quiet_NaN(); + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 
0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 6, 6); + Eigen::TensorMap > gpu_x(d_x, 6, 6); + Eigen::TensorMap > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); + } + } + } + + cudaFree(d_a); + cudaFree(d_x); + cudaFree(d_out); +} + +template +void test_cuda_erf(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erf(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template +void test_cuda_erfc(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erfc(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template +void test_cuda_betainc() +{ + Tensor in_x(125); + Tensor in_a(125); + Tensor in_b(125); + Tensor out(125); + Tensor expected_out(125); + out.setZero(); + + Scalar nan = std::numeric_limits::quiet_NaN(); + + Array x(125); + Array a(125); + Array b(125); + Array v(125); + + a << 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999; + + b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999; + + x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, + -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 
1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1; + + v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, + 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, + 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, + 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, + nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, + 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, + 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, + 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, + 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, + 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, + nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444, + nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229, + nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, + nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306, + 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302, + 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, + 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196, + 0.5000000000004947, 0.9999999999999999, nan; + + for (int i = 0; i < 125; ++i) { + in_x(i) = x(i); + in_a(i) = a(i); + in_b(i) = b(i); + expected_out(i) = v(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_a; + Scalar* d_in_b; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_a), bytes); + cudaMalloc((void**)(&d_in_b), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in_x(d_in_x, 125); + Eigen::TensorMap > gpu_in_a(d_in_a, 125); + Eigen::TensorMap > gpu_in_b(d_in_b, 125); + Eigen::TensorMap > gpu_out(d_out, 125); + + gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 1; i < 125; ++i) { + if ((std::isnan)(expected_out(i))) { + VERIFY((std::isnan)(out(i))); + } else { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } + + cudaFree(d_in_x); + cudaFree(d_in_a); + cudaFree(d_in_b); + cudaFree(d_out); +} + +template +void test_cuda_i0e() +{ + Tensor in_x(21); + Tensor out(21); + Tensor expected_out(21); + out.setZero(); + + Array in_x_array(21); + Array expected_out_array(21); + + in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, + -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361, + 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857, + 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554, + 0.207001921224, 
0.16665743264, 0.143431781857, 0.127833337163, + 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128, + 0.0897803118848; + + for (int i = 0; i < 21; ++i) { + in_x(i) = in_x_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in_x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 21); + Eigen::TensorMap > gpu_out(d_out, 21); + + gpu_out.device(gpu_device) = gpu_in.i0e(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 21; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template +void test_cuda_i1e() +{ + Tensor in_x(21); + Tensor out(21); + Tensor expected_out(21); + out.setZero(); + + Array in_x_array(21); + Array expected_out_array(21); + + in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, + -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565, + -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293, + -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249, + 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384, + 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872, + 0.0875062221833; + + for (int i = 0; i < 21; ++i) { + in_x(i) = in_x_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in_x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 21); + Eigen::TensorMap > gpu_out(d_out, 21); + + gpu_out.device(gpu_device) = gpu_in.i1e(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 21; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template +void test_cuda_igamma_der_a() +{ + Tensor in_x(30); + Tensor in_a(30); + Tensor out(30); + Tensor expected_out(30); + out.setZero(); + + Array in_a_array(30); + Array in_x_array(30); + Array expected_out_array(30); + + // See special_functions.cpp for the Python code that generates the test data. 
+ + in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, + 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0, + 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065, + 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288, + 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458, + 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233, + 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677, + 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263, + -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645, + -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314, + -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921, + -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446, + -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363, + -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042, + -0.0125850719397, -0.00455500206958, -0.00476436993148; + + for (int i = 0; i < 30; ++i) { + in_x(i) = in_x_array(i); + in_a(i) = in_a_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, in_a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_a(d_a, 30); + Eigen::TensorMap > gpu_x(d_x, 30); + Eigen::TensorMap > gpu_out(d_out, 30); + + gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 30; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + cudaFree(d_a); + cudaFree(d_x); + cudaFree(d_out); +} + +template +void test_cuda_gamma_sample_der_alpha() +{ + Tensor in_alpha(30); + Tensor in_sample(30); + Tensor out(30); + Tensor expected_out(30); + out.setZero(); + + Array in_alpha_array(30); + Array in_sample_array(30); + Array expected_out_array(30); + + // See special_functions.cpp for the Python code that generates the test data. 
+ + in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, + 1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, + 100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065, + 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288, + 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458, + 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233, + 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677, + 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738, + 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243, + 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302, + 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534, + 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812, + 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061, + 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206, + 1.00106492525, 0.97734200649, 1.02198794179; + + for (int i = 0; i < 30; ++i) { + in_alpha(i) = in_alpha_array(i); + in_sample(i) = in_sample_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_alpha.size() * sizeof(Scalar); + + Scalar* d_alpha; + Scalar* d_sample; + Scalar* d_out; + cudaMalloc((void**)(&d_alpha), bytes); + cudaMalloc((void**)(&d_sample), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_alpha, in_alpha.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_sample, in_sample.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_alpha(d_alpha, 30); + Eigen::TensorMap > gpu_sample(d_sample, 30); + Eigen::TensorMap > gpu_out(d_out, 30); + + gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 30; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + cudaFree(d_alpha); + cudaFree(d_sample); + cudaFree(d_out); +} + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST_1(test_cuda_nullary()); + CALL_SUBTEST_1(test_cuda_elementwise_small()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_1(test_cuda_props()); + CALL_SUBTEST_1(test_cuda_reduction()); + CALL_SUBTEST_2(test_cuda_contraction()); + CALL_SUBTEST_2(test_cuda_contraction()); + CALL_SUBTEST_3(test_cuda_convolution_1d()); + CALL_SUBTEST_3(test_cuda_convolution_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_2d()); + CALL_SUBTEST_3(test_cuda_convolution_2d()); + CALL_SUBTEST_3(test_cuda_convolution_3d()); + CALL_SUBTEST_3(test_cuda_convolution_3d()); + +#if __cplusplus > 199711L + // std::erf, std::erfc, and so on where only added in c++11. We use them + // as a golden reference to validate the results produced by Eigen. Therefore + // we can only run these tests if we use a c++11 compiler. 
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
+
+  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
+  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f));  // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
+
+  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
+  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0));  // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
+
+  CALL_SUBTEST_5(test_cuda_digamma<float>());
+  CALL_SUBTEST_5(test_cuda_digamma<double>());
+
+  CALL_SUBTEST_5(test_cuda_polygamma<float>());
+  CALL_SUBTEST_5(test_cuda_polygamma<double>());
+
+  CALL_SUBTEST_5(test_cuda_zeta<float>());
+  CALL_SUBTEST_5(test_cuda_zeta<double>());
+
+  CALL_SUBTEST_5(test_cuda_igamma<float>());
+  CALL_SUBTEST_5(test_cuda_igammac<float>());
+
+  CALL_SUBTEST_5(test_cuda_igamma<double>());
+  CALL_SUBTEST_5(test_cuda_igammac<double>());
+
+  CALL_SUBTEST_6(test_cuda_betainc<float>());
+  CALL_SUBTEST_6(test_cuda_betainc<double>());
+
+  CALL_SUBTEST_6(test_cuda_i0e<float>());
+  CALL_SUBTEST_6(test_cuda_i0e<double>());
+
+  CALL_SUBTEST_6(test_cuda_i1e<float>());
+  CALL_SUBTEST_6(test_cuda_i1e<double>());
+
+  CALL_SUBTEST_6(test_cuda_igamma_der_a<float>());
+  CALL_SUBTEST_6(test_cuda_igamma_der_a<double>());
+
+  CALL_SUBTEST_6(test_cuda_gamma_sample_der_alpha<float>());
+  CALL_SUBTEST_6(test_cuda_gamma_sample_der_alpha<double>());
+#endif
+}
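Note (not part of the patch): every subtest above repeats the same host-side pattern — allocate device buffers, copy the inputs up, wrap the raw pointers in Eigen::TensorMap, evaluate the tensor expression on an Eigen::GpuDevice backed by an Eigen::CudaStreamDevice, copy the result back asynchronously on the device's stream, synchronize, and verify against a host reference. The sketch below condenses that pattern into one standalone function; the name example_gpu_unary_op, the size n, the abs().sqrt() expression, and the 1e-6 tolerance are illustrative assumptions, while all CUDA and Eigen calls mirror those used in the test file itself.

// Minimal sketch of the host-side GPU test pattern used throughout
// cxx11_tensor_gpu.cu (assumed names: example_gpu_unary_op, n).
#define EIGEN_USE_GPU
#include <cassert>
#include <cmath>
#include <unsupported/Eigen/CXX11/Tensor>

void example_gpu_unary_op() {
  const int n = 64;
  Eigen::Tensor<float, 1> in(n);
  Eigen::Tensor<float, 1> out(n);
  in.setRandom();

  // Allocate device memory and copy the input up.
  const std::size_t bytes = n * sizeof(float);
  float* d_in;
  float* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);
  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  // CudaStreamDevice owns a CUDA stream; GpuDevice schedules Eigen
  // tensor expressions onto it.
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  // Wrap the raw device pointers so they can appear in tensor expressions.
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_out(d_out, n);

  // The expression is evaluated by kernels launched on gpu_device.
  gpu_out.device(gpu_device) = gpu_in.abs().sqrt();

  // Copy the result back on the same stream, then synchronize.
  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  // Check against the host-side reference computation.
  for (int i = 0; i < n; ++i) {
    assert(std::abs(out(i) - std::sqrt(std::abs(in(i)))) < 1e-6f);
  }

  cudaFree(d_in);
  cudaFree(d_out);
}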