Add special functions to Eigen: lgamma, erf, erfc.

Includes CUDA support and unit tests.
author: Eugene Brevdo <ebrevdo@gmail.com> 2015-12-07 15:24:49 -0800
committer: Eugene Brevdo <ebrevdo@gmail.com> 2015-12-07 15:24:49 -0800
commit: fa4f933c0fe65eda6a051f978db12210f11f5cdb (patch)
tree: cab17ee4bbffd52a778b00ec40770e7fa4b361cf /unsupported/test/cxx11_tensor_cuda.cpp
parent: 7dfe75f445835baff18bbe82ba7253f7563cbdc6 (diff)
1 files changed, 139 insertions, 0 deletions
diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp
index 5ff082a3a..49e1894ab 100644
--- a/unsupported/test/cxx11_tensor_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_cuda.cpp
@@ -507,6 +507,115 @@ static void test_cuda_convolution_3d()
   }
 }
 
+
+// Checks Eigen's GPU lgamma() against std::lgamma, element by element, on a
+// 72x97 tensor of random values scaled by stddev.
+template <typename Scalar>
+void test_cuda_lgamma(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+  std::size_t bytes = in.size() * sizeof(Scalar);
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+  gpu_out.device(gpu_device) = gpu_in.lgamma();
+
+  // Keep the CUDA calls outside assert(): NDEBUG removes an assert's whole argument.
+  cudaError_t copy_status = cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream());
+  cudaError_t sync_status = cudaStreamSynchronize(gpu_device.stream());
+  assert(copy_status == cudaSuccess && sync_status == cudaSuccess);
+  (void)copy_status; (void)sync_status;  // otherwise unused when asserts are compiled out
+  cudaFree(d_in);   // the device buffers are not read again after the synchronize above
+  cudaFree(d_out);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) { VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); }
+  }
+}
+
+// Checks Eigen's GPU erf() against std::erf, element by element, on a
+// 72x97 tensor of random values scaled by stddev.
+template <typename Scalar>
+void test_cuda_erf(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+  std::size_t bytes = in.size() * sizeof(Scalar);
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+  gpu_out.device(gpu_device) = gpu_in.erf();
+
+  // Keep the CUDA calls outside assert(): NDEBUG removes an assert's whole argument.
+  cudaError_t copy_status = cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream());
+  cudaError_t sync_status = cudaStreamSynchronize(gpu_device.stream());
+  assert(copy_status == cudaSuccess && sync_status == cudaSuccess);
+  (void)copy_status; (void)sync_status;  // otherwise unused when asserts are compiled out
+  cudaFree(d_in);   // the device buffers are not read again after the synchronize above
+  cudaFree(d_out);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) { VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); }
+  }
+}
+
+// Checks Eigen's GPU erfc() against std::erfc, element by element, on a
+// 72x97 tensor of random values scaled by stddev.
+template <typename Scalar>
+void test_cuda_erfc(const Scalar stddev)
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+  std::size_t bytes = in.size() * sizeof(Scalar);
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+  gpu_out.device(gpu_device) = gpu_in.erfc();
+
+  // Keep the CUDA calls outside assert(): NDEBUG removes an assert's whole argument.
+  cudaError_t copy_status = cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream());
+  cudaError_t sync_status = cudaStreamSynchronize(gpu_device.stream());
+  assert(copy_status == cudaSuccess && sync_status == cudaSuccess);
+  (void)copy_status; (void)sync_status;  // otherwise unused when asserts are compiled out
+  cudaFree(d_in);   // the device buffers are not read again after the synchronize above
+  cudaFree(d_out);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) { VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); }
+  }
+}
+
 void test_cxx11_tensor_cuda()
 {
   CALL_SUBTEST(test_cuda_elementwise_small());
@@ -522,4 +631,34 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>());
   CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>());
   CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>());
+  CALL_SUBTEST(test_cuda_lgamma<float>(1.0f));
+  CALL_SUBTEST(test_cuda_lgamma<float>(100.0f));
+  CALL_SUBTEST(test_cuda_lgamma<float>(0.01f));
+  CALL_SUBTEST(test_cuda_lgamma<float>(0.001f));
+  CALL_SUBTEST(test_cuda_erf<float>(1.0f));
+  CALL_SUBTEST(test_cuda_erf<float>(100.0f));
+  CALL_SUBTEST(test_cuda_erf<float>(0.01f));
+  CALL_SUBTEST(test_cuda_erf<float>(0.001f));
+  CALL_SUBTEST(test_cuda_erfc<float>(1.0f));
+  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
+  CALL_SUBTEST(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST(test_cuda_erfc<float>(0.01f));
+  CALL_SUBTEST(test_cuda_erfc<float>(0.001f));
+  CALL_SUBTEST(test_cuda_tanh<double>(1.0));
+  CALL_SUBTEST(test_cuda_tanh<double>(100.0));
+  CALL_SUBTEST(test_cuda_tanh<double>(0.01));
+  CALL_SUBTEST(test_cuda_tanh<double>(0.001));
+  CALL_SUBTEST(test_cuda_lgamma<double>(1.0));
+  CALL_SUBTEST(test_cuda_lgamma<double>(100.0));
+  CALL_SUBTEST(test_cuda_lgamma<double>(0.01));
+  CALL_SUBTEST(test_cuda_lgamma<double>(0.001));
+  CALL_SUBTEST(test_cuda_erf<double>(1.0));
+  CALL_SUBTEST(test_cuda_erf<double>(100.0));
+  CALL_SUBTEST(test_cuda_erf<double>(0.01));
+  CALL_SUBTEST(test_cuda_erf<double>(0.001));
+  CALL_SUBTEST(test_cuda_erfc<double>(1.0));
+  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
+  CALL_SUBTEST(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST(test_cuda_erfc<double>(0.01));
+  CALL_SUBTEST(test_cuda_erfc<double>(0.001));
 }
author	Eugene Brevdo <ebrevdo@gmail.com>	2015-12-07 15:24:49 -0800
committer	Eugene Brevdo <ebrevdo@gmail.com>	2015-12-07 15:24:49 -0800
commit	fa4f933c0fe65eda6a051f978db12210f11f5cdb (patch)
tree	cab17ee4bbffd52a778b00ec40770e7fa4b361cf /unsupported/test/cxx11_tensor_cuda.cpp
parent	7dfe75f445835baff18bbe82ba7253f7563cbdc6 (diff)