1 files changed, 76 insertions, 29 deletions
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 284b46803..bf216587a 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -10,19 +10,65 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
 
+void test_cuda_nullary() {
+  Tensor<float, 1, 0, int> in1(2);
+  Tensor<float, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  cudaMalloc((void**)(&d_in1), tensor_bytes);
+  cudaMalloc((void**)(&d_in2), tensor_bytes);
+  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+  gpu_in2.device(gpu_device) = gpu_in2.random();
+
+  Tensor<float, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), 3.14f);
+    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
 void test_cuda_elementwise_small() {
-  Tensor<float, 1> in1(Eigen::array<int, 1>(2));
-  Tensor<float, 1> in2(Eigen::array<int, 1>(2));
-  Tensor<float, 1> out(Eigen::array<int, 1>(2));
+  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
   in1.setRandom();
   in2.setRandom();
 
@@ -44,11 +90,11 @@ void test_cuda_elementwise_small() {
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
-      d_in1, Eigen::array<int, 1>(2));
+      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
-      d_in2, Eigen::array<int, 1>(2));
+      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
-      d_out, Eigen::array<int, 1>(2));
+      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
 
@@ -58,8 +104,8 @@ void test_cuda_elementwise_small() {
 
   for (int i = 0; i < 2; ++i) {
     VERIFY_IS_APPROX(
-        out(Eigen::array<int, 1>(i)),
-        in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
+        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
+        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
   }
 
   cudaFree(d_in1);
@@ -69,10 +115,10 @@ void test_cuda_elementwise_small() {
 
 void test_cuda_elementwise()
 {
-  Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
   in1.setRandom();
   in2.setRandom();
   in3.setRandom();
@@ -98,10 +144,10 @@ void test_cuda_elementwise()
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
 
@@ -111,7 +157,7 @@ void test_cuda_elementwise()
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 53; ++j) {
       for (int k = 0; k < 97; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k)));
+        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
       }
     }
   }
@@ -181,7 +227,7 @@ void test_cuda_reduction()
   Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
 
-  array<int, 2> reduction_axis;
+  array<Eigen::DenseIndex, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 3;
 
@@ -214,8 +260,8 @@ void test_cuda_contraction()
   // more than 30 * 1024, which is the number of threads in blocks on
   // a 15 SM GK110 GPU
   Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
-  Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
-  Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
 
   t_left.setRandom();
   t_right.setRandom();
@@ -299,7 +345,7 @@ void test_cuda_convolution_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
 
-  Eigen::array<int, 1> dims(1);
+  Eigen::array<Eigen::DenseIndex, 1> dims(1);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -352,7 +398,7 @@ void test_cuda_convolution_inner_dim_col_major_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
 
-  Eigen::array<int, 1> dims(0);
+  Eigen::array<Eigen::DenseIndex, 1> dims(0);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -405,7 +451,7 @@ void test_cuda_convolution_inner_dim_row_major_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
 
-  Eigen::array<int, 1> dims(3);
+  Eigen::array<Eigen::DenseIndex, 1> dims(3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -459,7 +505,7 @@ void test_cuda_convolution_2d()
   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
 
-  Eigen::array<int, 2> dims(1,2);
+  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -496,9 +542,9 @@ void test_cuda_convolution_2d()
 template<int DataLayout>
 void test_cuda_convolution_3d()
 {
-  Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
+  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
   Tensor<float, 3, DataLayout> kernel(3,4,2);
-  Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17));
+  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
   input = input.constant(10.0f) + input.random();
   kernel = kernel.constant(7.0f) + kernel.random();
 
@@ -523,7 +569,7 @@ void test_cuda_convolution_3d()
   Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
 
-  Eigen::array<int, 3> dims(1,2,3);
+  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -1168,6 +1214,7 @@ void test_cuda_betainc()
 
 void test_cxx11_tensor_cuda()
 {
+  CALL_SUBTEST_1(test_cuda_nullary());
   CALL_SUBTEST_1(test_cuda_elementwise_small());
   CALL_SUBTEST_1(test_cuda_elementwise());
   CALL_SUBTEST_1(test_cuda_props());