diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2015-03-31 09:08:08 -0700 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2015-03-31 09:08:08 -0700 |
commit | 678207e02a35e32f2098fcb2cb8a510c9ee191e1 (patch) | |
tree | 4ba38b8adaaa274cc3128afef6faa8010163e518 /unsupported | |
parent | 68d4afe985f994f10e64b76d1476f5f08f006350 (diff) |
Added regression tests for tensor convolutions
Diffstat (limited to 'unsupported')
-rw-r--r-- | unsupported/test/cxx11_tensor_convolution.cpp | 58 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_cuda.cpp | 313 |
2 files changed, 193 insertions, 178 deletions
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index 4672db463..3a12dae62 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -14,15 +14,16 @@ using Eigen::Tensor; using Eigen::DefaultDevice; +template <int DataLayout> static void test_evals() { - Tensor<float, 2> input(3, 3); - Tensor<float, 1> kernel(2); + Tensor<float, 2, DataLayout> input(3, 3); + Tensor<float, 1, DataLayout> kernel(2); input.setRandom(); kernel.setRandom(); - Tensor<float, 2> result(2,3); + Tensor<float, 2, DataLayout> result(2,3); result.setZero(); Eigen::array<Tensor<float, 2>::Index, 1> dims3({0}); @@ -41,15 +42,15 @@ static void test_evals() VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 } - +template <int DataLayout> static void test_expr() { - Tensor<float, 2> input(3, 3); - Tensor<float, 2> kernel(2, 2); + Tensor<float, 2, DataLayout> input(3, 3); + Tensor<float, 2, DataLayout> kernel(2, 2); input.setRandom(); kernel.setRandom(); - Tensor<float, 2> result(2,2); + Tensor<float, 2, DataLayout> result(2,2); Eigen::array<ptrdiff_t, 2> dims({0, 1}); result = input.convolve(kernel, dims); @@ -63,10 +64,10 @@ static void test_expr() input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); } - +template <int DataLayout> static void test_modes() { - Tensor<float, 1> input(3); - Tensor<float, 1> kernel(3); + Tensor<float, 1, DataLayout> input(3); + Tensor<float, 1, DataLayout> kernel(3); input(0) = 1.0f; input(1) = 2.0f; input(2) = 3.0f; @@ -74,13 +75,13 @@ static void test_modes() { kernel(1) = 1.0f; kernel(2) = 0.0f; - const Eigen::array<ptrdiff_t, 1> dims{{0}}; + const Eigen::array<ptrdiff_t, 1> dims({0}); Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding; // Emulate VALID mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(0, 0); - Tensor<float, 1> valid(1); + Tensor<float, 1, DataLayout> valid(1); valid = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(valid.dimension(0), 1); VERIFY_IS_APPROX(valid(0), 2.5f); @@ -88,7 +89,7 @@ static void test_modes() { // Emulate SAME mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(1, 1); - Tensor<float, 1> same(3); + Tensor<float, 1, DataLayout> same(3); same = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(same.dimension(0), 3); VERIFY_IS_APPROX(same(0), 1.0f); @@ -98,7 +99,7 @@ static void test_modes() { // Emulate FULL mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(2, 2); - Tensor<float, 1> full(5); + Tensor<float, 1, DataLayout> full(5); full = input.pad(padding).convolve(kernel, dims); VERIFY_IS_EQUAL(full.dimension(0), 5); VERIFY_IS_APPROX(full(0), 0.0f); @@ -108,18 +109,18 @@ static void test_modes() { VERIFY_IS_APPROX(full(4), 1.5f); } - +template <int DataLayout> static void test_strides() { - Tensor<float, 1> input(13); - Tensor<float, 1> kernel(3); + Tensor<float, 1, DataLayout> input(13); + Tensor<float, 1, DataLayout> kernel(3); input.setRandom(); kernel.setRandom(); - const Eigen::array<ptrdiff_t, 1> dims{{0}}; - const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}}; - const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}}; + const Eigen::array<ptrdiff_t, 1> dims({0}); + const Eigen::array<ptrdiff_t, 1> stride_of_3({3}); + const Eigen::array<ptrdiff_t, 1> stride_of_2({2}); - Tensor<float, 1> result; + Tensor<float, 1, DataLayout> result; result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); VERIFY_IS_EQUAL(result.dimension(0), 2); @@ -129,13 +130,14 @@ static void test_strides() { input(12)*kernel(2))); } - - - void test_cxx11_tensor_convolution() { - CALL_SUBTEST(test_evals()); - CALL_SUBTEST(test_expr()); - CALL_SUBTEST(test_modes()); - CALL_SUBTEST(test_strides()); + CALL_SUBTEST(test_evals<ColMajor>()); + CALL_SUBTEST(test_evals<RowMajor>()); + CALL_SUBTEST(test_expr<ColMajor>()); + CALL_SUBTEST(test_expr<RowMajor>()); + CALL_SUBTEST(test_modes<ColMajor>()); + CALL_SUBTEST(test_modes<RowMajor>()); + CALL_SUBTEST(test_strides<ColMajor>()); + CALL_SUBTEST(test_strides<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp index 8c1ca1bf8..78934165f 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -117,11 +117,10 @@ void test_cuda_elementwise() } } - void test_cuda_reduction() { - Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113)); - Tensor<float, 2> out(Eigen::array<int, 2>(72,97)); + Tensor<float, 4> in1(72,53,97,113); + Tensor<float, 2> out(72,97); in1.setRandom(); std::size_t in1_bytes = in1.size() * sizeof(float); @@ -138,8 +137,8 @@ void test_cuda_reduction() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113)); - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97)); + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); array<int, 2> reduction_axis; reduction_axis[0] = 1; @@ -156,10 +155,10 @@ void test_cuda_reduction() for (int k = 0; k < 53; ++k) { for (int l = 0; l < 113; ++l) { expected = - std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l))); + std::max<float>(expected, in1(i, k, j, l)); } } - VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected); + VERIFY_IS_APPROX(out(i,j), expected); } } } @@ -170,7 +169,7 @@ static void test_cuda_contraction() // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31)); + Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31); Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1)); Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1)); @@ -196,12 +195,9 @@ static void test_cuda_contraction() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > - gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31)); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > - gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1)); - Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > - gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1)); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf; MapXf m_left(t_left.data(), 300, 93); @@ -226,11 +222,12 @@ static void test_cuda_contraction() } } +template<int DataLayout> static void test_cuda_convolution_1d() { - Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); - Tensor<float, 1> kernel(Eigen::array<int, 1>(4)); - Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137)); + Tensor<float, 4, DataLayout> input(74,37,11,137); + Tensor<float, 1, DataLayout> kernel(4); + Tensor<float, 4, DataLayout> out(74,34,11,137); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -252,9 +249,9 @@ static void test_cuda_convolution_1d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); - Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4)); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137)); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); + Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137); Eigen::array<int, 1> dims(1); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); @@ -266,11 +263,9 @@ static void test_cuda_convolution_1d() for (int j = 0; j < 34; ++j) { for (int k = 0; k < 11; ++k) { for (int l = 0; l < 137; ++l) { - const float result = out(Eigen::array<int, 4>(i,j,k,l)); - const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) + - input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) + - input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) + - input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3)); + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + + input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); VERIFY_IS_APPROX(result, expected); } } @@ -278,12 +273,11 @@ static void test_cuda_convolution_1d() } } - -static void test_cuda_convolution_2d() +static void test_cuda_convolution_inner_dim_col_major_1d() { - Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); - Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4)); - Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137)); + Tensor<float, 4, ColMajor> input(74,9,11,7); + Tensor<float, 1, ColMajor> kernel(4); + Tensor<float, 4, ColMajor> out(71,9,11,7); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -305,46 +299,35 @@ static void test_cuda_convolution_2d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); - Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4)); - Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137)); + Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); + Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4); + Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7); - Eigen::array<int, 2> dims(1,2); + Eigen::array<int, 1> dims(0); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 137; ++l) { - const float result = out(Eigen::array<int, 4>(i,j,k,l)); - const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) + - input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) + - input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) + - input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) + - input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) + - input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) + - input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) + - input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) + - input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) + - input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) + - input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) + - input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3)); - VERIFY_IS_APPROX(result, expected); + for (int i = 0; i < 71; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 7; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + + input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); } } } } } - -static void test_cuda_convolution_3d() +static void test_cuda_convolution_inner_dim_row_major_1d() { - Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17)); - Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2)); - Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17)); + Tensor<float, 4, RowMajor> input(7,9,11,74); + Tensor<float, 1, RowMajor> kernel(4); + Tensor<float, 4, RowMajor> out(7,9,11,71); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -366,139 +349,166 @@ static void test_cuda_convolution_3d() assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2)); - Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17)); + Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); + Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4); + Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71); - Eigen::array<int, 3> dims(1,2,3); + Eigen::array<int, 1> dims(3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 74; ++i) { - for (int j = 0; j < 35; ++j) { - for (int k = 0; k < 8; ++k) { - for (int l = 0; l < 136; ++l) { - for (int m = 0; m < 17; ++m) { - const float result = out(Eigen::array<int, 5>(i,j,k,l,m)); - const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) + - input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) + - input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) + - input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) + - input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) + - input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) + - input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) + - input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) + - input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) + - input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) + - input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) + - input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) + - input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) + - input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) + - input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) + - input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) + - input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) + - input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) + - input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) + - input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) + - input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) + - input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) + - input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) + - input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1)); - VERIFY_IS_APPROX(result, expected); - } + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 71; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + + input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); + VERIFY_IS_APPROX(result, expected); } } } } } -static float* CudaCopyFloat(float* data, int size) { - const int nbytes = size * sizeof(float); - float* result = NULL; - if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) { - return NULL; - } else { - if (data != NULL) { - cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice); - } - return result; - } -} - -static void test_cuda_constant_broadcast() +template<int DataLayout> +static void test_cuda_convolution_2d() { + Tensor<float, 4, DataLayout> input(74,37,11,137); + Tensor<float, 2, DataLayout> kernel(3,4); + Tensor<float, 4, DataLayout> out(74,35,8,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + cudaStream_t stream; assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Tensor<float, 1> t1(10); - for (int i = 0; i < 10; ++i) { - t1(i) = 10.0f * i; - } - float* t1_cuda = CudaCopyFloat(t1.data(), t1.size()); - Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10); - - Tensor<float, 1> t2(1); - t2 = t2.constant(20.0f); - float* t2_cuda = CudaCopyFloat(t2.data(), t2.size()); - Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137); - float* t3_cuda = CudaCopyFloat(NULL, 10); - Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10); - - t3_gpu.device(gpu_device) = - t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10)); + Eigen::array<int, 2> dims(1,2); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - Eigen::Tensor<float, 1> t3(10); - cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float), - cudaMemcpyDeviceToHost); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 10; ++i) { - VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+2,k+0,l) * kernel(2,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1) + + input(i,j+2,k+1,l) * kernel(2,1) + + input(i,j+0,k+2,l) * kernel(0,2) + + input(i,j+1,k+2,l) * kernel(1,2) + + input(i,j+2,k+2,l) * kernel(2,2) + + input(i,j+0,k+3,l) * kernel(0,3) + + input(i,j+1,k+3,l) * kernel(1,3) + + input(i,j+2,k+3,l) * kernel(2,3); + VERIFY_IS_APPROX(result, expected); + } + } + } } } - -void test_cuda_cast() +template<int DataLayout> +static void test_cuda_convolution_3d() { - Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97)); - Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); - in.setRandom(); + Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17)); + Tensor<float, 3, DataLayout> kernel(3,4,2); + Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); - std::size_t in_bytes = in.size() * sizeof(double); + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); std::size_t out_bytes = out.size() * sizeof(float); - double* d_in; + float* d_input; + float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); cudaMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); cudaStream_t stream; assert(cudaStreamCreate(&stream) == cudaSuccess); Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17); - gpu_out.device(gpu_device) = gpu_in.template cast<float>(); + Eigen::array<int, 3> dims(1,2,3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); - for (int i = 0; i < 72; ++i) { - for (int j = 0; j < 53; ++j) { - for (int k = 0; k < 97; ++k) { - VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k)))); + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 17; ++m) { + const float result = out(i,j,k,l,m); + const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + + input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + + input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + + input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + + input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + + input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + + input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + + input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + + input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + + input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + + input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + + input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + + input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + + input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + + input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + + input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + + input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + + input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + + input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + + input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + + input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + + input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + + input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + + input(i,j+2,k+3,l+1,m) * kernel(2,3,1); + VERIFY_IS_APPROX(result, expected); + } + } } } } } - void test_cxx11_tensor_cuda() { CALL_SUBTEST(test_cuda_elementwise_small()); @@ -506,9 +516,12 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST(test_cuda_reduction()); CALL_SUBTEST(test_cuda_contraction<ColMajor>()); CALL_SUBTEST(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d()); - CALL_SUBTEST(test_cuda_convolution_2d()); - CALL_SUBTEST(test_cuda_convolution_3d()); - CALL_SUBTEST(test_cuda_constant_broadcast()); - CALL_SUBTEST(test_cuda_cast()); + CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>()); + CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>()); + CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>()); + CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>()); } |