diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-05-05 09:05:45 -0700 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-05-05 09:05:45 -0700 |
commit | f363e533aac5aac0d67fd5728b2e5b509c756bc8 (patch) | |
tree | b4ca443fba8fcb4bf454a07db83bed7424587c0a /unsupported/test | |
parent | 06d774bf5865cbecfde868b2554c177d95988552 (diff) |
Added tests for full contractions using thread pools and gpu devices.
Fixed a couple of issues in the corresponding code.
Diffstat (limited to 'unsupported/test')
-rw-r--r-- | unsupported/test/cxx11_tensor_contract_cuda.cu | 62 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_thread_pool.cpp | 39 |
2 files changed, 101 insertions, 0 deletions
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 6d1ef07f9..98ac180ef 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -84,6 +84,65 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
   cudaFree((void*)d_t_result);
 }
 
+
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size)
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 0, DataLayout> t_result;
+  Tensor<float, 0, DataLayout> t_result_gpu;
+  Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, m_size, k_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, k_size, n_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+      gpu_t_result(d_t_result);
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+    std::cout << "mismatch detected: " << t_result()
+              << " vs " << t_result_gpu() << std::endl;
+    assert(false);
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+
 template<int DataLayout>
 void test_cuda_contraction_m() {
   for (int k = 32; k < 256; k++) {
@@ -138,6 +197,9 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
   CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
 
+  CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
+
   CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
   CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
 
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index e46197464..5fd3f0bf1 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -234,6 +234,42 @@ void test_multithread_contraction_agrees_with_singlethread() {
 
 
 template<int DataLayout>
+void test_full_contraction() {
+  int contract_size1 = internal::random<int>(1, 500);
+  int contract_size2 = internal::random<int>(1, 500);
+
+  Tensor<float, 2, DataLayout> left(contract_size1,
+                                    contract_size2);
+  Tensor<float, 2, DataLayout> right(contract_size1,
+                                     contract_size2);
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
+
+  Tensor<float, 0, DataLayout> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 0, DataLayout> tp_result;
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  // if both of the values are very small, then do nothing (because the test will fail
+  // due to numerical precision issues when values are small)
+  if (fabs(st_result() - tp_result()) >= 1e-4) {
+    VERIFY_IS_APPROX(st_result(), tp_result());
+  }
+}
+
+template<int DataLayout>
 void test_multithreaded_reductions() {
   const int num_threads = internal::random<int>(3, 11);
   ThreadPool thread_pool(num_threads);
@@ -324,6 +360,9 @@ void test_cxx11_tensor_thread_pool()
   CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
   CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
 
+  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+
   CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
   CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
 