diff options
Diffstat (limited to 'unsupported/test')
34 files changed, 1760 insertions, 284 deletions
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 937acc432..f75bf9798 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -1,3 +1,17 @@ +# generate split test header file only if it does not yet exist +# in order to prevent a rebuild everytime cmake is configured +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "") + foreach(i RANGE 1 999) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h + "#ifdef EIGEN_TEST_PART_${i}\n" + "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n" + "#else\n" + "#define CALL_SUBTEST_${i}(FUNC)\n" + "#endif\n\n" + ) + endforeach() +endif() set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported") add_custom_target(BuildUnsupported) @@ -99,61 +113,104 @@ ei_add_test(kronecker_product) if(EIGEN_TEST_CXX11) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. 
- ei_add_test(cxx11_meta "-std=c++0x") - ei_add_test(cxx11_tensor_simple "-std=c++0x") -# ei_add_test(cxx11_tensor_symmetry "-std=c++0x") - ei_add_test(cxx11_tensor_assign "-std=c++0x") - ei_add_test(cxx11_tensor_dimension "-std=c++0x") - ei_add_test(cxx11_tensor_index_list "-std=c++0x") - ei_add_test(cxx11_tensor_mixed_indices "-std=c++0x") - ei_add_test(cxx11_tensor_comparisons "-std=c++0x") - ei_add_test(cxx11_tensor_contraction "-std=c++0x") - ei_add_test(cxx11_tensor_convolution "-std=c++0x") - ei_add_test(cxx11_tensor_expr "-std=c++0x") - ei_add_test(cxx11_tensor_math "-std=c++0x") - ei_add_test(cxx11_tensor_forced_eval "-std=c++0x") - ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") - ei_add_test(cxx11_tensor_const "-std=c++0x") - ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") - ei_add_test(cxx11_tensor_of_complex "-std=c++0x") - ei_add_test(cxx11_tensor_of_strings "-std=c++0x") - ei_add_test(cxx11_tensor_uint128 "-std=c++0x") - ei_add_test(cxx11_tensor_intdiv "-std=c++0x") - ei_add_test(cxx11_tensor_lvalue "-std=c++0x") - ei_add_test(cxx11_tensor_map "-std=c++0x") - ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") - ei_add_test(cxx11_tensor_chipping "-std=c++0x") - ei_add_test(cxx11_tensor_concatenation "-std=c++0x") - ei_add_test(cxx11_tensor_inflation "-std=c++0x") - ei_add_test(cxx11_tensor_morphing "-std=c++0x") - ei_add_test(cxx11_tensor_padding "-std=c++0x") - ei_add_test(cxx11_tensor_patch "-std=c++0x") - ei_add_test(cxx11_tensor_image_patch "-std=c++0x") - ei_add_test(cxx11_tensor_volume_patch "-std=c++0x") - ei_add_test(cxx11_tensor_reduction "-std=c++0x") - ei_add_test(cxx11_tensor_argmax "-std=c++0x") - ei_add_test(cxx11_tensor_shuffling "-std=c++0x") - ei_add_test(cxx11_tensor_striding "-std=c++0x") + set(CMAKE_CXX_STANDARD 11) + + ei_add_test(cxx11_float16) + ei_add_test(cxx11_meta) + ei_add_test(cxx11_tensor_simple) +# ei_add_test(cxx11_tensor_symmetry) + ei_add_test(cxx11_tensor_assign) + ei_add_test(cxx11_tensor_dimension) + 
ei_add_test(cxx11_tensor_index_list) + ei_add_test(cxx11_tensor_mixed_indices) + ei_add_test(cxx11_tensor_comparisons) + ei_add_test(cxx11_tensor_contraction) + ei_add_test(cxx11_tensor_convolution) + ei_add_test(cxx11_tensor_expr) + ei_add_test(cxx11_tensor_math) + ei_add_test(cxx11_tensor_forced_eval) + ei_add_test(cxx11_tensor_fixed_size) + ei_add_test(cxx11_tensor_const) + ei_add_test(cxx11_tensor_of_const_values) + ei_add_test(cxx11_tensor_of_complex) + ei_add_test(cxx11_tensor_of_strings) + ei_add_test(cxx11_tensor_intdiv) + ei_add_test(cxx11_tensor_lvalue) + ei_add_test(cxx11_tensor_map) + ei_add_test(cxx11_tensor_broadcasting) + ei_add_test(cxx11_tensor_chipping) + ei_add_test(cxx11_tensor_concatenation) + ei_add_test(cxx11_tensor_inflation) + ei_add_test(cxx11_tensor_morphing) + ei_add_test(cxx11_tensor_padding) + ei_add_test(cxx11_tensor_patch) + ei_add_test(cxx11_tensor_image_patch) + ei_add_test(cxx11_tensor_volume_patch) + ei_add_test(cxx11_tensor_reduction) + ei_add_test(cxx11_tensor_argmax) + ei_add_test(cxx11_tensor_shuffling) + ei_add_test(cxx11_tensor_striding) + ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") - ei_add_test(cxx11_tensor_ref "-std=c++0x") - ei_add_test(cxx11_tensor_random "-std=c++0x") - ei_add_test(cxx11_tensor_casts "-std=c++0x") - ei_add_test(cxx11_tensor_reverse "-std=c++0x") - ei_add_test(cxx11_tensor_layout_swap "-std=c++0x") - ei_add_test(cxx11_tensor_io "-std=c++0x") - ei_add_test(cxx11_tensor_generator "-std=c++0x") - ei_add_test(cxx11_tensor_custom_op "-std=c++0x") - ei_add_test(cxx11_tensor_custom_index "-std=c++0x") - ei_add_test(cxx11_tensor_sugar "-std=c++0x") - ei_add_test(cxx11_tensor_fft "-std=c++0x") - ei_add_test(cxx11_tensor_ifft "-std=c++0x") - - # These tests needs nvcc -# ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_contract_cuda 
"-std=c++0x") -# ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_random_cuda "-std=c++0x") -# ei_add_test(cxx11_tensor_argmax_cuda "-std=c++0x") + ei_add_test(cxx11_tensor_ref) + ei_add_test(cxx11_tensor_random) + ei_add_test(cxx11_tensor_casts) + ei_add_test(cxx11_tensor_roundings) + ei_add_test(cxx11_tensor_reverse) + ei_add_test(cxx11_tensor_layout_swap) + ei_add_test(cxx11_tensor_io) + ei_add_test(cxx11_tensor_generator) + ei_add_test(cxx11_tensor_custom_op) + ei_add_test(cxx11_tensor_custom_index) + ei_add_test(cxx11_tensor_sugar) + ei_add_test(cxx11_tensor_fft) + ei_add_test(cxx11_tensor_ifft) + ei_add_test(cxx11_tensor_empty) + + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + # This test requires __uint128_t which is only available on 64bit systems + ei_add_test(cxx11_tensor_uint128) + endif() + +endif() + +# These tests needs nvcc +find_package(CUDA 7.0) +if(CUDA_FOUND AND EIGEN_TEST_CUDA) + # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor + # and -fno-check-new flags since they trigger thousands of compilation warnings + # in the CUDA runtime + string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + + message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) + + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) + endif() + if(EIGEN_TEST_CUDA_CLANG) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}") + endif() + + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") + 
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + + ei_add_test(cxx11_tensor_device) + ei_add_test(cxx11_tensor_cuda) + ei_add_test(cxx11_tensor_contract_cuda) + ei_add_test(cxx11_tensor_reduction_cuda) + ei_add_test(cxx11_tensor_argmax_cuda) + ei_add_test(cxx11_tensor_cast_float16_cuda) + + # The random number generation code requires arch 3.5 or greater. + if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) + ei_add_test(cxx11_tensor_random_cuda) + endif() + + ei_add_test(cxx11_tensor_of_float16_cuda) + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp index 724ea7b5b..6a5ed057f 100644 --- a/unsupported/test/NonLinearOptimization.cpp +++ b/unsupported/test/NonLinearOptimization.cpp @@ -14,6 +14,9 @@ using std::sqrt; +// tolerance for checking number of iterations +#define LM_EVAL_COUNT_TOL 4/3 + int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag) { /* subroutine fcn for chkder example. 
*/ @@ -1023,7 +1026,8 @@ void testNistLanczos1(void) VERIFY_IS_EQUAL(lm.njev, 72); // check norm^2 std::cout.precision(30); - VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4290986055242372e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats + std::cout << lm.fvec.squaredNorm() << "\n"; + VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); VERIFY_IS_APPROX(x[1], 1.0000000001E+00); @@ -1044,7 +1048,7 @@ void testNistLanczos1(void) VERIFY_IS_EQUAL(lm.nfev, 9); VERIFY_IS_EQUAL(lm.njev, 8); // check norm^2 - VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.430571737783119393e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats + VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); VERIFY_IS_APPROX(x[1], 1.0000000001E+00); @@ -1354,8 +1358,12 @@ void testNistMGH17(void) // check return value VERIFY_IS_EQUAL(info, 2); - VERIFY(lm.nfev < 650); // 602 - VERIFY(lm.njev < 600); // 545 + ++g_test_level; + VERIFY_IS_EQUAL(lm.nfev, 602); // 602 + VERIFY_IS_EQUAL(lm.njev, 545); // 545 + --g_test_level; + VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL); /* * Second try diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp index 1aa1b3d2d..374f86df9 100644 --- a/unsupported/test/autodiff.cpp +++ b/unsupported/test/autodiff.cpp @@ -16,7 +16,7 @@ EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y) using namespace std; // return x+std::sin(y); EIGEN_ASM_COMMENT("mybegin"); - return static_cast<Scalar>(x*2 - pow(x,2) + 2*sqrt(y*y) - 4 * sin(x) + 2 * cos(y) - exp(-0.5*x*x)); + return static_cast<Scalar>(x*2 - 1 + pow(1+x,2) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(-0.5*x*x+0)); //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2; EIGEN_ASM_COMMENT("myend"); } diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index 
ba4b5aec4..c631c734a 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -30,6 +30,10 @@ template<typename Scalar> void check_atan2() VERIFY_IS_APPROX(res.value(), x.value()); VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); + + res = atan2(r*s+0, r*c+0); + VERIFY_IS_APPROX(res.value(), x.value()); + VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); } diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp new file mode 100644 index 000000000..2dc0872d8 --- /dev/null +++ b/unsupported/test/cxx11_float16.cpp @@ -0,0 +1,155 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_float16 + +#include "main.h" +#include <Eigen/src/Core/arch/CUDA/Half.h> + +using Eigen::half; + +void test_conversion() +{ + // Conversion from float. + VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5f).x, 0x3800); + VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555); + VERIFY_IS_EQUAL(half(0.0f).x, 0x0000); + VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000); + VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff); + VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity. + + // Denormals. + VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001); + VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001); + VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); + + // Verify round-to-nearest-even behavior. + float val1 = float(half(__half{0x3c00})); + float val2 = float(half(__half{0x3c01})); + float val3 = float(half(__half{0x3c02})); + VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02); + + // Conversion from int. 
+ VERIFY_IS_EQUAL(half(-1).x, 0xbc00); + VERIFY_IS_EQUAL(half(0).x, 0x0000); + VERIFY_IS_EQUAL(half(1).x, 0x3c00); + VERIFY_IS_EQUAL(half(2).x, 0x4000); + VERIFY_IS_EQUAL(half(3).x, 0x4200); + + // Conversion from bool. + VERIFY_IS_EQUAL(half(false).x, 0x0000); + VERIFY_IS_EQUAL(half(true).x, 0x3c00); + + // Conversion to float. + VERIFY_IS_EQUAL(float(half(__half{0x0000})), 0.0f); + VERIFY_IS_EQUAL(float(half(__half{0x3c00})), 1.0f); + + // Denormals. + VERIFY_IS_APPROX(float(half(__half{0x8001})), -5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half{0x0001})), 5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half{0x0002})), 1.19209e-07f); + + // NaNs and infinities. + VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. + VERIFY(!(numext::isnan)(float(half(0.0f)))); + VERIFY((numext::isinf)(float(half(__half{0xfc00})))); + VERIFY((numext::isnan)(float(half(__half{0xfc01})))); + VERIFY((numext::isinf)(float(half(__half{0x7c00})))); + VERIFY((numext::isnan)(float(half(__half{0x7c01})))); + +#if !EIGEN_COMP_MSVC + // Visual Studio errors out on divisions by 0 + VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); + VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); + VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); +#endif + + // Exactly same checks as above, just directly on the half representation. 
+ VERIFY(!(numext::isinf)(half(__half{0x7bff}))); + VERIFY(!(numext::isnan)(half(__half{0x0000}))); + VERIFY((numext::isinf)(half(__half{0xfc00}))); + VERIFY((numext::isnan)(half(__half{0xfc01}))); + VERIFY((numext::isinf)(half(__half{0x7c00}))); + VERIFY((numext::isnan)(half(__half{0x7c01}))); + +#if !EIGEN_COMP_MSVC + // Visual Studio errors out on divisions by 0 + VERIFY((numext::isnan)(half(0.0 / 0.0))); + VERIFY((numext::isinf)(half(1.0 / 0.0))); + VERIFY((numext::isinf)(half(-1.0 / 0.0))); +#endif +} + +void test_arithmetic() +{ + VERIFY_IS_EQUAL(float(half(2) + half(2)), 4); + VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0); + VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f); + VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f); + VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f); + VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f); + VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f); +} + +void test_comparison() +{ + VERIFY(half(1.0f) > half(0.5f)); + VERIFY(half(0.5f) < half(1.0f)); + VERIFY(!(half(1.0f) < half(0.5f))); + VERIFY(!(half(0.5f) > half(1.0f))); + + VERIFY(!(half(4.0f) > half(4.0f))); + VERIFY(!(half(4.0f) < half(4.0f))); + + VERIFY(!(half(0.0f) < half(-0.0f))); + VERIFY(!(half(-0.0f) < half(0.0f))); + VERIFY(!(half(0.0f) > half(-0.0f))); + VERIFY(!(half(-0.0f) > half(0.0f))); + + VERIFY(half(0.2f) > half(-1.0f)); + VERIFY(half(-1.0f) < half(0.2f)); + VERIFY(half(-16.0f) < half(-15.0f)); + + VERIFY(half(1.0f) == half(1.0f)); + VERIFY(half(1.0f) != half(2.0f)); + + // Comparisons with NaNs and infinities. 
+ VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); + VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); + + VERIFY(!(half(1.0) == half(0.0 / 0.0))); + VERIFY(!(half(1.0) < half(0.0 / 0.0))); + VERIFY(!(half(1.0) > half(0.0 / 0.0))); + VERIFY(half(1.0) != half(0.0 / 0.0)); + + VERIFY(half(1.0) < half(1.0 / 0.0)); + VERIFY(half(1.0) > half(-1.0 / 0.0)); +} + +void test_functions() +{ + VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); + VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); + + VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); + VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); + + VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f); + VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); +} + +void test_cxx11_float16() +{ + CALL_SUBTEST(test_conversion()); + CALL_SUBTEST(test_arithmetic()); + CALL_SUBTEST(test_comparison()); + CALL_SUBTEST(test_functions()); +} diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index 4f45e1dd3..ecac3add1 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -9,6 +9,7 @@ #include "main.h" +#include <array> #include <Eigen/CXX11/Core> using Eigen::internal::is_same; @@ -249,8 +250,8 @@ static void test_is_same_gf() { VERIFY((!is_same_gf<dummy_a, dummy_b>::value)); VERIFY((!!is_same_gf<dummy_a, dummy_a>::value)); - VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), 0); - VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), 0); + VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), false); + VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), false); } static void test_apply_op() @@ -293,8 +294,8 @@ static void test_arg_reductions() static void test_array_reverse_and_reduce() { - std::array<int, 6> a{{4, 8, 15, 16, 23, 42}}; - std::array<int, 6> b{{42, 23, 16, 15, 8, 4}}; + array<int, 6> a{{4, 8, 15, 16, 23, 42}}; + array<int, 6> b{{42, 23, 16, 15, 8, 4}}; // there is no 
operator<< for std::array, so VERIFY_IS_EQUAL will // not compile @@ -308,11 +309,11 @@ static void test_array_reverse_and_reduce() static void test_array_zip_and_apply() { - std::array<int, 6> a{{4, 8, 15, 16, 23, 42}}; - std::array<int, 6> b{{0, 1, 2, 3, 4, 5}}; - std::array<int, 6> c{{4, 9, 17, 19, 27, 47}}; - std::array<int, 6> d{{0, 8, 30, 48, 92, 210}}; - std::array<int, 6> e{{0, 2, 4, 6, 8, 10}}; + array<int, 6> a{{4, 8, 15, 16, 23, 42}}; + array<int, 6> b{{0, 1, 2, 3, 4, 5}}; + array<int, 6> c{{4, 9, 17, 19, 27, 47}}; + array<int, 6> d{{0, 8, 30, 48, 92, 210}}; + array<int, 6> e{{0, 2, 4, 6, 8, 10}}; VERIFY((array_zip<sum_op>(a, b) == c)); VERIFY((array_zip<product_op>(a, b) == d)); @@ -325,8 +326,8 @@ static void test_array_zip_and_apply() static void test_array_misc() { - std::array<int, 3> a3{{1, 1, 1}}; - std::array<int, 6> a6{{2, 2, 2, 2, 2, 2}}; + array<int, 3> a3{{1, 1, 1}}; + array<int, 6> a6{{2, 2, 2, 2, 2, 2}}; VERIFY((repeat<3, int>(1) == a3)); VERIFY((repeat<6, int>(2) == a6)); diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cpp b/unsupported/test/cxx11_tensor_argmax_cuda.cu index d37490d15..41ccbe974 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cpp +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// TODO(mdevin): Free the cuda memory. 
+#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU @@ -56,6 +56,10 @@ void test_cuda_simple_argmax() VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); + + cudaFree(d_in); + cudaFree(d_out_max); + cudaFree(d_out_min); } template <int DataLayout> @@ -141,6 +145,9 @@ void test_cuda_argmax_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } @@ -227,15 +234,18 @@ void test_cuda_argmin_dim() // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } + + cudaFree(d_in); + cudaFree(d_out); } } void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_simple_argmax<RowMajor>()); - CALL_SUBTEST(test_cuda_simple_argmax<ColMajor>()); - CALL_SUBTEST(test_cuda_argmax_dim<RowMajor>()); - CALL_SUBTEST(test_cuda_argmax_dim<ColMajor>()); - CALL_SUBTEST(test_cuda_argmin_dim<RowMajor>()); - CALL_SUBTEST(test_cuda_argmin_dim<ColMajor>()); + CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); + CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); + CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); + CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); + CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); + CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); } diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index d16aaf847..e5cf61fe1 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -29,8 +29,8 @@ static void test_1d() int row_major[6]; memset(col_major, 0, 6*sizeof(int)); memset(row_major, 0, 6*sizeof(int)); - TensorMap<Tensor<int, 1>> vec3(col_major, 6); - TensorMap<Tensor<int, 1, RowMajor>> vec4(row_major, 6); + TensorMap<Tensor<int, 1> > vec3(col_major, 6); + 
TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6); vec3 = vec1; vec4 = vec2; @@ -92,8 +92,8 @@ static void test_2d() int row_major[6]; memset(col_major, 0, 6*sizeof(int)); memset(row_major, 0, 6*sizeof(int)); - TensorMap<Tensor<int, 2>> mat3(row_major, 2, 3); - TensorMap<Tensor<int, 2, RowMajor>> mat4(col_major, 2, 3); + TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3); + TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3); mat3 = mat1; mat4 = mat2; @@ -152,8 +152,8 @@ static void test_3d() int row_major[2*3*7]; memset(col_major, 0, 2*3*7*sizeof(int)); memset(row_major, 0, 2*3*7*sizeof(int)); - TensorMap<Tensor<int, 3>> mat3(col_major, 2, 3, 7); - TensorMap<Tensor<int, 3, RowMajor>> mat4(row_major, 2, 3, 7); + TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7); + TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7); mat3 = mat1; mat4 = mat2; diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu new file mode 100644 index 000000000..f22b99de8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -0,0 +1,80 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +void test_cuda_conversion() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + Tensor<float, 1> floats(num_elem); + floats.setRandom(); + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float)); + + gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>(); + gpu_conv.device(gpu_device) = gpu_half.cast<float>(); + + Tensor<float, 1> initial(num_elem); + Tensor<float, 1> final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + + +void test_fallback_conversion() { + int num_elem = 101; + Tensor<float, 1> floats(num_elem); + floats.setRandom(); + + Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>(); + Eigen::Tensor<float, 1> conv = halfs.cast<float>(); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(floats(i), conv(i)); + } +} + + +void 
test_cxx11_tensor_cast_float16_cuda() +{ + CALL_SUBTEST(test_cuda_conversion()); + CALL_SUBTEST(test_fallback_conversion()); +} diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp index 729e43327..3c6d0d2ff 100644 --- a/unsupported/test/cxx11_tensor_casts.cpp +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -24,12 +24,12 @@ static void test_simple_cast() cplextensor.setRandom(); chartensor = ftensor.cast<char>(); - cplextensor = ftensor.cast<std::complex<float>>(); + cplextensor = ftensor.cast<std::complex<float> >(); for (int i = 0; i < 20; ++i) { for (int j = 0; j < 30; ++j) { VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j))); - VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float>>(ftensor(i,j))); + VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float> >(ftensor(i,j))); } } } diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cu index 035a093e6..6d1ef07f9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cpp +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -22,16 +22,16 @@ using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -static void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_cuda_contraction(int m_size, int k_size, int n_size) { - cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size)); - Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size)); - Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size)); - Tensor<float, 2, DataLayout> 
t_result_gpu(Eigen::array<int, 2>(m_size, n_size)); + Tensor<float, 2, DataLayout> t_left(m_size, k_size); + Tensor<float, 2, DataLayout> t_right(k_size, n_size); + Tensor<float, 2, DataLayout> t_result(m_size, n_size); + Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); Eigen::array<DimPair, 1> dims(DimPair(1, 0)); t_left.setRandom(); @@ -67,12 +67,16 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) t_result = t_left.contract(t_right, dims); cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); - for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] - << " vs " << t_result_gpu.data()[i] << endl; - assert(false); + for (size_t i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); } cudaFree((void*)d_t_left); @@ -80,41 +84,69 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size) cudaFree((void*)d_t_result); } - -void test_cxx11_tensor_cuda() -{ - cout<<"Calling contraction tests"<<std::endl; - CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128)); +template<int DataLayout> +void test_cuda_contraction_m() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128)); + test_cuda_contraction<ColMajor>(k, 128, 128); + test_cuda_contraction<RowMajor>(k, 128, 128); } +} + +template<int DataLayout> +void test_cuda_contraction_k() { for (int k = 32; k < 256; k++) { - 
CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k)); + test_cuda_contraction<ColMajor>(128, k, 128); + test_cuda_contraction<RowMajor>(128, k, 128); } +} + +template<int DataLayout> +void test_cuda_contraction_n() { for (int k = 32; k < 256; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128)); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128)); + test_cuda_contraction<ColMajor>(128, 128, k); + test_cuda_contraction<RowMajor>(128, 128, k); } +} - int m_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - int n_sizes[] = {31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025 }; - - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - for (int i = 0; i <15; i++) - for (int j = 0; j < 15; j++) +template<int DataLayout> +void test_cuda_contraction_sizes() { + int m_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257 , 511, + 512, 513, 1023, 1024, 1025}; + + int n_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; + + int k_sizes[] = { 31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; + + for (int i = 0; i < 15; i++) { + for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); - CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); + test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); } + } + } +} + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); + + CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); + CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); + + CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>()); + 
CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>()); + + CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>()); + CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>()); + + CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>()); + CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index b0d52c6cf..0e16308a2 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -29,7 +29,7 @@ static void test_evals() Tensor<float, 2, DataLayout> mat4(3,3); mat4.setZero(); - Eigen::array<DimPair, 1> dims3({{DimPair(0, 0)}}); + Eigen::array<DimPair, 1> dims3 = {{DimPair(0, 0)}}; typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator; Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice()); eval.evalTo(mat4.data()); @@ -49,7 +49,7 @@ static void test_evals() Tensor<float, 2, DataLayout> mat5(2,2); mat5.setZero(); - Eigen::array<DimPair, 1> dims4({{DimPair(1, 1)}}); + Eigen::array<DimPair, 1> dims4 = {{DimPair(1, 1)}}; typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2; Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice()); eval2.evalTo(mat5.data()); @@ -64,7 +64,7 @@ static void test_evals() Tensor<float, 2, DataLayout> mat6(2,2); mat6.setZero(); - Eigen::array<DimPair, 1> dims6({{DimPair(1, 0)}}); + Eigen::array<DimPair, 1> dims6 = {{DimPair(1, 0)}}; typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3; Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice()); eval3.evalTo(mat6.data()); @@ -89,7 +89,7 @@ static void test_scalar() Tensor<float, 1, DataLayout> scalar(1); scalar.setZero(); - Eigen::array<DimPair, 1> dims({{DimPair(0, 0)}}); + Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}}; typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator; Evaluator 
eval(vec1.contract(vec2, dims), DefaultDevice()); eval.evalTo(scalar.data()); @@ -113,7 +113,7 @@ static void test_multidims() Tensor<float, 3, DataLayout> mat3(2, 2, 2); mat3.setZero(); - Eigen::array<DimPair, 2> dims({{DimPair(1, 2), DimPair(2, 3)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(1, 2), DimPair(2, 3)}}; typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator; Evaluator eval(mat1.contract(mat2, dims), DefaultDevice()); eval.evalTo(mat3.data()); @@ -138,6 +138,26 @@ static void test_multidims() mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1)); VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) + mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1)); + + Tensor<float, 2, DataLayout> mat4(2, 2); + Tensor<float, 3, DataLayout> mat5(2, 2, 2); + + mat4.setRandom(); + mat5.setRandom(); + + Tensor<float, 1, DataLayout> mat6(2); + mat6.setZero(); + Eigen::array<DimPair, 2> dims2({{DimPair(0, 1), DimPair(1, 0)}}); + typedef TensorEvaluator<decltype(mat4.contract(mat5, dims2)), DefaultDevice> Evaluator2; + Evaluator2 eval2(mat4.contract(mat5, dims2), DefaultDevice()); + eval2.evalTo(mat6.data()); + EIGEN_STATIC_ASSERT(Evaluator2::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); + + VERIFY_IS_APPROX(mat6(0), mat4(0,0)*mat5(0,0,0) + mat4(1,0)*mat5(0,1,0) + + mat4(0,1)*mat5(1,0,0) + mat4(1,1)*mat5(1,1,0)); + VERIFY_IS_APPROX(mat6(1), mat4(0,0)*mat5(0,0,1) + mat4(1,0)*mat5(0,1,1) + + mat4(0,1)*mat5(1,0,1) + mat4(1,1)*mat5(1,1,1)); } template<int DataLayout> @@ -147,7 +167,7 @@ static void test_holes() { t1.setRandom(); t2.setRandom(); - Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(3, 4)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(3, 4)}}; Tensor<float, 5, DataLayout> result = t1.contract(t2, dims); VERIFY_IS_EQUAL(result.dimension(0), 5); VERIFY_IS_EQUAL(result.dimension(1), 7); @@ -182,7 +202,7 @@ static void 
test_full_redux() t1.setRandom(); t2.setRandom(); - Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}}; Tensor<float, 1, DataLayout> result = t1.contract(t2, dims); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0) @@ -212,7 +232,7 @@ static void test_contraction_of_contraction() t3.setRandom(); t4.setRandom(); - Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; auto contract1 = t1.contract(t2, dims); auto diff = t3 - contract1; auto contract2 = t1.contract(t4, dims); @@ -243,7 +263,7 @@ static void test_expr() Tensor<float, 2, DataLayout> mat3(2,2); - Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; mat3 = mat1.contract(mat2, dims); VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0)); @@ -263,7 +283,7 @@ static void test_out_of_order_contraction() Tensor<float, 2, DataLayout> mat3(2, 2); - Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(0, 2)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(0, 2)}}; mat3 = mat1.contract(mat2, dims); VERIFY_IS_APPROX(mat3(0, 0), @@ -279,7 +299,7 @@ static void test_out_of_order_contraction() mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); - Eigen::array<DimPair, 2> dims2({{DimPair(0, 2), DimPair(2, 0)}}); + Eigen::array<DimPair, 2> dims2 = {{DimPair(0, 2), DimPair(2, 0)}}; mat3 = mat1.contract(mat2, dims2); VERIFY_IS_APPROX(mat3(0, 0), @@ -311,8 +331,8 @@ static void test_consistency() Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5); // contract on dimensions of size 4 and 3 - Eigen::array<DimPair, 2> dims1({{DimPair(0, 4), DimPair(1, 0)}}); - Eigen::array<DimPair, 2> dims2({{DimPair(4, 0), DimPair(0, 1)}}); + Eigen::array<DimPair, 2> dims1 = {{DimPair(0, 4), 
DimPair(1, 0)}}; + Eigen::array<DimPair, 2> dims2 = {{DimPair(4, 0), DimPair(0, 1)}}; mat3 = mat1.contract(mat2, dims1); mat4 = mat2.contract(mat1, dims2); @@ -354,7 +374,7 @@ static void test_large_contraction() Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); // this contraction should be equivalent to a single matrix multiplication - Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}}; // compute results by separate methods t_result = t_left.contract(t_right, dims); @@ -399,10 +419,10 @@ static void test_tensor_vector() { Tensor<float, 3, DataLayout> t_left(7, 13, 17); Tensor<float, 2, DataLayout> t_right(1, 7); - + t_left.setRandom(); t_right.setRandom(); - + typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair; Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}}; Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01); @@ -434,7 +454,7 @@ static void test_small_blocking_factors() Eigen::setCpuCacheSizes(896, 1920, 2944); // this contraction should be equivalent to a single matrix multiplication - Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}}; Tensor<float, 5, DataLayout> t_result; t_result = t_left.contract(t_right, dims); diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cu index 5ff082a3a..134359611 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -7,8 +7,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// TODO(mdevin): Free the cuda memory. 
- #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_cuda @@ -63,6 +61,10 @@ void test_cuda_elementwise_small() { out(Eigen::array<int, 1>(i)), in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i))); } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); } void test_cuda_elementwise() @@ -113,6 +115,48 @@ void test_cuda_elementwise() } } } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); +} + +void test_cuda_props() { + Tensor<float, 1> in1(200); + Tensor<bool, 1> out(200); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(bool); + + float* d_in1; + bool* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( + d_in1, 200); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out( + d_out, 200); + + gpu_out.device(gpu_device) = (gpu_in1.isnan)(); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 200; ++i) { + VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); + } + + cudaFree(d_in1); + cudaFree(d_out); } void test_cuda_reduction() @@ -131,8 +175,7 @@ void test_cuda_reduction() cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); @@ -159,10 +202,13 @@ void test_cuda_reduction() VERIFY_IS_APPROX(out(i,j), expected); } } + + cudaFree(d_in1); + cudaFree(d_out); } 
template<int DataLayout> -static void test_cuda_contraction() +void test_cuda_contraction() { // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on @@ -189,8 +235,7 @@ static void test_cuda_contraction() cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); @@ -214,14 +259,18 @@ static void test_cuda_contraction() for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } } + + cudaFree(d_t_left); + cudaFree(d_t_right); + cudaFree(d_t_result); } template<int DataLayout> -static void test_cuda_convolution_1d() +void test_cuda_convolution_1d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 1, DataLayout> kernel(4); @@ -243,8 +292,7 @@ static void test_cuda_convolution_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); @@ -269,9 +317,13 @@ static void test_cuda_convolution_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void 
test_cuda_convolution_inner_dim_col_major_1d() +void test_cuda_convolution_inner_dim_col_major_1d() { Tensor<float, 4, ColMajor> input(74,9,11,7); Tensor<float, 1, ColMajor> kernel(4); @@ -293,8 +345,7 @@ static void test_cuda_convolution_inner_dim_col_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); @@ -319,9 +370,13 @@ static void test_cuda_convolution_inner_dim_col_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } -static void test_cuda_convolution_inner_dim_row_major_1d() +void test_cuda_convolution_inner_dim_row_major_1d() { Tensor<float, 4, RowMajor> input(7,9,11,74); Tensor<float, 1, RowMajor> kernel(4); @@ -343,8 +398,7 @@ static void test_cuda_convolution_inner_dim_row_major_1d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); @@ -369,10 +423,14 @@ static void test_cuda_convolution_inner_dim_row_major_1d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template<int DataLayout> -static void test_cuda_convolution_2d() +void test_cuda_convolution_2d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 2, DataLayout> kernel(3,4); @@ -394,8 +452,7 @@ static void test_cuda_convolution_2d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); 
- cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); @@ -430,10 +487,14 @@ static void test_cuda_convolution_2d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); } template<int DataLayout> -static void test_cuda_convolution_3d() +void test_cuda_convolution_3d() { Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17)); Tensor<float, 3, DataLayout> kernel(3,4,2); @@ -455,8 +516,7 @@ static void test_cuda_convolution_3d() cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); - cudaStream_t stream; - assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); @@ -505,21 +565,507 @@ static void test_cuda_convolution_3d() } } } + + cudaFree(d_input); + cudaFree(d_kernel); + cudaFree(d_out); +} + + +template <typename Scalar> +void test_cuda_lgamma(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.lgamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + 
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template <typename Scalar> +void test_cuda_digamma() +{ + Tensor<Scalar, 1> in(7); + Tensor<Scalar, 1> out(7); + Tensor<Scalar, 1> expected_out(7); + out.setZero(); + + in(0) = Scalar(1); + in(1) = Scalar(1.5); + in(2) = Scalar(4); + in(3) = Scalar(-10.5); + in(4) = Scalar(10000.5); + in(5) = Scalar(0); + in(6) = Scalar(-1); + + expected_out(0) = Scalar(-0.5772156649015329); + expected_out(1) = Scalar(0.03648997397857645); + expected_out(2) = Scalar(1.2561176684318); + expected_out(3) = Scalar(2.398239129535781); + expected_out(4) = Scalar(9.210340372392849); + expected_out(5) = std::numeric_limits<Scalar>::infinity(); + expected_out(6) = std::numeric_limits<Scalar>::infinity(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in.digamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 5; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + for (int i = 5; i < 7; ++i) { + VERIFY_IS_EQUAL(out(i), expected_out(i)); + } +} + +template <typename Scalar> +void test_cuda_zeta() +{ + Tensor<Scalar, 1> in_x(6); + Tensor<Scalar, 1> in_q(6); + Tensor<Scalar, 1> out(6); + Tensor<Scalar, 1> expected_out(6); + out.setZero(); + + in_x(0) = Scalar(1); + in_x(1) = 
Scalar(1.5); + in_x(2) = Scalar(4); + in_x(3) = Scalar(-10.5); + in_x(4) = Scalar(10000.5); + in_x(5) = Scalar(3); + + in_q(0) = Scalar(1.2345); + in_q(1) = Scalar(2); + in_q(2) = Scalar(1.5); + in_q(3) = Scalar(3); + in_q(4) = Scalar(1.0001); + in_q(5) = Scalar(-2.5); + + expected_out(0) = std::numeric_limits<Scalar>::infinity(); + expected_out(1) = Scalar(1.61237534869); + expected_out(2) = Scalar(0.234848505667); + expected_out(3) = Scalar(1.03086757337e-5); + expected_out(4) = Scalar(0.367879440865); + expected_out(5) = Scalar(0.054102025820864097); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_q; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_q), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6); + + gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + VERIFY_IS_EQUAL(out(0), expected_out(0)); + VERIFY_IS_APPROX_OR_LESS_THAN(out(3), expected_out(3)); + + for (int i = 1; i < 6; ++i) { + if (i != 3) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } +} + +template <typename Scalar> +void test_cuda_polygamma() +{ + Tensor<Scalar, 1> in_x(7); + Tensor<Scalar, 1> in_n(7); + Tensor<Scalar, 1> out(7); + Tensor<Scalar, 1> expected_out(7); + out.setZero(); + + in_n(0) = Scalar(1); + in_n(1) = Scalar(1); + in_n(2) = Scalar(1); + in_n(3) = Scalar(17); + in_n(4) = Scalar(31); + in_n(5) = Scalar(28); + 
in_n(6) = Scalar(8); + + in_x(0) = Scalar(2); + in_x(1) = Scalar(3); + in_x(2) = Scalar(25.5); + in_x(3) = Scalar(4.7); + in_x(4) = Scalar(11.8); + in_x(5) = Scalar(17.7); + in_x(6) = Scalar(30.2); + + expected_out(0) = Scalar(0.644934066848); + expected_out(1) = Scalar(0.394934066848); + expected_out(2) = Scalar(0.0399946696496); + expected_out(3) = Scalar(293.334565435); + expected_out(4) = Scalar(0.445487887616); + expected_out(5) = Scalar(-2.47810300902e-07); + expected_out(6) = Scalar(-8.29668781082e-09); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_n; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_n), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 7; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } +} + +template <typename Scalar> +void test_cuda_igamma() +{ + Tensor<Scalar, 2> a(6, 6); + Tensor<Scalar, 2> x(6, 6); + Tensor<Scalar, 2> out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = 
std::numeric_limits<Scalar>::quiet_NaN(); + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + + + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); + } + } + } +} + +template <typename Scalar> +void test_cuda_igammac() +{ + Tensor<Scalar, 2> a(6, 6); + Tensor<Scalar, 2> x(6, 6); + Tensor<Scalar, 2> out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) 
{ + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + cudaMalloc((void**)(&d_a), bytes); + cudaMalloc((void**)(&d_x), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); + } + } + } +} + +template <typename Scalar> +void test_cuda_erf(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + 
cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erf(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template <typename Scalar> +void test_cuda_erfc(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erfc(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); + } + } + + cudaFree(d_in); + cudaFree(d_out); } void test_cxx11_tensor_cuda() { - CALL_SUBTEST(test_cuda_elementwise_small()); - CALL_SUBTEST(test_cuda_elementwise()); - CALL_SUBTEST(test_cuda_reduction()); - 
CALL_SUBTEST(test_cuda_contraction<ColMajor>()); - CALL_SUBTEST(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>()); - CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>()); - CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>()); + CALL_SUBTEST_1(test_cuda_elementwise_small()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_1(test_cuda_props()); + CALL_SUBTEST_1(test_cuda_reduction()); + CALL_SUBTEST_2(test_cuda_contraction<ColMajor>()); + CALL_SUBTEST_2(test_cuda_contraction<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>()); + CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>()); + +#if __cplusplus > 199711L + // std::erf, std::erfc, and so on where only added in c++11. We use them + // as a golden reference to validate the results produced by Eigen. Therefore + // we can only run these tests if we use a c++11 compiler. 
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f)); + + CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01)); + CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001)); + + CALL_SUBTEST_4(test_cuda_erf<float>(1.0f)); + CALL_SUBTEST_4(test_cuda_erf<float>(100.0f)); + CALL_SUBTEST_4(test_cuda_erf<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_erf<float>(0.001f)); + + CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f)); + // CALL_SUBTEST(test_cuda_erfc<float>(100.0f)); + CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f)); + CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f)); + + CALL_SUBTEST_4(test_cuda_erf<double>(1.0)); + CALL_SUBTEST_4(test_cuda_erf<double>(100.0)); + CALL_SUBTEST_4(test_cuda_erf<double>(0.01)); + CALL_SUBTEST_4(test_cuda_erf<double>(0.001)); + + CALL_SUBTEST_4(test_cuda_erfc<double>(1.0)); + // CALL_SUBTEST(test_cuda_erfc<double>(100.0)); + CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST_4(test_cuda_erfc<double>(0.01)); + CALL_SUBTEST_4(test_cuda_erfc<double>(0.001)); + + CALL_SUBTEST_5(test_cuda_digamma<float>()); + CALL_SUBTEST_5(test_cuda_digamma<double>()); + + CALL_SUBTEST_5(test_cuda_polygamma<float>()); + CALL_SUBTEST_5(test_cuda_polygamma<double>()); + + CALL_SUBTEST_5(test_cuda_zeta<float>()); + CALL_SUBTEST_5(test_cuda_zeta<double>()); + + CALL_SUBTEST_5(test_cuda_igamma<float>()); + CALL_SUBTEST_5(test_cuda_igammac<float>()); + + CALL_SUBTEST_5(test_cuda_igamma<double>()); + CALL_SUBTEST_5(test_cuda_igammac<double>()); +#endif } diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp index 7e33c9580..8baa477cc 100644 --- 
a/unsupported/test/cxx11_tensor_custom_op.cpp +++ b/unsupported/test/cxx11_tensor_custom_op.cpp @@ -25,7 +25,9 @@ struct InsertZeros { template <typename Output, typename Device> void eval(const Tensor<float, 2>& input, Output& output, const Device& device) const { - array<DenseIndex, 2> strides{{2, 2}}; + array<DenseIndex, 2> strides; + strides[0] = 2; + strides[1] = 2; output.stride(strides).device(device) = input; Eigen::DSizes<DenseIndex, 2> offsets(1,1); @@ -70,7 +72,8 @@ struct BatchMatMul { Output& output, const Device& device) const { typedef Tensor<float, 3>::DimensionPair DimPair; - array<DimPair, 1> dims({{DimPair(1, 0)}}); + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); for (int i = 0; i < output.dimension(2); ++i) { output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims); } @@ -88,9 +91,10 @@ static void test_custom_binary_op() Tensor<float, 3> result = tensor1.customOp(tensor2, BatchMatMul()); for (int i = 0; i < 5; ++i) { typedef Tensor<float, 3>::DimensionPair DimPair; - array<DimPair, 1> dims({{DimPair(1, 0)}}); + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); Tensor<float, 2> reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims); - TensorRef<Tensor<float, 2>> val = result.chip<2>(i); + TensorRef<Tensor<float, 2> > val = result.chip<2>(i); for (int j = 0; j < 2; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(val(j, k), reference(j, k)); diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cu index ed5dd7505..cbe9e6449 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cu @@ -109,19 +109,19 @@ struct GPUContext { // The actual expression to evaluate template <typename Context> -static void test_contextual_eval(Context* context) +void test_contextual_eval(Context* context) { context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } 
template <typename Context> -static void test_forced_contextual_eval(Context* context) +void test_forced_contextual_eval(Context* context) { context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } template <typename Context> -static void test_compound_assignment(Context* context) +void test_compound_assignment(Context* context) { context->out().device(context->device()) = context->in1().constant(2.718f); context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; @@ -129,7 +129,7 @@ static void test_compound_assignment(Context* context) template <typename Context> -static void test_contraction(Context* context) +void test_contraction(Context* context) { Eigen::array<std::pair<int, int>, 2> dims; dims[0] = std::make_pair(1, 1); @@ -145,7 +145,7 @@ static void test_contraction(Context* context) template <typename Context> -static void test_1d_convolution(Context* context) +void test_1d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(40,49,70); @@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context) } template <typename Context> -static void test_2d_convolution(Context* context) +void test_2d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(40,49,69); @@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context) } template <typename Context> -static void test_3d_convolution(Context* context) +void test_3d_convolution(Context* context) { Eigen::DSizes<int, 3> indices(0,0,0); Eigen::DSizes<int, 3> sizes(39,49,69); @@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context) } -static void test_cpu() { +void test_cpu() { Eigen::Tensor<float, 3> in1(40,50,70); Eigen::Tensor<float, 3> in2(40,50,70); Eigen::Tensor<float, 3> out(40,50,70); @@ -267,7 +267,7 @@ static void test_cpu() { } } -static void test_gpu() { +void test_gpu() { 
Eigen::Tensor<float, 3> in1(40,50,70); Eigen::Tensor<float, 3> in2(40,50,70); Eigen::Tensor<float, 3> out(40,50,70); @@ -383,6 +383,6 @@ static void test_gpu() { void test_cxx11_tensor_device() { - CALL_SUBTEST(test_cpu()); - CALL_SUBTEST(test_gpu()); + CALL_SUBTEST_1(test_cpu()); + CALL_SUBTEST_2(test_gpu()); } diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp new file mode 100644 index 000000000..9130fff35 --- /dev/null +++ b/unsupported/test/cxx11_tensor_empty.cpp @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + + +static void test_empty_tensor() +{ + Tensor<float, 2> source; + Tensor<float, 2> tgt1 = source; + Tensor<float, 2> tgt2(source); + Tensor<float, 2> tgt3; + tgt3 = tgt1; + tgt3 = tgt2; +} + +static void test_empty_fixed_size_tensor() +{ + TensorFixedSize<float, Sizes<0>> source; + TensorFixedSize<float, Sizes<0>> tgt1 = source; + TensorFixedSize<float, Sizes<0>> tgt2(source); + TensorFixedSize<float, Sizes<0>> tgt3; + tgt3 = tgt1; + tgt3 = tgt2; +} + + +void test_cxx11_tensor_empty() +{ + CALL_SUBTEST(test_empty_tensor()); + CALL_SUBTEST(test_empty_fixed_size_tensor()); +} diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index 0f6e09106..89874349f 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp @@ -14,7 +14,7 @@ using Eigen::Tensor; template <int DataLayout> static void test_fft_2D_golden() { - Tensor<float, 2, DataLayout, long> input(2, 3); + Tensor<float, 2, DataLayout> input(2, 3); input(0, 0) = 1; input(0, 1) = 2; input(0, 2) = 
3; @@ -22,11 +22,11 @@ static void test_fft_2D_golden() { input(1, 1) = 5; input(1, 2) = 6; - array<int, 2> fft; + array<ptrdiff_t, 2> fft; fft[0] = 0; fft[1] = 1; - Tensor<std::complex<float>, 2, DataLayout, long> output = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft); + Tensor<std::complex<float>, 2, DataLayout> output = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft); std::complex<float> output_golden[6]; // in ColMajor order output_golden[0] = std::complex<float>(21, 0); @@ -57,24 +57,24 @@ static void test_fft_2D_golden() { } static void test_fft_complex_input_golden() { - Tensor<std::complex<float>, 1, ColMajor, long> input(5); + Tensor<std::complex<float>, 1, ColMajor> input(5); input(0) = std::complex<float>(1, 1); input(1) = std::complex<float>(2, 2); input(2) = std::complex<float>(3, 3); input(3) = std::complex<float>(4, 4); input(4) = std::complex<float>(5, 5); - array<int, 1> fft; + array<ptrdiff_t, 1> fft; fft[0] = 0; - Tensor<std::complex<float>, 1, ColMajor, long> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft); - Tensor<std::complex<float>, 1, ColMajor, long> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft); + Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft); + Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft); - Tensor<float, 1, ColMajor, long> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft); - Tensor<float, 1, ColMajor, long> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft); + Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft); + Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft); - Tensor<float, 1, ColMajor, long> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft); - Tensor<float, 1, ColMajor, long> reverse_output_imag_part = 
input.fft<ImagPart, FFT_REVERSE>(fft); + Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft); + Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft); VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0)); VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0)); @@ -114,24 +114,24 @@ static void test_fft_complex_input_golden() { } static void test_fft_real_input_golden() { - Tensor<float, 1, ColMajor, long> input(5); + Tensor<float, 1, ColMajor> input(5); input(0) = 1.0; input(1) = 2.0; input(2) = 3.0; input(3) = 4.0; input(4) = 5.0; - array<int, 1> fft; + array<ptrdiff_t, 1> fft; fft[0] = 0; - Tensor<std::complex<float>, 1, ColMajor, long> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft); - Tensor<std::complex<float>, 1, ColMajor, long> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft); + Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft); + Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft); - Tensor<float, 1, ColMajor, long> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft); - Tensor<float, 1, ColMajor, long> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft); + Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft); + Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft); - Tensor<float, 1, ColMajor, long> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft); - Tensor<float, 1, ColMajor, long> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft); + Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft); + Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft); 
VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0)); VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0)); @@ -178,21 +178,21 @@ static void test_fft_real_input_golden() { template <int DataLayout, typename RealScalar, bool isComplexInput, int FFTResultType, int FFTDirection, int TensorRank> static void test_fft_real_input_energy() { - Eigen::DSizes<long, TensorRank> dimensions; - int total_size = 1; + Eigen::DSizes<ptrdiff_t, TensorRank> dimensions; + ptrdiff_t total_size = 1; for (int i = 0; i < TensorRank; ++i) { dimensions[i] = rand() % 20 + 1; total_size *= dimensions[i]; } - const DSizes<long, TensorRank> arr = dimensions; + const DSizes<ptrdiff_t, TensorRank> arr = dimensions; typedef typename internal::conditional<isComplexInput == true, std::complex<RealScalar>, RealScalar>::type InputScalar; - Tensor<InputScalar, TensorRank, DataLayout, long> input; + Tensor<InputScalar, TensorRank, DataLayout> input; input.resize(arr); input.setRandom(); - array<int, TensorRank> fft; + array<ptrdiff_t, TensorRank> fft; for (int i = 0; i < TensorRank; ++i) { fft[i] = i; } diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp new file mode 100644 index 000000000..c946007b8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_notification.cpp @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_USE_THREADS + +#include <stdlib.h> +#include "main.h" +#include <Eigen/CXX11/Tensor> + +#if EIGEN_OS_WIN || EIGEN_OS_WIN64 +#include <windows.h> +void sleep(int seconds) { + Sleep(seconds*1000); +} +#else +#include <unistd.h> +#endif + + +namespace { + +void WaitAndAdd(Eigen::Notification* n, int* counter) { + n->Wait(); + *counter = *counter + 1; +} + +} // namespace + +static void test_notification_single() +{ + ThreadPool thread_pool(1); + + int counter = 0; + Eigen::Notification n; + std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); + thread_pool.Schedule(func); + sleep(1); + + // The thread should be waiting for the notification. + VERIFY_IS_EQUAL(counter, 0); + + // Unblock the thread + n.Notify(); + + sleep(1); + + // Verify the counter has been incremented + VERIFY_IS_EQUAL(counter, 1); +} + +// Like test_notification_single() but enqueues multiple threads to +// validate that all threads get notified by Notify(). +static void test_notification_multiple() +{ + ThreadPool thread_pool(1); + + int counter = 0; + Eigen::Notification n; + std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + thread_pool.Schedule(func); + sleep(1); + VERIFY_IS_EQUAL(counter, 0); + n.Notify(); + sleep(1); + VERIFY_IS_EQUAL(counter, 4); +} + +void test_cxx11_tensor_notification() +{ + CALL_SUBTEST(test_notification_single()); + CALL_SUBTEST(test_notification_multiple()); +} diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp index 8ad04f699..e9d1b2d3c 100644 --- a/unsupported/test/cxx11_tensor_of_complex.cpp +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -48,6 +48,25 @@ static void test_abs() } +static void test_conjugate() +{ + Tensor<std::complex<float>, 1> data1(3); + Tensor<std::complex<double>, 1> data2(3); + Tensor<int, 1> data3(3); + data1.setRandom(); + data2.setRandom(); + 
data3.setRandom(); + + Tensor<std::complex<float>, 1> conj1 = data1.conjugate(); + Tensor<std::complex<double>, 1> conj2 = data2.conjugate(); + Tensor<int, 1> conj3 = data3.conjugate(); + for (int i = 0; i < 3; ++i) { + VERIFY_IS_APPROX(conj1(i), std::conj(data1(i))); + VERIFY_IS_APPROX(conj2(i), std::conj(data2(i))); + VERIFY_IS_APPROX(conj3(i), data3(i)); + } +} + static void test_contractions() { Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31); @@ -64,7 +83,9 @@ static void test_contractions() // This contraction should be equivalent to a regular matrix multiplication typedef Tensor<float, 1>::DimensionPair DimPair; - Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + Eigen::array<DimPair, 2> dims; + dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); t_result = t_left.contract(t_right, dims); m_result = m_left * m_right; for (int i = 0; i < t_result.dimensions().TotalSize(); i++) { @@ -77,5 +98,6 @@ void test_cxx11_tensor_of_complex() { CALL_SUBTEST(test_additions()); CALL_SUBTEST(test_abs()); + CALL_SUBTEST(test_conjugate()); CALL_SUBTEST(test_contractions()); } diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu new file mode 100644 index 000000000..cb917bb37 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -0,0 +1,256 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +#ifdef EIGEN_HAS_CUDA_FP16 + +void test_cuda_conversion() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random(); + gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>(); + gpu_conv.device(gpu_device) = gpu_half.cast<float>(); + + Tensor<float, 1> initial(num_elem); + Tensor<float, 1> final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + + +void test_cuda_unary() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + 
d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>(); + + Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking unary " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +void test_cuda_elementwise() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1( + d_float1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2( + d_float2, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * 
gpu_float1; + gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>(); + + Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +void test_cuda_contractions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int rows = 23; + int cols = 23; + int num_elem = rows*cols; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( + d_float1, rows, cols); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( + d_float2, rows, cols); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_half( + d_res_half, rows, cols); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_float( + d_res_float, rows, cols); + + gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float1.constant(0.5f); + + typedef Tensor<float, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims); + 
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims).cast<float>(); + + Tensor<float, 2> half_prec(rows, cols); + Tensor<float, 2> full_prec(rows, cols); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + std::cout << "Checking contract " << i << " " << j << std::endl; + VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + } + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +void test_cuda_reductions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int size = 13; + int num_elem = size*size; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( + d_float1, size, size); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( + d_float2, size, size); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, size); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, size); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + + Eigen::array<int, 1> redux_dim = {{0}}; + gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim).cast<float>(); + + Tensor<float, 1> half_prec(size); + Tensor<float, 1> full_prec(size); 
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < size; ++i) { + std::cout << "Checking redux " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +#endif + + +void test_cxx11_tensor_of_float16_cuda() +{ +#ifdef EIGEN_HAS_CUDA_FP16 + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice device(&stream); + if (device.majorDeviceVersion() > 5 || + (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { + std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl; + + CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_unary()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_2(test_cuda_contractions()); + CALL_SUBTEST_3(test_cuda_reductions()); + } + else { + std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl; + } +#else + std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; +#endif +} diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp index 389896c54..0f3dc5787 100644 --- a/unsupported/test/cxx11_tensor_random.cpp +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -48,7 +48,7 @@ struct MyGenerator { } // Same as above but generates several numbers at a time. 
- typename internal::packet_traits<int>::type packetOp( + internal::packet_traits<int>::type packetOp( Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { const int packetSize = internal::packet_traits<int>::size; EIGEN_ALIGN_MAX int values[packetSize]; diff --git a/unsupported/test/cxx11_tensor_random_cuda.cpp b/unsupported/test/cxx11_tensor_random_cuda.cu index 5d091de15..5d091de15 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cpp +++ b/unsupported/test/cxx11_tensor_random_cuda.cu diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 0ec316991..6a128901a 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -9,6 +9,7 @@ #include "main.h" #include <limits> +#include <numeric> #include <Eigen/CXX11/Tensor> using Eigen::Tensor; diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 9e06eb126..cad0c08e0 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cpp +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -48,9 +48,12 @@ static void test_full_reductions() { // Check that the CPU and GPU reductions return the same result. 
VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); } void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST(test_full_reductions<ColMajor>()); - CALL_SUBTEST(test_full_reductions<RowMajor>()); + CALL_SUBTEST_1(test_full_reductions<ColMajor>()); + CALL_SUBTEST_2(test_full_reductions<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp index f96c21fa3..b35b8d29e 100644 --- a/unsupported/test/cxx11_tensor_reverse.cpp +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -114,10 +114,18 @@ static void test_expr_reverse(bool LValue) Tensor<float, 4, DataLayout> result(2,3,5,7); - array<ptrdiff_t, 4> src_slice_dim{{2,3,1,7}}; - array<ptrdiff_t, 4> src_slice_start{{0,0,0,0}}; - array<ptrdiff_t, 4> dst_slice_dim{{2,3,1,7}}; - array<ptrdiff_t, 4> dst_slice_start{{0,0,0,0}}; + array<ptrdiff_t, 4> src_slice_dim; + src_slice_dim[0] = 2; + src_slice_dim[1] = 3; + src_slice_dim[2] = 1; + src_slice_dim[3] = 7; + array<ptrdiff_t, 4> src_slice_start; + src_slice_start[0] = 0; + src_slice_start[1] = 0; + src_slice_start[2] = 0; + src_slice_start[3] = 0; + array<ptrdiff_t, 4> dst_slice_dim = src_slice_dim; + array<ptrdiff_t, 4> dst_slice_start = src_slice_start; for (int i = 0; i < 5; ++i) { if (LValue) { diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp new file mode 100644 index 000000000..2c26151ab --- /dev/null +++ b/unsupported/test/cxx11_tensor_roundings.cpp @@ -0,0 +1,62 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + + +static void test_float_rounding() +{ + Tensor<float, 2> ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor<float, 2> result = ftensor.round(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::round(ftensor(i,j))); + } + } +} + +static void test_float_flooring() +{ + Tensor<float, 2> ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor<float, 2> result = ftensor.floor(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::floor(ftensor(i,j))); + } + } +} + +static void test_float_ceiling() +{ + Tensor<float, 2> ftensor(20,30); + ftensor = ftensor.random() * 100.f; + + Tensor<float, 2> result = ftensor.ceil(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(result(i,j), numext::ceil(ftensor(i,j))); + } + } +} + +void test_cxx11_tensor_roundings() +{ + CALL_SUBTEST(test_float_rounding()); + CALL_SUBTEST(test_float_ceiling()); + CALL_SUBTEST(test_float_flooring()); +} diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index 98671a986..a03f75cfe 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -18,7 +18,7 @@ static void test_comparison_sugar() { #define TEST_TENSOR_EQUAL(e1, e2) \ b = ((e1) == (e2)).all(); \ - VERIFY(b(0)) + VERIFY(b()) #define TEST_OP(op) TEST_TENSOR_EQUAL(t op 0, t op t.constant(0)) @@ -32,7 +32,30 @@ static void test_comparison_sugar() { #undef TEST_TENSOR_EQUAL } + +static void test_scalar_sugar() { + Tensor<float, 3> A(6, 7, 5); + Tensor<float, 3> B(6, 7, 5); + A.setRandom(); + B.setRandom(); + + const float alpha = 0.43f; + const float beta = 0.21f; + + Tensor<float, 3> R = A * A.constant(alpha) + B * B.constant(beta); + Tensor<float, 3> S = A * alpha + B * beta; + + // TODO: add enough syntactic sugar to support this + 
// Tensor<float, 3> T = alpha * A + beta * B; + + for (int i = 0; i < 6*7*5; ++i) { + VERIFY_IS_APPROX(R(i), S(i)); + } +} + + void test_cxx11_tensor_sugar() { CALL_SUBTEST(test_comparison_sugar()); + CALL_SUBTEST(test_scalar_sugar()); } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e28cf55e2..e46197464 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -17,7 +17,7 @@ using Eigen::Tensor; -static void test_multithread_elementwise() +void test_multithread_elementwise() { Tensor<float, 3> in1(2,3,7); Tensor<float, 3> in2(2,3,7); @@ -40,7 +40,7 @@ static void test_multithread_elementwise() } -static void test_multithread_compound_assignment() +void test_multithread_compound_assignment() { Tensor<float, 3> in1(2,3,7); Tensor<float, 3> in2(2,3,7); @@ -64,7 +64,7 @@ static void test_multithread_compound_assignment() } template<int DataLayout> -static void test_multithread_contraction() +void test_multithread_contraction() { Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31); Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10); @@ -91,15 +91,20 @@ static void test_multithread_contraction() for (ptrdiff_t i = 0; i < t_result.size(); i++) { VERIFY(&t_result.data()[i] != &m_result.data()[i]); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { - std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; - assert(false); + if (fabs(t_result(i) - m_result(i)) < 1e-4) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) { + continue; } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << m_result(i) << std::endl; + assert(false); } } template<int DataLayout> -static void test_contraction_corner_cases() +void test_contraction_corner_cases() { Tensor<float, 2, DataLayout> t_left(32, 500); Tensor<float, 2, DataLayout> t_right(32, 
28*28); @@ -186,7 +191,7 @@ static void test_contraction_corner_cases() } template<int DataLayout> -static void test_multithread_contraction_agrees_with_singlethread() { +void test_multithread_contraction_agrees_with_singlethread() { int contract_size = internal::random<int>(1, 5000); Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80), @@ -229,7 +234,7 @@ static void test_multithread_contraction_agrees_with_singlethread() { template<int DataLayout> -static void test_multithreaded_reductions() { +void test_multithreaded_reductions() { const int num_threads = internal::random<int>(3, 11); ThreadPool thread_pool(num_threads); Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads); @@ -239,19 +244,19 @@ static void test_multithreaded_reductions() { Tensor<float, 2, DataLayout> t1(num_rows, num_cols); t1.setRandom(); - Tensor<float, 1, DataLayout> full_redux(1); + Tensor<float, 0, DataLayout> full_redux; full_redux = t1.sum(); - Tensor<float, 1, DataLayout> full_redux_tp(1); + Tensor<float, 0, DataLayout> full_redux_tp; full_redux_tp.device(thread_pool_device) = t1.sum(); // Check that the single threaded and the multi threaded reductions return // the same result. 
- VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0)); + VERIFY_IS_APPROX(full_redux(), full_redux_tp()); } -static void test_memcpy() { +void test_memcpy() { for (int i = 0; i < 5; ++i) { const int num_threads = internal::random<int>(3, 11); @@ -270,7 +275,7 @@ static void test_memcpy() { } -static void test_multithread_random() +void test_multithread_random() { Eigen::ThreadPool tp(2); Eigen::ThreadPoolDevice device(&tp, 2); @@ -278,26 +283,52 @@ static void test_multithread_random() t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>(); } +template<int DataLayout> +void test_multithread_shuffle() +{ + Tensor<float, 4, DataLayout> tensor(17,5,7,11); + tensor.setRandom(); + + const int num_threads = internal::random<int>(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 4, DataLayout> shuffle(7,5,11,17); + array<ptrdiff_t, 4> shuffles = {{2,1,3,0}}; + shuffle.device(device) = tensor.shuffle(shuffles); + + for (int i = 0; i < 17; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i)); + } + } + } + } +} + void test_cxx11_tensor_thread_pool() { - CALL_SUBTEST(test_multithread_elementwise()); - CALL_SUBTEST(test_multithread_compound_assignment()); + CALL_SUBTEST_1(test_multithread_elementwise()); + CALL_SUBTEST_1(test_multithread_compound_assignment()); - CALL_SUBTEST(test_multithread_contraction<ColMajor>()); - CALL_SUBTEST(test_multithread_contraction<RowMajor>()); + CALL_SUBTEST_2(test_multithread_contraction<ColMajor>()); + CALL_SUBTEST_2(test_multithread_contraction<RowMajor>()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); + CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); + 
CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); // Exercise various cases that have been problematic in the past. - CALL_SUBTEST(test_contraction_corner_cases<ColMajor>()); - CALL_SUBTEST(test_contraction_corner_cases<RowMajor>()); - - CALL_SUBTEST(test_multithreaded_reductions<ColMajor>()); - CALL_SUBTEST(test_multithreaded_reductions<RowMajor>()); + CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>()); + CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>()); - CALL_SUBTEST(test_memcpy()); + CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>()); + CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>()); - CALL_SUBTEST(test_multithread_random()); + CALL_SUBTEST_6(test_memcpy()); + CALL_SUBTEST_6(test_multithread_random()); + CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>()); + CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp index ee3767e58..d2a1e8673 100644 --- a/unsupported/test/cxx11_tensor_uint128.cpp +++ b/unsupported/test/cxx11_tensor_uint128.cpp @@ -11,10 +11,20 @@ #include <Eigen/CXX11/Tensor> + +#if EIGEN_COMP_MSVC +#define EIGEN_NO_INT128 +#else +typedef __uint128_t uint128_t; +#endif + +// Only run the test on compilers that support 128bit integers natively +#ifndef EIGEN_NO_INT128 + using Eigen::internal::TensorUInt128; using Eigen::internal::static_val; -void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, __uint128_t expected) { +void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, uint128_t expected) { bool matchl = actual.lower() == static_cast<uint64_t>(expected); bool matchh = actual.upper() == static_cast<uint64_t>(expected >> 64); if (!matchl || !matchh) { @@ -32,13 +42,13 @@ void test_add() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<uint64_t, uint64_t> i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) 
<< 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<uint64_t, uint64_t> j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2); TensorUInt128<uint64_t, uint64_t> actual = i + j; - __uint128_t expected = a + b; + uint128_t expected = a + b; VERIFY_EQUAL(actual, expected); } } @@ -51,13 +61,13 @@ void test_sub() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<uint64_t, uint64_t> i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<uint64_t, uint64_t> j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2); TensorUInt128<uint64_t, uint64_t> actual = i - j; - __uint128_t expected = a - b; + uint128_t expected = a - b; VERIFY_EQUAL(actual, expected); } } @@ -70,13 +80,13 @@ void test_mul() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<uint64_t, uint64_t> i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<uint64_t, uint64_t> j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast<uint128_t>(j1) << 64) + 
static_cast<uint128_t>(j2); TensorUInt128<uint64_t, uint64_t> actual = i * j; - __uint128_t expected = a * b; + uint128_t expected = a * b; VERIFY_EQUAL(actual, expected); } } @@ -89,13 +99,13 @@ void test_div() { for (uint64_t i1 = 0; i1 < 100; ++i1) { for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<uint64_t, uint64_t> i(i1, i2); - __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2); + uint128_t a = (static_cast<uint128_t>(i1) << 64) + static_cast<uint128_t>(i2); for (uint64_t j1 = 0; j1 < 100; ++j1) { for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<uint64_t, uint64_t> j(j1, j2); - __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2); + uint128_t b = (static_cast<uint128_t>(j1) << 64) + static_cast<uint128_t>(j2); TensorUInt128<uint64_t, uint64_t> actual = i / j; - __uint128_t expected = a / b; + uint128_t expected = a / b; VERIFY_EQUAL(actual, expected); } } @@ -107,10 +117,10 @@ void test_misc1() { uint64_t incr = internal::random<uint64_t>(1, 9999999999); for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) { TensorUInt128<static_val<0>, uint64_t> i(0, i2); - __uint128_t a = static_cast<__uint128_t>(i2); + uint128_t a = static_cast<uint128_t>(i2); for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) { TensorUInt128<static_val<0>, uint64_t> j(0, j2); - __uint128_t b = static_cast<__uint128_t>(j2); + uint128_t b = static_cast<uint128_t>(j2); uint64_t actual = (i * j).upper(); uint64_t expected = (a * b) >> 64; VERIFY_IS_EQUAL(actual, expected); @@ -122,23 +132,29 @@ void test_misc2() { int64_t incr = internal::random<int64_t>(1, 100); for (int64_t log_div = 0; log_div < 63; ++log_div) { for (int64_t divider = 1; divider <= 1000000 * incr; divider += incr) { - uint64_t expected = (static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1; + uint64_t expected = (static_cast<uint128_t>(1) << (64+log_div)) / 
static_cast<uint128_t>(divider) - (static_cast<uint128_t>(1) << 64) + 1; uint64_t shift = 1ULL << log_div; TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1)); uint64_t actual = static_cast<uint64_t>(result); - VERIFY_EQUAL(actual, expected); + VERIFY_IS_EQUAL(actual, expected); } } } +#endif void test_cxx11_tensor_uint128() { +#ifdef EIGEN_NO_INT128 + // Skip the test on compilers that don't support 128bit integers natively + return; +#else CALL_SUBTEST_1(test_add()); CALL_SUBTEST_2(test_sub()); CALL_SUBTEST_3(test_mul()); CALL_SUBTEST_4(test_div()); CALL_SUBTEST_5(test_misc1()); CALL_SUBTEST_6(test_misc2()); +#endif } diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp index a2bdb99e4..6dc17bd17 100644 --- a/unsupported/test/levenberg_marquardt.cpp +++ b/unsupported/test/levenberg_marquardt.cpp @@ -23,6 +23,9 @@ using std::sqrt; +// tolerance for chekcing number of iterations +#define LM_EVAL_COUNT_TOL 4/3 + struct lmder_functor : DenseFunctor<double> { lmder_functor(void): DenseFunctor<double>(3,15) {} @@ -631,7 +634,7 @@ void testNistLanczos1(void) VERIFY_IS_EQUAL(lm.nfev(), 79); VERIFY_IS_EQUAL(lm.njev(), 72); // check norm^2 -// VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.430899764097e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats + VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); VERIFY_IS_APPROX(x[1], 1.0000000001E+00); @@ -652,7 +655,7 @@ void testNistLanczos1(void) VERIFY_IS_EQUAL(lm.nfev(), 9); VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 -// VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.428595533845e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats + VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // 
check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); VERIFY_IS_APPROX(x[1], 1.0000000001E+00); @@ -789,7 +792,8 @@ void testNistMGH10(void) MGH10_functor functor; LevenbergMarquardt<MGH10_functor> lm(functor); info = lm.minimize(x); - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); + VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // was: VERIFY_IS_EQUAL(info, 1); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01); @@ -799,9 +803,13 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - //VERIFY_IS_EQUAL(info, 1); + + ++g_test_level; VERIFY_IS_EQUAL(lm.nfev(), 284 ); VERIFY_IS_EQUAL(lm.njev(), 249 ); + --g_test_level; + VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL); /* * Second try @@ -809,7 +817,10 @@ void testNistMGH10(void) x<< 0.02, 4000., 250.; // do the computation info = lm.minimize(x); + ++g_test_level; VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // was: VERIFY_IS_EQUAL(info, 1); + --g_test_level; // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01); @@ -819,9 +830,12 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - //VERIFY_IS_EQUAL(info, 1); + ++g_test_level; VERIFY_IS_EQUAL(lm.nfev(), 126); VERIFY_IS_EQUAL(lm.njev(), 116); + --g_test_level; + VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL); } @@ -896,8 +910,12 @@ void testNistBoxBOD(void) // check return value VERIFY_IS_EQUAL(info, 1); + ++g_test_level; VERIFY_IS_EQUAL(lm.nfev(), 16 ); VERIFY_IS_EQUAL(lm.njev(), 15 ); + --g_test_level; + VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL); + VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03); // check x diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 
487d5a9b8..9a995f941 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A) MatrixType scaledA; RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff(); - if (maxImagPartOfSpectrum >= 0.9 * M_PI) - scaledA = A * 0.9 * M_PI / maxImagPartOfSpectrum; + if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI) + scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum; else scaledA = A; diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index baf183d12..8e104ed1e 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -24,7 +24,7 @@ void test2dRotation(double tol) s = std::sin(angle); B << c, s, -s, c; - C = Apow(std::ldexp(angle,1) / M_PI); + C = Apow(std::ldexp(angle,1) / EIGEN_PI); std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n'; VERIFY(C.isApprox(B, tol)); } diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h index c4f6cf0cb..9b0cf7268 100644 --- a/unsupported/test/mpreal/mpreal.h +++ b/unsupported/test/mpreal/mpreal.h @@ -72,14 +72,12 @@ #define MPREAL_VERSION_STRING "3.6.2"
// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__) && defined(__INTEL_COMPILER)
- #define IsInf(x) isinf(x) // Intel ICC compiler on Linux
-
+#if defined(__GNUC__)
+ #define IsInf(x) (isinf)(x) // GNU C++/Intel ICC compiler on Linux
#elif defined(_MSC_VER) // Microsoft Visual C++
#define IsInf(x) (!_finite(x))
-
#else
- #define IsInf(x) std::isinf EIGEN_NOT_A_MACRO (x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance
+ #define IsInf(x) (std::isinf)(x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance
#endif
// A Clang feature extension to determine compiler features.
@@ -3103,4 +3101,4 @@ namespace std }
-#endif /* __MPREAL_H__ */
\ No newline at end of file +#endif /* __MPREAL_H__ */
diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp index 97665af96..3be020434 100644 --- a/unsupported/test/splines.cpp +++ b/unsupported/test/splines.cpp @@ -239,7 +239,7 @@ void check_global_interpolation_with_derivatives2d() typedef Spline2d::PointType PointType; typedef Spline2d::KnotVectorType KnotVectorType; - const unsigned int numPoints = 100; + const Eigen::DenseIndex numPoints = 100; const unsigned int dimension = 2; const unsigned int degree = 3; |