Diffstat (limited to 'unsupported/test')
 unsupported/test/CMakeLists.txt                    |  15
 unsupported/test/EulerAngles.cpp                   | 208
 unsupported/test/cxx11_eventcount.cpp              |   6
 unsupported/test/cxx11_tensor_argmax_cuda.cu       |   3
 unsupported/test/cxx11_tensor_cast_float16_cuda.cu |   4
 unsupported/test/cxx11_tensor_complex_cuda.cu      |  78
 unsupported/test/cxx11_tensor_contract_cuda.cu     |   4
 unsupported/test/cxx11_tensor_contraction.cpp      |  23
 unsupported/test/cxx11_tensor_cuda.cu              | 105
 unsupported/test/cxx11_tensor_device.cu            |   4
 unsupported/test/cxx11_tensor_of_float16_cuda.cu   |  56
 unsupported/test/cxx11_tensor_random_cuda.cu       |   3
 unsupported/test/cxx11_tensor_reduction_cuda.cu    | 122
 unsupported/test/cxx11_tensor_scan_cuda.cu         |   4
 unsupported/test/kronecker_product.cpp             |  22
 15 files changed, 589 insertions(+), 68 deletions(-)
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index de9b5243a..0d7ed1db2 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -59,6 +59,8 @@ ei_add_test(alignedvector3)
ei_add_test(FFT)
+ei_add_test(EulerAngles)
+
find_package(MPFR 2.3.0)
find_package(GMP)
if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11)
@@ -230,20 +232,25 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
- ei_add_test(cxx11_tensor_device)
- ei_add_test(cxx11_tensor_cuda)
- ei_add_test(cxx11_tensor_contract_cuda)
+ ei_add_test(cxx11_tensor_complex_cuda)
ei_add_test(cxx11_tensor_reduction_cuda)
ei_add_test(cxx11_tensor_argmax_cuda)
ei_add_test(cxx11_tensor_cast_float16_cuda)
ei_add_test(cxx11_tensor_scan_cuda)
+  # Contractions require arch 3.0 or higher.
+ if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+ ei_add_test(cxx11_tensor_device)
+ ei_add_test(cxx11_tensor_cuda)
+ ei_add_test(cxx11_tensor_contract_cuda)
+ ei_add_test(cxx11_tensor_of_float16_cuda)
+ endif()
+
# The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
ei_add_test(cxx11_tensor_random_cuda)
endif()
- ei_add_test(cxx11_tensor_of_float16_cuda)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
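
Note: the CMake variable EIGEN_CUDA_COMPUTE_ARCH gates these tests at configure time. Inside device code, the corresponding compile-time guard would look like the sketch below (illustrative, not part of this commit); __CUDA_ARCH__ evaluates to 300 when compiling for compute capability 3.0:

    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
    // Device-side paths that need compute capability 3.0 or higher,
    // e.g. the warp-shuffle intrinsics used by the contraction kernels.
    #endif
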
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
new file mode 100644
index 000000000..a8cb52864
--- /dev/null
+++ b/unsupported/test/EulerAngles.cpp
@@ -0,0 +1,208 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/EulerAngles>
+
+using namespace Eigen;
+
+template<typename EulerSystem, typename Scalar>
+void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
+ bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+{
+ typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
+ typedef Matrix<Scalar,3,3> Matrix3;
+ typedef Matrix<Scalar,3,1> Vector3;
+ typedef Quaternion<Scalar> QuaternionType;
+ typedef AngleAxis<Scalar> AngleAxisType;
+ using std::abs;
+
+ Scalar alphaRangeStart, alphaRangeEnd;
+ Scalar betaRangeStart, betaRangeEnd;
+ Scalar gammaRangeStart, gammaRangeEnd;
+
+ if (positiveRangeAlpha)
+ {
+ alphaRangeStart = Scalar(0);
+ alphaRangeEnd = Scalar(2 * EIGEN_PI);
+ }
+ else
+ {
+ alphaRangeStart = -Scalar(EIGEN_PI);
+ alphaRangeEnd = Scalar(EIGEN_PI);
+ }
+
+ if (positiveRangeBeta)
+ {
+ betaRangeStart = Scalar(0);
+ betaRangeEnd = Scalar(2 * EIGEN_PI);
+ }
+ else
+ {
+ betaRangeStart = -Scalar(EIGEN_PI);
+ betaRangeEnd = Scalar(EIGEN_PI);
+ }
+
+ if (positiveRangeGamma)
+ {
+ gammaRangeStart = Scalar(0);
+ gammaRangeEnd = Scalar(2 * EIGEN_PI);
+ }
+ else
+ {
+ gammaRangeStart = -Scalar(EIGEN_PI);
+ gammaRangeEnd = Scalar(EIGEN_PI);
+ }
+
+ const int i = EulerSystem::AlphaAxisAbs - 1;
+ const int j = EulerSystem::BetaAxisAbs - 1;
+ const int k = EulerSystem::GammaAxisAbs - 1;
+
+ const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
+ const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
+ const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
+
+ const Vector3 I = EulerAnglesType::AlphaAxisVector();
+ const Vector3 J = EulerAnglesType::BetaAxisVector();
+ const Vector3 K = EulerAnglesType::GammaAxisVector();
+
+ EulerAnglesType e(ea[0], ea[1], ea[2]);
+
+ Matrix3 m(e);
+ Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+
+  // Check that eabis is in range
+ VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
+ VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
+ VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+
+ Vector3 eabis2 = m.eulerAngles(i, j, k);
+
+ // Invert the relevant axes
+ eabis2[0] *= iFactor;
+ eabis2[1] *= jFactor;
+ eabis2[2] *= kFactor;
+
+ // Saturate the angles to the correct range
+ if (positiveRangeAlpha && (eabis2[0] < 0))
+ eabis2[0] += Scalar(2 * EIGEN_PI);
+ if (positiveRangeBeta && (eabis2[1] < 0))
+ eabis2[1] += Scalar(2 * EIGEN_PI);
+ if (positiveRangeGamma && (eabis2[2] < 0))
+ eabis2[2] += Scalar(2 * EIGEN_PI);
+
+  VERIFY_IS_APPROX(eabis, eabis2); // Verify that our estimate matches m.eulerAngles()
+
+ Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
+ VERIFY_IS_APPROX(m, mbis);
+
+  // Tests that are only relevant when no positive range is requested
+ if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+ {
+    /* If I==K and ea[1]==0, then there is no unique solution. */
+    /* The same remark applies when I!=K and |ea[1]| is close to pi/2. */
+ if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
+ VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
+
+ // approx_or_less_than does not work for 0
+ VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+ }
+
+ // Quaternions
+ QuaternionType q(e);
+ eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  VERIFY_IS_APPROX(eabis, eabis2); // Verify that the Euler angles are still the same
+}
+
+template<typename EulerSystem, typename Scalar>
+void verify_euler(const Matrix<Scalar,3,1>& ea)
+{
+ verify_euler_ranged<EulerSystem>(ea, false, false, false);
+ verify_euler_ranged<EulerSystem>(ea, false, false, true);
+ verify_euler_ranged<EulerSystem>(ea, false, true, false);
+ verify_euler_ranged<EulerSystem>(ea, false, true, true);
+ verify_euler_ranged<EulerSystem>(ea, true, false, false);
+ verify_euler_ranged<EulerSystem>(ea, true, false, true);
+ verify_euler_ranged<EulerSystem>(ea, true, true, false);
+ verify_euler_ranged<EulerSystem>(ea, true, true, true);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+ verify_euler<EulerSystemXYZ>(ea);
+ verify_euler<EulerSystemXYX>(ea);
+ verify_euler<EulerSystemXZY>(ea);
+ verify_euler<EulerSystemXZX>(ea);
+
+ verify_euler<EulerSystemYZX>(ea);
+ verify_euler<EulerSystemYZY>(ea);
+ verify_euler<EulerSystemYXZ>(ea);
+ verify_euler<EulerSystemYXY>(ea);
+
+ verify_euler<EulerSystemZXY>(ea);
+ verify_euler<EulerSystemZXZ>(ea);
+ verify_euler<EulerSystemZYX>(ea);
+ verify_euler<EulerSystemZYZ>(ea);
+}
+
+template<typename Scalar> void eulerangles()
+{
+ typedef Matrix<Scalar,3,3> Matrix3;
+ typedef Matrix<Scalar,3,1> Vector3;
+ typedef Array<Scalar,3,1> Array3;
+ typedef Quaternion<Scalar> Quaternionx;
+ typedef AngleAxis<Scalar> AngleAxisType;
+
+ Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+ Quaternionx q1;
+ q1 = AngleAxisType(a, Vector3::Random().normalized());
+ Matrix3 m;
+ m = q1;
+
+ Vector3 ea = m.eulerAngles(0,1,2);
+ check_all_var(ea);
+ ea = m.eulerAngles(0,1,0);
+ check_all_var(ea);
+
+ // Check with purely random Quaternion:
+ q1.coeffs() = Quaternionx::Coefficients::Random().normalized();
+ m = q1;
+ ea = m.eulerAngles(0,1,2);
+ check_all_var(ea);
+ ea = m.eulerAngles(0,1,0);
+ check_all_var(ea);
+
+ // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi].
+ ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1);
+ check_all_var(ea);
+
+ ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+ check_all_var(ea);
+
+ ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+ check_all_var(ea);
+
+ ea[1] = 0;
+ check_all_var(ea);
+
+ ea.head(2).setZero();
+ check_all_var(ea);
+
+ ea.setZero();
+ check_all_var(ea);
+}
+
+void test_EulerAngles()
+{
+ for(int i = 0; i < g_repeat; i++) {
+ CALL_SUBTEST_1( eulerangles<float>() );
+ CALL_SUBTEST_2( eulerangles<double>() );
+ }
+}
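
Note: the "saturate" step in verify_euler_ranged above maps angles returned in [-pi, pi] into [0, 2*pi) when a positive range is requested, by adding one full turn to negative values. A scalar sketch of that normalization (the helper name is illustrative, not Eigen API):

    #include <cmath>

    // Map an angle from [-pi, pi] to [0, 2*pi) by adding a full turn
    // to negative values.
    double to_positive_range(double angle) {
      return (angle < 0.0) ? angle + 2.0 * M_PI : angle;
    }
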
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index f16cc6f07..3b598bf42 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -25,7 +25,8 @@ int rand_reentrant(unsigned int* s) {
static void test_basic_eventcount()
{
- std::vector<EventCount::Waiter> waiters(1);
+ MaxSizeVector<EventCount::Waiter> waiters(1);
+ waiters.resize(1);
EventCount ec(waiters);
EventCount::Waiter& w = waiters[0];
ec.Notify(false);
@@ -81,7 +82,8 @@ static void test_stress_eventcount()
static const int kEvents = 1 << 16;
static const int kQueues = 10;
- std::vector<EventCount::Waiter> waiters(kThreads);
+ MaxSizeVector<EventCount::Waiter> waiters(kThreads);
+ waiters.resize(kThreads);
EventCount ec(waiters);
TestQueue queues[kQueues];
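
Note: EventCount now takes an Eigen::MaxSizeVector instead of a std::vector. Based on the usage in this diff, the constructor argument is a capacity, so the elements must be created with an explicit resize() before they are used:

    MaxSizeVector<EventCount::Waiter> waiters(kThreads); // reserves capacity only
    waiters.resize(kThreads);                            // actually creates the waiters
    EventCount ec(waiters);
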
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 41ccbe974..6fe8982f2 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -12,6 +12,9 @@
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
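
Note: this guard, repeated in each CUDA test below, exists because cuda_fp16.h first shipped with CUDA 7.5, and older nvcc versions do not define __CUDACC_VER__ at all; 70500 encodes version 7.5 (major*10000 + minor*100):

    #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
    #include <cuda_fp16.h>   // half-precision types; absent before CUDA 7.5
    #endif
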
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
index f22b99de8..88c233994 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -13,7 +13,9 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
new file mode 100644
index 000000000..74befe670
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -0,0 +1,78 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_nullary() {
+ Tensor<std::complex<float>, 1, 0, int> in1(2);
+ Tensor<std::complex<float>, 1, 0, int> in2(2);
+ in1.setRandom();
+ in2.setRandom();
+
+ std::size_t float_bytes = in1.size() * sizeof(float);
+ std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);
+
+ std::complex<float>* d_in1;
+ std::complex<float>* d_in2;
+ float* d_out2;
+ cudaMalloc((void**)(&d_in1), complex_bytes);
+ cudaMalloc((void**)(&d_in2), complex_bytes);
+ cudaMalloc((void**)(&d_out2), float_bytes);
+ cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
+ cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
+
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+ d_in1, 2);
+ Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+ d_in2, 2);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
+ d_out2, 2);
+
+ gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
+ gpu_out2.device(gpu_device) = gpu_in2.abs();
+
+ Tensor<std::complex<float>, 1, 0, int> new1(2);
+ Tensor<float, 1, 0, int> new2(2);
+
+ assert(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
+ gpu_device.stream()) == cudaSuccess);
+ assert(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
+ gpu_device.stream()) == cudaSuccess);
+
+ assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+ for (int i = 0; i < 2; ++i) {
+ VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
+ VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
+ }
+
+ cudaFree(d_in1);
+ cudaFree(d_in2);
+ cudaFree(d_out2);
+}
+
+
+
+void test_cxx11_tensor_complex()
+{
+ CALL_SUBTEST(test_cuda_nullary());
+}
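
Note: abs() of a complex-valued tensor yields a real-valued tensor of the same shape, which is why d_out2 above is allocated with float_bytes rather than complex_bytes. A host-side sketch of that element-type change:

    #include <unsupported/Eigen/CXX11/Tensor>

    void complex_abs_sketch() {
      Eigen::Tensor<std::complex<float>, 1> c(2);
      c.setRandom();
      Eigen::Tensor<float, 1> magnitudes = c.abs();  // element type becomes float
    }
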
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 98ac180ef..767e9c678 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -14,7 +14,9 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 73623b2ed..ace97057f 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -489,6 +489,27 @@ static void test_tensor_product()
}
+template<int DataLayout>
+static void test_const_inputs()
+{
+ Tensor<float, 2, DataLayout> in1(2, 3);
+ Tensor<float, 2, DataLayout> in2(3, 2);
+ in1.setRandom();
+ in2.setRandom();
+
+ TensorMap<Tensor<const float, 2, DataLayout> > mat1(in1.data(), 2, 3);
+ TensorMap<Tensor<const float, 2, DataLayout> > mat2(in2.data(), 3, 2);
+ Tensor<float, 2, DataLayout> mat3(2,2);
+
+ Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+ mat3 = mat1.contract(mat2, dims);
+
+ VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
+ VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+ VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+ VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
void test_cxx11_tensor_contraction()
{
CALL_SUBTEST(test_evals<ColMajor>());
@@ -519,4 +540,6 @@ void test_cxx11_tensor_contraction()
CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
CALL_SUBTEST(test_tensor_product<ColMajor>());
CALL_SUBTEST(test_tensor_product<RowMajor>());
+ CALL_SUBTEST(test_const_inputs<ColMajor>());
+ CALL_SUBTEST(test_const_inputs<RowMajor>());
}
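
Note: the new test_const_inputs relies on TensorMap accepting a const element type, which turns the map into a read-only view that can still participate in expressions such as contract(). A minimal standalone sketch of that pattern:

    #include <unsupported/Eigen/CXX11/Tensor>

    const float data[6] = {1, 2, 3, 4, 5, 6};
    // A read-only 2x3 view over existing memory; no copy is made.
    Eigen::TensorMap<Eigen::Tensor<const float, 2> > view(data, 2, 3);
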
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 284b46803..bf216587a 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -10,19 +10,65 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
+void test_cuda_nullary() {
+ Tensor<float, 1, 0, int> in1(2);
+ Tensor<float, 1, 0, int> in2(2);
+ in1.setRandom();
+ in2.setRandom();
+
+ std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ cudaMalloc((void**)(&d_in1), tensor_bytes);
+ cudaMalloc((void**)(&d_in2), tensor_bytes);
+ cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
+ cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
+ d_in1, 2);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
+ d_in2, 2);
+
+ gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+ gpu_in2.device(gpu_device) = gpu_in2.random();
+
+ Tensor<float, 1, 0, int> new1(2);
+ Tensor<float, 1, 0, int> new2(2);
+
+ assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
+ gpu_device.stream()) == cudaSuccess);
+ assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
+ gpu_device.stream()) == cudaSuccess);
+
+ assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+ for (int i = 0; i < 2; ++i) {
+ VERIFY_IS_APPROX(new1(i), 3.14f);
+ VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+ }
+
+ cudaFree(d_in1);
+ cudaFree(d_in2);
+}
+
void test_cuda_elementwise_small() {
- Tensor<float, 1> in1(Eigen::array<int, 1>(2));
- Tensor<float, 1> in2(Eigen::array<int, 1>(2));
- Tensor<float, 1> out(Eigen::array<int, 1>(2));
+ Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
+ Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
+ Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
in1.setRandom();
in2.setRandom();
@@ -44,11 +90,11 @@ void test_cuda_elementwise_small() {
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
- d_in1, Eigen::array<int, 1>(2));
+ d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
- d_in2, Eigen::array<int, 1>(2));
+ d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
- d_out, Eigen::array<int, 1>(2));
+ d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
@@ -58,8 +104,8 @@ void test_cuda_elementwise_small() {
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(
- out(Eigen::array<int, 1>(i)),
- in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
+ out(Eigen::array<Eigen::DenseIndex, 1>(i)),
+ in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
}
cudaFree(d_in1);
@@ -69,10 +115,10 @@ void test_cuda_elementwise_small() {
void test_cuda_elementwise()
{
- Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97));
- Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97));
- Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97));
- Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
+ Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+ Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+ Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+ Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
in1.setRandom();
in2.setRandom();
in3.setRandom();
@@ -98,10 +144,10 @@ void test_cuda_elementwise()
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97));
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97));
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97));
- Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
@@ -111,7 +157,7 @@ void test_cuda_elementwise()
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) {
for (int k = 0; k < 97; ++k) {
- VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k)));
+ VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
}
}
}
@@ -181,7 +227,7 @@ void test_cuda_reduction()
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
- array<int, 2> reduction_axis;
+ array<Eigen::DenseIndex, 2> reduction_axis;
reduction_axis[0] = 1;
reduction_axis[1] = 3;
@@ -214,8 +260,8 @@ void test_cuda_contraction()
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
- Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
- Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
+ Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
+ Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
t_left.setRandom();
t_right.setRandom();
@@ -299,7 +345,7 @@ void test_cuda_convolution_1d()
Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
- Eigen::array<int, 1> dims(1);
+ Eigen::array<Eigen::DenseIndex, 1> dims(1);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -352,7 +398,7 @@ void test_cuda_convolution_inner_dim_col_major_1d()
Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
- Eigen::array<int, 1> dims(0);
+ Eigen::array<Eigen::DenseIndex, 1> dims(0);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -405,7 +451,7 @@ void test_cuda_convolution_inner_dim_row_major_1d()
Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
- Eigen::array<int, 1> dims(3);
+ Eigen::array<Eigen::DenseIndex, 1> dims(3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -459,7 +505,7 @@ void test_cuda_convolution_2d()
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
- Eigen::array<int, 2> dims(1,2);
+ Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -496,9 +542,9 @@ void test_cuda_convolution_2d()
template<int DataLayout>
void test_cuda_convolution_3d()
{
- Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
+ Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
Tensor<float, 3, DataLayout> kernel(3,4,2);
- Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17));
+ Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
@@ -523,7 +569,7 @@ void test_cuda_convolution_3d()
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
- Eigen::array<int, 3> dims(1,2,3);
+ Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -1168,6 +1214,7 @@ void test_cuda_betainc()
void test_cxx11_tensor_cuda()
{
+ CALL_SUBTEST_1(test_cuda_nullary());
CALL_SUBTEST_1(test_cuda_elementwise_small());
CALL_SUBTEST_1(test_cuda_elementwise());
CALL_SUBTEST_1(test_cuda_props());
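
Note: this file also drops its "#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int", so Eigen::DenseIndex reverts to its default (std::ptrdiff_t) and every dimension array must name the index type explicitly:

    // Before: Eigen::array<int, 3>(72, 53, 97)
    Eigen::array<Eigen::DenseIndex, 3> dims(72, 53, 97);
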
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index b6ca54d93..fde20ddf2 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -13,7 +13,9 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 2f55f9361..cbf401c86 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -13,7 +13,9 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
@@ -181,30 +183,39 @@ void test_cuda_trancendental() {
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
- d_float1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
- d_float2, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(
- d_res1_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(
- d_res1_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(
- d_res2_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(
- d_res2_float, num_elem);
+ Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+ gpu_float3.device(gpu_device) = gpu_float3.random();
gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
- gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().exp();
- gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>().log();
+ gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+
+ gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+ gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+ gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+ gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+ gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
Tensor<float, 1> input1(num_elem);
Tensor<Eigen::half, 1> half_prec1(num_elem);
@@ -212,12 +223,18 @@ void test_cuda_trancendental() {
Tensor<float, 1> input2(num_elem);
Tensor<Eigen::half, 1> half_prec2(num_elem);
Tensor<Eigen::half, 1> full_prec2(num_elem);
+ Tensor<float, 1> input3(num_elem);
+ Tensor<Eigen::half, 1> half_prec3(num_elem);
+ Tensor<Eigen::half, 1> full_prec3(num_elem);
gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
gpu_device.synchronize();
for (int i = 0; i < num_elem; ++i) {
@@ -231,12 +248,19 @@ void test_cuda_trancendental() {
else
VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
}
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+ }
gpu_device.deallocate(d_float1);
gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_float3);
gpu_device.deallocate(d_res1_half);
gpu_device.deallocate(d_res1_float);
gpu_device.deallocate(d_res2_half);
gpu_device.deallocate(d_res2_float);
+ gpu_device.deallocate(d_res3_float);
+ gpu_device.deallocate(d_res3_half);
}
template<typename>
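
Note: the reworked half-precision path now runs in two device steps (materialize the float-to-half cast, then apply the elementwise op to the stored halves), and the test gains log1p coverage. A scalar sketch of what the new check compares, assuming only that rounding through Eigen::half mirrors the device cast:

    #include <cmath>
    #include <unsupported/Eigen/CXX11/Tensor>  // provides Eigen::half here

    void log1p_half_sketch() {
      float x = 0.37f;
      Eigen::half full_prec(std::log1p(x));   // compute in float, cast after
      Eigen::half rounded(x);                 // cast the input to half first
      Eigen::half half_prec(std::log1p(static_cast<float>(rounded)));
      // The test expects full_prec and half_prec to agree to half precision.
    }
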
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index fa1a46732..b3be199e1 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -13,6 +13,9 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <Eigen/CXX11/Tensor>
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index cad0c08e0..6858b43a7 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -12,11 +12,14 @@
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
#define EIGEN_USE_GPU
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
-template<int DataLayout>
+template<typename Type, int DataLayout>
static void test_full_reductions() {
Eigen::CudaStreamDevice stream;
@@ -25,24 +28,24 @@ static void test_full_reductions() {
const int num_rows = internal::random<int>(1024, 5*1024);
const int num_cols = internal::random<int>(1024, 5*1024);
- Tensor<float, 2, DataLayout> in(num_rows, num_cols);
+ Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
in.setRandom();
- Tensor<float, 0, DataLayout> full_redux;
+ Tensor<Type, 0, DataLayout> full_redux;
full_redux = in.sum();
- std::size_t in_bytes = in.size() * sizeof(float);
- std::size_t out_bytes = full_redux.size() * sizeof(float);
- float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
- float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
+ std::size_t in_bytes = in.size() * sizeof(Type);
+ std::size_t out_bytes = full_redux.size() * sizeof(Type);
+ Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+ Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
- TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
- TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr);
+ TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+ TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
out_gpu.device(gpu_device) = in_gpu.sum();
- Tensor<float, 0, DataLayout> full_redux_gpu;
+ Tensor<Type, 0, DataLayout> full_redux_gpu;
gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
gpu_device.synchronize();
@@ -53,7 +56,102 @@ static void test_full_reductions() {
gpu_device.deallocate(gpu_out_ptr);
}
+template<typename Type, int DataLayout>
+static void test_first_dim_reductions() {
+ int dim_x = 33;
+ int dim_y = 1;
+ int dim_z = 128;
+
+ Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+ in.setRandom();
+
+ Eigen::array<int, 1> red_axis;
+ red_axis[0] = 0;
+ Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+ // Create device
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice dev(&stream);
+
+  // Create data
+ Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+ Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
+ Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+ Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
+
+ // Perform operation
+ dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+ gpu_out.device(dev) = gpu_in.sum(red_axis);
+ gpu_out.device(dev) += gpu_in.sum(red_axis);
+ Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
+ dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+ dev.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (int i = 0; i < gpu_out.size(); ++i) {
+ VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+ }
+
+ dev.deallocate(in_data);
+ dev.deallocate(out_data);
+}
+
+template<typename Type, int DataLayout>
+static void test_last_dim_reductions() {
+ int dim_x = 128;
+ int dim_y = 1;
+ int dim_z = 33;
+
+ Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+ in.setRandom();
+
+ Eigen::array<int, 1> red_axis;
+ red_axis[0] = 2;
+ Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+ // Create device
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice dev(&stream);
+
+ // Create data
+ Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+ Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
+ Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+ Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
+
+ // Perform operation
+ dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+ gpu_out.device(dev) = gpu_in.sum(red_axis);
+ gpu_out.device(dev) += gpu_in.sum(red_axis);
+ Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
+ dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+ dev.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (int i = 0; i < gpu_out.size(); ++i) {
+ VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+ }
+
+ dev.deallocate(in_data);
+ dev.deallocate(out_data);
+}
+
+
void test_cxx11_tensor_reduction_cuda() {
- CALL_SUBTEST_1(test_full_reductions<ColMajor>());
- CALL_SUBTEST_2(test_full_reductions<RowMajor>());
+ CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
+ CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+ CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+ CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
+
+ CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
+ CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
+ CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+//  CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()));
+
+ CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+// CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
+ CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
+ CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
}
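
Note: in both new tests the GPU reduces twice, once assigned and once accumulated (= then +=), so the verification compares against 2*redux(i). A CPU-only analogue of that check (illustrative, no GPU required):

    #include <unsupported/Eigen/CXX11/Tensor>

    void double_reduction_sketch() {
      Eigen::Tensor<float, 2> in(4, 5);
      in.setRandom();
      Eigen::array<int, 1> axis;
      axis[0] = 0;                              // reduce along the first dimension
      Eigen::Tensor<float, 1> once = in.sum(axis);
      Eigen::Tensor<float, 1> twice = in.sum(axis);
      twice += in.sum(axis);                    // accumulate a second reduction
      // Expect twice(i) == 2 * once(i) for every i, as the GPU test verifies.
    }
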
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
index 35e19e51c..761d11fd1 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -13,7 +13,9 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index 02411a262..e770049e5 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,12 +9,12 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifdef EIGEN_TEST_PART_1
#include "sparse.h"
#include <Eigen/SparseExtra>
#include <Eigen/KroneckerProduct>
-
template<typename MatrixType>
void check_dimension(const MatrixType& ab, const int rows, const int cols)
{
@@ -230,3 +230,23 @@ void test_kronecker_product()
VERIFY_IS_APPROX(MatrixXf(sC2),dC);
}
}
+
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+
+// Simply check that for a dense Kronecker product the sparse module is not needed.
+
+#include "main.h"
+#include <Eigen/KroneckerProduct>
+
+void test_kronecker_product()
+{
+ MatrixXd a(2,2), b(3,3), c;
+ a.setRandom();
+ b.setRandom();
+ c = kroneckerProduct(a,b);
+ VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b);
+}
+
+#endif
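
Note: the part-2 check follows directly from the block structure of the Kronecker product. For a 2x2 matrix A and 3x3 matrix B,

    A (x) B = [ a(0,0)*B  a(0,1)*B ]
              [ a(1,0)*B  a(1,1)*B ]

so the 3x3 block starting at (3,3) of the 6x6 result is exactly a(1,1)*B, which is what VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b) asserts.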