author    Deven Desai <deven.desai.amd@gmail.com>  2018-07-17 14:16:48 -0400
committer Deven Desai <deven.desai.amd@gmail.com>  2018-07-17 14:16:48 -0400
commit    f124f0796533081cb7b061c4aa8667df6ed58863 (patch)
tree      20efd29fdfaf3da79cad39f468d330927b62ebc2
parent    82f0ce27261df3b21037d93d4595655b3df754a6 (diff)
applying EIGEN_DECLARE_TEST to *gpu* tests
Also, a few minor fixes for GPU tests running in HIP mode:

1. Adding an include for hip/hip_runtime.h in the Macros.h file. For HIP, __host__ and __device__ are macros which are defined in the hip headers; their definitions need to be included before their use in the file.
2. Fixing the compile failure in TensorContractionGpu introduced by the commit "Fuse computations into the Tensor contractions using output kernel".
3. Fixing a HIP/clang specific compile error by making the struct-member assignment explicit.
-rw-r--r--  Eigen/src/Core/util/Macros.h                                |  6
-rw-r--r--  Eigen/src/Core/util/Memory.h                                |  2
-rw-r--r--  Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h              |  7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h     |  3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h  | 10
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_gpu.cu                 |  2
-rw-r--r--  unsupported/test/cxx11_tensor_cast_float16_gpu.cu           |  2
-rw-r--r--  unsupported/test/cxx11_tensor_contract_gpu.cu               |  2
-rw-r--r--  unsupported/test/cxx11_tensor_device.cu                     |  2
-rw-r--r--  unsupported/test/cxx11_tensor_gpu.cu                        |  2
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_gpu.cu             |  2
-rw-r--r--  unsupported/test/cxx11_tensor_random_gpu.cu                 |  2
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_gpu.cu              |  2
-rw-r--r--  unsupported/test/cxx11_tensor_scan_gpu.cu                   |  2
14 files changed, 24 insertions(+), 22 deletions(-)
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index fc4c0815c..adf25ee9b 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -395,8 +395,10 @@
// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
#define EIGEN_HIPCC __HIPCC__
- // We need hip_common.h here because __HIP_DEVICE_COMPILE__ is defined in this header.
- #include <hip/hip_common.h>
+ // We need to include hip_runtime.h here because it pulls in
+ // ++ hip_common.h which contains the define for __HIP_DEVICE_COMPILE__
+ // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+ #include <hip/hip_runtime.h>
#if defined(__HIP_DEVICE_COMPILE__)
// analogous to EIGEN_CUDA_ARCH, but for HIP
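Hedged aside (not part of the patch): per the commit message, __host__ and __device__ under HIP are ordinary macros supplied by the HIP headers rather than compiler keywords, so any Eigen macro that expands to them must see <hip/hip_runtime.h> first. A minimal, self-contained sketch of that dependency, with MY_DEVICE_FUNC standing in for Eigen's EIGEN_DEVICE_FUNC:

#include <hip/hip_runtime.h>   // pulls in hip_common.h and host_defines.h

// Simplified stand-in for EIGEN_DEVICE_FUNC; it expands to HIP's __host__/__device__
// macros, which only exist after the include above.
#define MY_DEVICE_FUNC __host__ __device__

MY_DEVICE_FUNC inline float twice(float x) { return x + x; }

__global__ void twice_kernel(float* out, const float* in, int n) {
  const int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
  if (i < n) out[i] = twice(in[i]);
}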
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 956f64d1d..8dbd4d93b 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -580,7 +580,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
// you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
// to the appropriate stack allocation function
-#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
#if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
#define EIGEN_ALLOCA alloca
#elif EIGEN_COMP_MSVC
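Hedged aside: the rewritten guard expresses that alloca is a host-only facility, so EIGEN_ALLOCA must stay undefined during either backend's device compilation pass (HIP as well as CUDA). A user-side analogue of the same guard, using the raw compiler macros and a made-up MY_ALLOCA name:

#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
  #if defined(__linux__) || defined(__APPLE__)
    #include <alloca.h>
    #define MY_ALLOCA(size) alloca(size)
  #elif defined(_MSC_VER)
    #include <malloc.h>
    #define MY_ALLOCA(size) _alloca(size)
  #endif
#endif
// If MY_ALLOCA stays undefined (device pass), callers fall back to heap or
// fixed-size buffers, mirroring Eigen's behavior when EIGEN_ALLOCA is undefined.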
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index fbc1ee2f6..2d1a82c57 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -610,6 +610,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
{
EIGEN_USING_STD_MATH(abs);
+ EIGEN_USING_STD_MATH(sqrt);
Index i0;
// Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
mat.diagonal().cwiseAbs().maxCoeff(&i0);
@@ -620,8 +621,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
VectorType c0, c1;
n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
- if(n0>n1) res = c0/std::sqrt(n0);
- else res = c1/std::sqrt(n1);
+ if(n0>n1) res = c0/sqrt(n0);
+ else res = c1/sqrt(n1);
return true;
}
@@ -723,7 +724,7 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
EIGEN_DEVICE_FUNC
static inline void computeRoots(const MatrixType& m, VectorType& roots)
{
- using std::sqrt;
+ EIGEN_USING_STD_MATH(sqrt);
const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
roots(0) = t1 - t0;
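Hedged aside: the point of replacing std::sqrt with EIGEN_USING_STD_MATH(sqrt) plus an unqualified call is overload selection in __host__ __device__ code. A rough illustration of the pattern with a simplified stand-in macro (MY_USING_STD_MATH and unit_scale are illustrative names, not Eigen API; the real macro lives in Macros.h and may differ in detail):

#include <cmath>
#include <hip/hip_runtime.h>   // assumes a hipcc build; use the CUDA headers with nvcc

#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
  #define MY_USING_STD_MATH(FUNC)              // device pass: ::FUNC overloads are already visible
#else
  #define MY_USING_STD_MATH(FUNC) using std::FUNC
#endif

__host__ __device__ inline float unit_scale(float x, float n2) {
  MY_USING_STD_MATH(sqrt);
  return x / sqrt(n2);   // std::sqrt on the host pass, the device sqrt on the device pass
}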
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 0fbffa34c..c7c443a59 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -449,8 +449,7 @@ struct TensorContractionEvaluatorBase
// tensor dimensions (i, j) into the original tensor dimensions.
// TODO(ezhulenev): Add parameters required to infer output tensor index for
// more complex contractions than 2x2 on internal dimension.
- m_tensor_contraction_params = {
- /**swapped_arguments=*/static_cast<int>(Layout) == RowMajor};
+ m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
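Hedged aside: this is the "struct-member assignment made explicit" fix from the commit message. A minimal stand-alone sketch of the change in pattern (ParamsLike is a made-up stand-in for TensorContractionParams; any other members are not shown):

struct ParamsLike { bool swapped_arguments; };

void set_params(ParamsLike& p, bool swapped) {
  // Old form, assigning a braced temporary; reportedly tripped up HIP/clang here:
  //   p = { /*swapped_arguments=*/swapped };
  // New form, explicit member assignment:
  p.swapped_arguments = swapped;
}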
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
index a4f92ee44..b9956cd43 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -1215,16 +1215,16 @@ EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
}
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
typedef GpuDevice Device;
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
typedef TensorContractionEvaluatorBase<Self> Base;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
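Hedged aside: TensorContractionOp gained an OutputKernelType parameter in the output-kernel fusion commit, so this GpuDevice evaluator specialization has to carry it as well, otherwise it no longer matches the primary template. A generic illustration of the rule (Op and Evaluator are toy names, not the Eigen classes):

template <typename A, typename B, typename OutputKernel>
struct Op;                       // the primary template grew an extra parameter

template <typename Expr, typename Device>
struct Evaluator;                // generic evaluator

// Every partial specialization must now spell out the extra parameter too,
// or it silently stops matching expressions built from Op:
template <typename A, typename B, typename OutputKernel, typename Device>
struct Evaluator<const Op<A, B, OutputKernel>, Device> { /* ... */ };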
diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index f6c3a9908..79f4066e9 100644
--- a/unsupported/test/cxx11_tensor_argmax_gpu.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -242,7 +242,7 @@ void test_gpu_argmin_dim()
}
}
-void test_cxx11_tensor_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu)
{
CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
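Hedged aside: before this patch several *gpu* tests hand-wrote entry points whose names did not match the file (this file and cxx11_tensor_contract_gpu.cu both defined test_cxx11_tensor_gpu()). EIGEN_DECLARE_TEST derives the entry-point name from its argument instead; a simplified stand-in for the mechanism (MY_DECLARE_TEST and MY_CAT are illustrative, the real macro lives in test/main.h and may do more):

#define MY_CAT2(a, b) a##b
#define MY_CAT(a, b)  MY_CAT2(a, b)
#define MY_DECLARE_TEST(X) void MY_CAT(test_, X)()

MY_DECLARE_TEST(cxx11_tensor_argmax_gpu)   // expands to: void test_cxx11_tensor_argmax_gpu()
{
  // CALL_SUBTEST_*(...) calls go here, exactly as in the hunk above
}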
diff --git a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
index 0a37a555c..97923d15f 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
@@ -72,7 +72,7 @@ void test_fallback_conversion() {
}
-void test_cxx11_tensor_cast_float16_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu)
{
CALL_SUBTEST(test_gpu_conversion());
CALL_SUBTEST(test_fallback_conversion());
diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu
index cb1416478..575bdc1f9 100644
--- a/unsupported/test/cxx11_tensor_contract_gpu.cu
+++ b/unsupported/test/cxx11_tensor_contract_gpu.cu
@@ -193,7 +193,7 @@ void test_gpu_contraction_sizes() {
}
}
-void test_cxx11_tensor_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)
{
CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index cd9ba3ecd..c9f78d2d3 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -389,7 +389,7 @@ void test_gpu() {
}
-void test_cxx11_tensor_device()
+EIGEN_DECLARE_TEST(cxx11_tensor_device)
{
CALL_SUBTEST_1(test_cpu());
CALL_SUBTEST_2(test_gpu());
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index faaac73cf..14fc0bd04 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -1472,7 +1472,7 @@ void test_gpu_gamma_sample_der_alpha()
gpuFree(d_out);
}
-void test_cxx11_tensor_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
{
CALL_SUBTEST_1(test_gpu_nullary());
CALL_SUBTEST_1(test_gpu_elementwise_small());
diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index 1f1ec26c2..4d74e6138 100644
--- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -479,7 +479,7 @@ void test_gpu_forced_evals() {
#endif
-void test_cxx11_tensor_of_float16_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu)
{
CALL_SUBTEST_1(test_gpu_numext<void>());
diff --git a/unsupported/test/cxx11_tensor_random_gpu.cu b/unsupported/test/cxx11_tensor_random_gpu.cu
index 262182d30..090986ebc 100644
--- a/unsupported/test/cxx11_tensor_random_gpu.cu
+++ b/unsupported/test/cxx11_tensor_random_gpu.cu
@@ -78,7 +78,7 @@ static void test_complex()
}
-void test_cxx11_tensor_random_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
{
CALL_SUBTEST(test_gpu_random_uniform());
CALL_SUBTEST(test_gpu_random_normal());
diff --git a/unsupported/test/cxx11_tensor_reduction_gpu.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu
index 7b8ac2309..122ac946b 100644
--- a/unsupported/test/cxx11_tensor_reduction_gpu.cu
+++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu
@@ -134,7 +134,7 @@ static void test_last_dim_reductions() {
}
-void test_cxx11_tensor_reduction_gpu() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) {
CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
diff --git a/unsupported/test/cxx11_tensor_scan_gpu.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu
index f3e773db5..770a144f1 100644
--- a/unsupported/test/cxx11_tensor_scan_gpu.cu
+++ b/unsupported/test/cxx11_tensor_scan_gpu.cu
@@ -71,7 +71,7 @@ void test_gpu_cumsum(int m_size, int k_size, int n_size)
}
-void test_cxx11_tensor_scan_gpu()
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu)
{
CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));