aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--Eigen/Core69
-rw-r--r--Eigen/src/Core/GenericPacketMath.h8
-rw-r--r--Eigen/src/Core/MathFunctions.h61
-rw-r--r--Eigen/src/Core/ProductEvaluators.h6
-rw-r--r--Eigen/src/Core/arch/HIP/hcc/Half.h705
-rw-r--r--Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h1019
-rw-r--r--Eigen/src/Core/arch/HIP/hcc/TypeCasting.h212
-rw-r--r--Eigen/src/Core/arch/HIP/hcc/math_constants.h23
-rw-r--r--Eigen/src/Core/functors/BinaryFunctors.h6
-rwxr-xr-xEigen/src/Core/util/BlasUtil.h5
-rw-r--r--Eigen/src/Core/util/Macros.h5
-rw-r--r--Eigen/src/Core/util/Memory.h38
-rwxr-xr-xEigen/src/Core/util/Meta.h52
-rw-r--r--Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h1
-rw-r--r--Eigen/src/SVD/BDCSVD.h2
-rw-r--r--cmake/EigenTesting.cmake9
-rw-r--r--test/CMakeLists.txt42
-rw-r--r--test/hip_basic.cu172
-rw-r--r--test/hip_common.h103
-rw-r--r--test/main.h22
-rw-r--r--unsupported/Eigen/CXX11/Tensor40
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h10
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h5
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h1521
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h1119
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h15
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h352
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h16
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h5
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h6
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h2
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h2
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h5
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h2
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h24
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h815
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorScan.h9
-rw-r--r--unsupported/Eigen/CXX11/src/util/CXX11Meta.h12
-rw-r--r--unsupported/Eigen/CXX11/src/util/EmulateArray.h2
-rw-r--r--unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h4
-rw-r--r--unsupported/test/CMakeLists.txt52
-rw-r--r--unsupported/test/cxx11_tensor_argmax_hip.cu251
-rw-r--r--unsupported/test/cxx11_tensor_cast_float16_hip.cu79
-rw-r--r--unsupported/test/cxx11_tensor_contract_hip.cu215
-rw-r--r--unsupported/test/cxx11_tensor_device_hip.cu389
-rw-r--r--unsupported/test/cxx11_tensor_hip.cu1296
-rw-r--r--unsupported/test/cxx11_tensor_of_float16_hip.cu498
-rw-r--r--unsupported/test/cxx11_tensor_random_hip.cu85
-rw-r--r--unsupported/test/cxx11_tensor_reduction_hip.cu154
-rw-r--r--unsupported/test/cxx11_tensor_scan_hip.cu76
50 files changed, 9527 insertions, 94 deletions
diff --git a/Eigen/Core b/Eigen/Core
index f6bc18a08..c72d5468a 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -22,6 +22,17 @@
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+ // analogous to EIGEN_CUDACC, but for HIP
+ #define EIGEN_HIPCC __HIPCC__
+#endif
+
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+ #error "NVCC as the target platform for HIPCC is currently not supported."
+#endif
+
// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
@@ -32,8 +43,8 @@
#endif
// Handle NVCC/CUDA/SYCL
-#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
- // Do not try asserts on CUDA and SYCL!
+#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC)
+ // Do not try asserts on CUDA, HIP and SYCL!
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
#endif
@@ -57,6 +68,26 @@
// We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
// works properly on the device side
#include <cuda_runtime.h>
+
+ #elif defined(EIGEN_HIPCC)
+ // Do not try to vectorize on HIP
+ #ifndef EIGEN_DONT_VECTORIZE
+ #define EIGEN_DONT_VECTORIZE
+ #endif
+
+ #define EIGEN_DEVICE_FUNC __host__ __device__
+ // We need hip_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
+ // works properly on the device side
+ #include <hip/hip_runtime.h>
+
+ #if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP)
+ // analogous to EIGEN_CUDA_ARCH, but for HIP
+ #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+ // Note this check needs to come after we include hip_runtime.h since
+ // hip_runtime.h includes hip_common.h which in turn has the define
+ // for __HIP_DEVICE_COMPILE__
+ #endif
+
#else
#define EIGEN_DEVICE_FUNC
#endif
@@ -68,16 +99,16 @@
#define EIGEN_DONT_VECTORIZE
#endif
-// When compiling CUDA device code with NVCC, pull in math functions from the
-// global namespace. In host mode, and when device doee with clang, use the
-// std versions.
-#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
+// When compiling CUDA device code with NVCC, or HIP device code with HIPCC
+// pull in math functions from the global namespace. In host mode, and when
+// device doee with clang, use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIPCC__))
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
#else
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
#define EIGEN_EXCEPTIONS
#endif
@@ -270,6 +301,17 @@
#include <cuda_fp16.h>
#endif
+#if defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ #define EIGEN_HAS_HIP_FP16
+ #include <hip/hip_fp16.h>
+ #define HIP_PATCH_WITH_NEW_FP16 18215
+ #if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16)
+ #define EIGEN_HAS_OLD_HIP_FP16
+ // Old HIP implementation does not have a explicit typedef for "half2"
+ typedef __half2 half2;
+ #endif
+#endif
+
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
#define EIGEN_HAS_OPENMP
#endif
@@ -390,7 +432,6 @@ using std::ptrdiff_t;
#include "src/Core/util/IntegralConstant.h"
#include "src/Core/util/SymbolicIndex.h"
-
#include "src/Core/NumTraits.h"
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
@@ -434,9 +475,15 @@ using std::ptrdiff_t;
#endif
// Half float support
-#include "src/Core/arch/CUDA/Half.h"
-#include "src/Core/arch/CUDA/PacketMathHalf.h"
-#include "src/Core/arch/CUDA/TypeCasting.h"
+#if defined EIGEN_USE_HIP
+ #include "src/Core/arch/HIP/hcc/Half.h"
+ #include "src/Core/arch/HIP/hcc/PacketMathHalf.h"
+ #include "src/Core/arch/HIP/hcc/TypeCasting.h"
+#else
+ #include "src/Core/arch/CUDA/Half.h"
+ #include "src/Core/arch/CUDA/PacketMathHalf.h"
+ #include "src/Core/arch/CUDA/TypeCasting.h"
+#endif
#if defined EIGEN_VECTORIZE_CUDA
#include "src/Core/arch/CUDA/PacketMath.h"
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 888a3f7ea..0903c3a6e 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -299,7 +299,11 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
{ pstore(to, from); }
/** \internal tries to do cache prefetching of \a addr */
-template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
+template<typename Scalar>
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ inline void prefetch(const Scalar* addr)
{
#ifdef EIGEN_CUDA_ARCH
#if defined(__LP64__)
@@ -528,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
***************************************************************************/
// Eigen+CUDA does not support complexes.
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 05462c5e1..6beef5def 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
template<typename T>
struct real_impl<std::complex<T> >
{
@@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
template<typename T>
struct imag_impl<std::complex<T> >
{
@@ -260,7 +260,7 @@ struct conj_default_impl<Scalar,true>
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
template<typename T>
struct conj_impl<std::complex<T> >
{
@@ -435,7 +435,12 @@ struct round_retval
struct arg_impl {
static inline Scalar run(const Scalar& x)
{
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ // HIP does not seem to have a native device side implementation for the math routine "arg"
+ using std::arg;
+ #else
EIGEN_USING_STD_MATH(arg);
+ #endif
return arg(x);
}
};
@@ -768,7 +773,9 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isfinite_impl(const T& x)
{
- #ifdef EIGEN_CUDA_ARCH
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ return isfinite(x);
+ #elif defined(EIGEN_CUDA_ARCH)
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
@@ -783,7 +790,9 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isinf_impl(const T& x)
{
- #ifdef EIGEN_CUDA_ARCH
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ return isinf(x);
+ #elif defined(EIGEN_CUDA_ARCH)
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
@@ -798,7 +807,9 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isnan_impl(const T& x)
{
- #ifdef EIGEN_CUDA_ARCH
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ return isnan(x);
+ #elif defined(EIGEN_CUDA_ARCH)
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
@@ -864,7 +875,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
namespace numext {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) && !defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -1078,7 +1089,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log1p(const float &x) { return ::log1pf(x); }
@@ -1136,7 +1147,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float floor(const float &x) { return ::floorf(x); }
@@ -1157,7 +1168,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float ceil(const float &x) { return ::ceilf(x); }
@@ -1215,7 +1226,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log(const float &x) { return ::logf(x); }
@@ -1243,7 +1254,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }
@@ -1273,7 +1284,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float exp(const float &x) { return ::expf(x); }
@@ -1309,7 +1320,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float expm1(const float &x) { return ::expm1f(x); }
@@ -1329,7 +1340,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cos(const float &x) { return ::cosf(x); }
@@ -1349,7 +1360,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sin(const float &x) { return ::sinf(x); }
@@ -1369,7 +1380,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tan(const float &x) { return ::tanf(x); }
@@ -1400,7 +1411,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float acos(const float &x) { return ::acosf(x); }
@@ -1431,7 +1442,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float asin(const float &x) { return ::asinf(x); }
@@ -1462,7 +1473,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float atan(const float &x) { return ::atanf(x); }
@@ -1483,7 +1494,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cosh(const float &x) { return ::coshf(x); }
@@ -1503,7 +1514,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sinh(const float &x) { return ::sinhf(x); }
@@ -1521,12 +1532,12 @@ T tanh(const T &x) {
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
-#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
+#elif (!defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)) && EIGEN_FAST_MATH
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
#endif
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(const float &x) { return ::tanhf(x); }
@@ -1546,7 +1557,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float fmod(const float& a, const float& b) {
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index fb637191d..cc75fbce3 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -137,6 +137,9 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
static EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
@@ -390,6 +393,9 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst>
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
diff --git a/Eigen/src/Core/arch/HIP/hcc/Half.h b/Eigen/src/Core/arch/HIP/hcc/Half.h
new file mode 100644
index 000000000..2ce8a412c
--- /dev/null
+++ b/Eigen/src/Core/arch/HIP/hcc/Half.h
@@ -0,0 +1,705 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting from HIP's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the likes), but fast on GPUs.
+
+
+#ifndef EIGEN_HALF_HIP_H
+#define EIGEN_HALF_HIP_H
+
+#if __cplusplus > 199711L
+#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
+#else
+#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
+#endif
+
+
+namespace Eigen {
+
+struct half;
+
+namespace half_impl {
+
+#if !defined(EIGEN_HAS_HIP_FP16)
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+ EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+ unsigned short x;
+};
+#elif defined(EIGEN_HAS_OLD_HIP_FP16)
+// Make a __half_raw definition that is
+// ++ compatible with that of Eigen and
+// ++ add a implcit conversion to the native __half of the old HIP implementation.
+//
+// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
+//
+// In the old HIP implementation,
+// ++ __half is a typedef of __fp16
+// ++ the "__h*" routines take "__half" arguments
+// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
+// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine
+struct __half_raw {
+ EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+ union {
+ unsigned short x;
+ __half data;
+ };
+ operator __half(void) const { return data; }
+};
+#endif
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
+
+struct half_base : public __half_raw {
+ EIGEN_DEVICE_FUNC half_base() {}
+ EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+ EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+#if defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+ EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
+ #else
+ EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+ #endif
+#endif
+};
+
+} // namespace half_impl
+
+// Class definition.
+struct half : public half_impl::half_base {
+ #if !defined(EIGEN_HAS_HIP_FP16) || defined(EIGEN_HAS_OLD_HIP_FP16)
+ typedef half_impl::__half_raw __half_raw;
+ #endif
+
+ EIGEN_DEVICE_FUNC half() {}
+
+ EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
+ EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_HIP_FP16)
+ EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
+
+ explicit EIGEN_DEVICE_FUNC half(bool b)
+ : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+ template<class T>
+ explicit EIGEN_DEVICE_FUNC half(const T& val)
+ : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
+ explicit EIGEN_DEVICE_FUNC half(float f)
+ : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+ // +0.0 and -0.0 become false, everything else becomes true.
+ return (x & 0x7fff) != 0;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+ return static_cast<signed char>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+ return static_cast<unsigned char>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+ return static_cast<short>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+ return static_cast<unsigned short>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+ return static_cast<int>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+ return static_cast<unsigned int>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+ return static_cast<long>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+ return static_cast<unsigned long>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+ return static_cast<long long>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+ return static_cast<unsigned long long>(half_impl::half_to_float(*this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+ return half_impl::half_to_float(*this);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
+ return static_cast<double>(half_impl::half_to_float(*this));
+ }
+
+ EIGEN_DEVICE_FUNC half& operator=(const half& other) {
+ x = other.x;
+ return *this;
+ }
+};
+
+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+ static const bool is_specialized = true;
+ static const bool is_signed = true;
+ static const bool is_integer = false;
+ static const bool is_exact = false;
+ static const bool has_infinity = true;
+ static const bool has_quiet_NaN = true;
+ static const bool has_signaling_NaN = true;
+ static const float_denorm_style has_denorm = denorm_present;
+ static const bool has_denorm_loss = false;
+ static const std::float_round_style round_style = std::round_to_nearest;
+ static const bool is_iec559 = false;
+ static const bool is_bounded = false;
+ static const bool is_modulo = false;
+ static const int digits = 11;
+ static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int radix = 2;
+ static const int min_exponent = -13;
+ static const int min_exponent10 = -4;
+ static const int max_exponent = 16;
+ static const int max_exponent10 = 4;
+ static const bool traps = true;
+ static const bool tinyness_before = false;
+
+ static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+ static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+ static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+ static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+ static Eigen::half round_error() { return Eigen::half(0.5); }
+ static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+ static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
+
+namespace Eigen {
+
+namespace half_impl {
+
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get the ALU speed increased), but you do save the
+// conversion steps back and forth.
+
+EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+ return __hadd(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
+ return __hmul(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
+ return __hsub(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+ float num = __half2float(a);
+ float denom = __half2float(b);
+ return __float2half(num / denom);
+}
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
+ return __hneg(a);
+}
+EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
+ a = a + b;
+ return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
+ a = a * b;
+ return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
+ a = a - b;
+ return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
+ a = a / b;
+ return a;
+}
+EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
+ return __heq(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
+ return __hne(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
+ return __hlt(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
+ return __hle(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
+ return __hgt(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
+ return __hge(a, b);
+}
+
+#else // Emulate support for half floats
+
+// Definitions for CPUs mostly working through conversion to/from fp32.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+ return half(float(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+ return half(float(a) * float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+ return half(float(a) - float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+ return half(float(a) / float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+ half result;
+ result.x = a.x ^ 0x8000;
+ return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+ a = half(float(a) + float(b));
+ return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+ a = half(float(a) * float(b));
+ return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+ a = half(float(a) - float(b));
+ return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+ a = half(float(a) / float(b));
+ return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+ return numext::equal_strict(float(a),float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+ return numext::not_equal_strict(float(a), float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+ return float(a) < float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+ return float(a) <= float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+ return float(a) > float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+ return float(a) >= float(b);
+}
+
+#endif // Emulate support for half floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+ return half(static_cast<float>(a) / static_cast<float>(b));
+}
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, they are
+// also possible to vectorize directly.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+ __half_raw h;
+ h.x = x;
+ return h;
+}
+
+union FP32 {
+ unsigned int u;
+ float f;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ __half tmp_ff = __float2half(ff);
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+ __half_raw h;
+ h.data = tmp_ff;
+ return h;
+ #else
+ return *(__half_raw*)&tmp_ff;
+ #endif
+
+#elif defined(EIGEN_HAS_FP16_C)
+ __half_raw h;
+ h.x = _cvtss_sh(ff, 0);
+ return h;
+
+#else
+ FP32 f; f.f = ff;
+
+ const FP32 f32infty = { 255 << 23 };
+ const FP32 f16max = { (127 + 16) << 23 };
+ const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+ unsigned int sign_mask = 0x80000000u;
+ __half_raw o;
+ o.x = static_cast<unsigned short>(0x0u);
+
+ unsigned int sign = f.u & sign_mask;
+ f.u ^= sign;
+
+ // NOTE all the integer compares in this function can be safely
+ // compiled into signed compares since all operands are below
+ // 0x80000000. Important if you want fast straight SSE2 code
+ // (since there's no unsigned PCMPGTD).
+
+ if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
+ o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+ } else { // (De)normalized number or zero
+ if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
+ // use a magic value to align our 10 mantissa bits at the bottom of
+ // the float. as long as FP addition is round-to-nearest-even this
+ // just works.
+ f.f += denorm_magic.f;
+
+ // and one integer subtract of the bias later, we have our final float!
+ o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
+ } else {
+ unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+ // update exponent, rounding bias part 1
+ f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
+ // rounding bias part 2
+ f.u += mant_odd;
+ // take the bits!
+ o.x = static_cast<unsigned short>(f.u >> 13);
+ }
+ }
+
+ o.x |= static_cast<unsigned short>(sign >> 16);
+ return o;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return __half2float(h);
+
+#elif defined(EIGEN_HAS_FP16_C)
+ return _cvtsh_ss(h.x);
+
+#else
+ const FP32 magic = { 113 << 23 };
+ const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+ FP32 o;
+
+ o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
+ unsigned int exp = shifted_exp & o.u; // just the exponent
+ o.u += (127 - 15) << 23; // exponent adjust
+
+ // handle exponent special cases
+ if (exp == shifted_exp) { // Inf/NaN?
+ o.u += (128 - 16) << 23; // extra exp adjust
+ } else if (exp == 0) { // Zero/Denormal?
+ o.u += 1 << 23; // extra exp adjust
+ o.f -= magic.f; // renormalize
+ }
+
+ o.u |= (h.x & 0x8000) << 16; // sign bit
+ return o.f;
+#endif
+}
+
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
+ return (a.x & 0x7fff) == 0x7c00;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return __hisnan(a);
+#else
+ return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
+ return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
+ half result;
+ result.x = a.x & 0x7FFF;
+ return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return half(hexp(a));
+#else
+ return half(::expf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
+ return half(numext::expm1(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return half(hlog(a));
+#else
+ return half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
+ return half(numext::log1p(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
+ return half(::log10f(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return half(hsqrt(a));
+#else
+ return half(::sqrtf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
+ return half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+ return half(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+ return half(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+ return half(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+ return half(::tanhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return half(hfloor(a));
+#else
+ return half(::floorf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return half(hceil(a));
+#else
+ return half(::ceilf(float(a)));
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return __hlt(b, a) ? b : a;
+#else
+ const float f1 = static_cast<float>(a);
+ const float f2 = static_cast<float>(b);
+ return f2 < f1 ? b : a;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return __hlt(a, b) ? b : a;
+#else
+ const float f1 = static_cast<float>(a);
+ const float f2 = static_cast<float>(b);
+ return f1 < f2 ? b : a;
+#endif
+}
+
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+ os << static_cast<float>(v);
+ return os;
+}
+
+} // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template<>
+struct random_default_impl<half, false, false>
+{
+ static inline half run(const half& x, const half& y)
+ {
+ return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+ }
+ static inline half run()
+ {
+ return run(half(-1.f), half(1.f));
+ }
+};
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+template<> struct NumTraits<Eigen::half>
+ : GenericNumTraits<Eigen::half>
+{
+ enum {
+ IsSigned = true,
+ IsInteger = false,
+ IsComplex = false,
+ RequireInitialization = false
+ };
+
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+ return half_impl::raw_uint16_to_half(0x0800);
+ }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
+ return half_impl::raw_uint16_to_half(0x7bff);
+ }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
+ return half_impl::raw_uint16_to_half(0xfbff);
+ }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+ return half_impl::raw_uint16_to_half(0x7c00);
+ }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+ return half_impl::raw_uint16_to_half(0x7c01);
+ }
+};
+
+} // end namespace Eigen
+
+// C-like standard mathematical functions and trancendentals.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
+ Eigen::half result;
+ result.x = a.x & 0x7FFF;
+ return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
+ return Eigen::half(::expf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return Eigen::half(hlog(a));
+#else
+ return Eigen::half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
+ return Eigen::half(::sqrtf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
+ return Eigen::half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
+ return Eigen::half(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
+ return Eigen::half(::ceilf(float(a)));
+}
+
+namespace std {
+
+#if __cplusplus > 199711L
+template <>
+struct hash<Eigen::half> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
+ return static_cast<std::size_t>(a.x);
+ }
+};
+#endif
+
+} // end namespace std
+
+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_HAS_HIP_FP16) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+ // FIXME
+ //return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+ return var;
+}
+#endif
+
+// ldg() has an overload for __half, but we also need one for Eigen::half.
+#if defined(EIGEN_HAS_HIP_FP16) && \
+ defined(__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__) && defined(__HIP_ARCH_HAS_DYNAMIC_PARALLEL__)
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
+ // FIXME
+ //return Eigen::half_impl::raw_uint16_to_half(
+ // __ldg(reinterpret_cast<const unsigned short*>(ptr)));
+ return *ptr;
+}
+#endif
+
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+namespace Eigen {
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isnan)(const Eigen::half& h) {
+ return (half_impl::isnan)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isinf)(const Eigen::half& h) {
+ return (half_impl::isinf)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const Eigen::half& h) {
+ return (half_impl::isfinite)(h);
+}
+
+} // namespace Eigen
+} // namespace numext
+#endif
+
+#endif // EIGEN_HALF_HIP_H
diff --git a/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h b/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h
new file mode 100644
index 000000000..29c3f4671
--- /dev/null
+++ b/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h
@@ -0,0 +1,1019 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_HALF_HIP_H
+#define EIGEN_PACKET_MATH_HALF_HIP_H
+
+
+namespace Eigen {
+namespace internal {
+
+// Most of the following operations require arch >= 3.0
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<Eigen::half> : default_packet_traits
+{
+ typedef half2 type;
+ typedef half2 half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size=2,
+ HasHalfPacket = 0,
+ HasAdd = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasExp = 1,
+ HasExpm1 = 1,
+ HasLog = 1,
+ HasLog1p = 1
+ };
+};
+
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return half2half2(from);
+#else
+ return __half2half2(from);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+ return *reinterpret_cast<const half2*>(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+ return __halves2half2(from[0], from[1]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
+ return __halves2half2(from[0], from[0]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+ *reinterpret_cast<half2*>(to) = from;
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+ to[0] = __low2half(from);
+ to[1] = __high2half(from);
+}
+
+template<>
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return __halves2half2((*(from+0)), (*(from+1)));
+#else
+ return __ldg((const half2*)from);
+#endif
+}
+
+template<>
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return __halves2half2((*(from+0)), (*(from+1)));
+#else
+ return __halves2half2(__ldg(from+0), __ldg(from+1));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+ return __halves2half2(from[0*stride], from[1*stride]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+ to[stride*0] = __low2half(from);
+ to[stride*1] = __high2half(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+ return __low2half(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+ __half x = __ushort_as_half(__half_as_ushort(__low2half(a)) & 0x7FFF);
+ __half y = __ushort_as_half(__half_as_ushort(__high2half(a)) & 0x7FFF);
+ return __halves2half2(x, y);
+}
+
+
+__device__ EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<half2,2>& kernel) {
+ __half a1 = __low2half(kernel.packet[0]);
+ __half a2 = __high2half(kernel.packet[0]);
+ __half b1 = __low2half(kernel.packet[1]);
+ __half b2 = __high2half(kernel.packet[1]);
+ kernel.packet[0] = __halves2half2(a1, b1);
+ kernel.packet[1] = __halves2half2(a2, b2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+ return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+ return __hadd2(a, b);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+ return __hsub2(a, b);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+ return __hneg2(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+ return __hmul2(a, b);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+ return __hfma2(a, b, c);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return h2div(a, b);
+#else
+ return __h2div(a, b);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+ __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+ return __halves2half2(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+ __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+ return __halves2half2(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+ return __hadd(__low2half(a), __high2half(a));
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+ __half first = __low2half(a);
+ __half second = __high2half(a);
+ return __hgt(first, second) ? first : second;
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+ __half first = __low2half(a);
+ __half second = __high2half(a);
+ return __hlt(first, second) ? first : second;
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+ return __hmul(__low2half(a), __high2half(a));
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float r1 = log1pf(a1);
+ float r2 = log1pf(a2);
+ return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float r1 = expm1f(a1);
+ float r2 = expm1f(a2);
+ return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 plog<half2>(const half2& a) {
+ return h2log(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 pexp<half2>(const half2& a) {
+ return h2exp(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 psqrt<half2>(const half2& a) {
+ return h2sqrt(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 prsqrt<half2>(const half2& a) {
+ return h2rsqrt(a);
+}
+
+#elif defined EIGEN_VECTORIZE_AVX512
+
+typedef struct {
+ __m256i x;
+} Packet16h;
+
+
+template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+ typedef Packet16h type;
+ // There is no half-size packet for Packet16h.
+ typedef Packet16h half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 16,
+ HasHalfPacket = 0,
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasDiv = 0,
+ HasSqrt = 0,
+ HasRsqrt = 0,
+ HasExp = 0,
+ HasLog = 0,
+ HasBlend = 0
+ };
+};
+
+
+template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
+
+template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+ Packet16h result;
+ result.x = _mm256_set1_epi16(from.x);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+ return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+ Packet16h result;
+ result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+ Packet16h result;
+ result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+ _mm256_store_si256((__m256i*)to, from.x);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+ _mm256_storeu_si256((__m256i*)to, from.x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploadquad(const Eigen::half* from) {
+ Packet16h result;
+ unsigned short a = from[0].x;
+ unsigned short b = from[1].x;
+ unsigned short c = from[2].x;
+ unsigned short d = from[3].x;
+ result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+ return result;
+}
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
+#ifdef EIGEN_HAS_FP16_C
+ return _mm512_cvtph_ps(a.x);
+#else
+ EIGEN_ALIGN64 half aux[16];
+ pstore(aux, a);
+ float f0(aux[0]);
+ float f1(aux[1]);
+ float f2(aux[2]);
+ float f3(aux[3]);
+ float f4(aux[4]);
+ float f5(aux[5]);
+ float f6(aux[6]);
+ float f7(aux[7]);
+ float f8(aux[8]);
+ float f9(aux[9]);
+ float fa(aux[10]);
+ float fb(aux[11]);
+ float fc(aux[12]);
+ float fd(aux[13]);
+ float fe(aux[14]);
+ float ff(aux[15]);
+
+ return _mm512_set_ps(
+ ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
+#ifdef EIGEN_HAS_FP16_C
+ Packet16h result;
+ result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+ return result;
+#else
+ EIGEN_ALIGN64 float aux[16];
+ pstore(aux, a);
+ half h0(aux[0]);
+ half h1(aux[1]);
+ half h2(aux[2]);
+ half h3(aux[3]);
+ half h4(aux[4]);
+ half h5(aux[5]);
+ half h6(aux[6]);
+ half h7(aux[7]);
+ half h8(aux[8]);
+ half h9(aux[9]);
+ half ha(aux[10]);
+ half hb(aux[11]);
+ half hc(aux[12]);
+ half hd(aux[13]);
+ half he(aux[14]);
+ half hf(aux[15]);
+
+ Packet16h result;
+ result.x = _mm256_set_epi16(
+ hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
+ h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+ return result;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = padd(af, bf);
+ return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = pmul(af, bf);
+ return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
+ Packet16f from_float = half2float(from);
+ return half(predux(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
+{
+ Packet16h result;
+ result.x = _mm256_set_epi16(
+ from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
+ from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
+ from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
+ from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
+{
+ EIGEN_ALIGN64 half aux[16];
+ pstore(aux, from);
+ to[stride*0].x = aux[0].x;
+ to[stride*1].x = aux[1].x;
+ to[stride*2].x = aux[2].x;
+ to[stride*3].x = aux[3].x;
+ to[stride*4].x = aux[4].x;
+ to[stride*5].x = aux[5].x;
+ to[stride*6].x = aux[6].x;
+ to[stride*7].x = aux[7].x;
+ to[stride*8].x = aux[8].x;
+ to[stride*9].x = aux[9].x;
+ to[stride*10].x = aux[10].x;
+ to[stride*11].x = aux[11].x;
+ to[stride*12].x = aux[12].x;
+ to[stride*13].x = aux[13].x;
+ to[stride*14].x = aux[14].x;
+ to[stride*15].x = aux[15].x;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,16>& kernel) {
+ __m256i a = kernel.packet[0].x;
+ __m256i b = kernel.packet[1].x;
+ __m256i c = kernel.packet[2].x;
+ __m256i d = kernel.packet[3].x;
+ __m256i e = kernel.packet[4].x;
+ __m256i f = kernel.packet[5].x;
+ __m256i g = kernel.packet[6].x;
+ __m256i h = kernel.packet[7].x;
+ __m256i i = kernel.packet[8].x;
+ __m256i j = kernel.packet[9].x;
+ __m256i k = kernel.packet[10].x;
+ __m256i l = kernel.packet[11].x;
+ __m256i m = kernel.packet[12].x;
+ __m256i n = kernel.packet[13].x;
+ __m256i o = kernel.packet[14].x;
+ __m256i p = kernel.packet[15].x;
+
+ __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+ __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+ __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+ __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+ __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+ __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+ __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+ __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+ __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+ __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+ __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+ __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+ __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+ __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+ __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+ __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+ __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+ __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+ __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+ __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+ __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+ __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+ __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+ __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+ __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+ __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+ __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+ __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+ __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+ __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+ __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+ __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+ __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+ __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+ __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+ __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+ __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+ __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+ __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+ __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+ __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+ __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+ __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+ __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+ __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+ __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+ __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+ __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+ // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+ __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+ __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+ __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+ __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+ __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+ __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+ __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+ __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+ __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+ __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+ __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+ __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+ __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+ __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+ __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+ kernel.packet[0].x = a_p_0;
+ kernel.packet[1].x = a_p_1;
+ kernel.packet[2].x = a_p_2;
+ kernel.packet[3].x = a_p_3;
+ kernel.packet[4].x = a_p_4;
+ kernel.packet[5].x = a_p_5;
+ kernel.packet[6].x = a_p_6;
+ kernel.packet[7].x = a_p_7;
+ kernel.packet[8].x = a_p_8;
+ kernel.packet[9].x = a_p_9;
+ kernel.packet[10].x = a_p_a;
+ kernel.packet[11].x = a_p_b;
+ kernel.packet[12].x = a_p_c;
+ kernel.packet[13].x = a_p_d;
+ kernel.packet[14].x = a_p_e;
+ kernel.packet[15].x = a_p_f;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,8>& kernel) {
+ EIGEN_ALIGN64 half in[8][16];
+ pstore<half>(in[0], kernel.packet[0]);
+ pstore<half>(in[1], kernel.packet[1]);
+ pstore<half>(in[2], kernel.packet[2]);
+ pstore<half>(in[3], kernel.packet[3]);
+ pstore<half>(in[4], kernel.packet[4]);
+ pstore<half>(in[5], kernel.packet[5]);
+ pstore<half>(in[6], kernel.packet[6]);
+ pstore<half>(in[7], kernel.packet[7]);
+
+ EIGEN_ALIGN64 half out[8][16];
+
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ out[i][j] = in[j][2*i];
+ }
+ for (int j = 0; j < 8; ++j) {
+ out[i][j+8] = in[j][2*i+1];
+ }
+ }
+
+ kernel.packet[0] = pload<Packet16h>(out[0]);
+ kernel.packet[1] = pload<Packet16h>(out[1]);
+ kernel.packet[2] = pload<Packet16h>(out[2]);
+ kernel.packet[3] = pload<Packet16h>(out[3]);
+ kernel.packet[4] = pload<Packet16h>(out[4]);
+ kernel.packet[5] = pload<Packet16h>(out[5]);
+ kernel.packet[6] = pload<Packet16h>(out[6]);
+ kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet16h,4>& kernel) {
+ EIGEN_ALIGN64 half in[4][16];
+ pstore<half>(in[0], kernel.packet[0]);
+ pstore<half>(in[1], kernel.packet[1]);
+ pstore<half>(in[2], kernel.packet[2]);
+ pstore<half>(in[3], kernel.packet[3]);
+
+ EIGEN_ALIGN64 half out[4][16];
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ out[i][j] = in[j][4*i];
+ }
+ for (int j = 0; j < 4; ++j) {
+ out[i][j+4] = in[j][4*i+1];
+ }
+ for (int j = 0; j < 4; ++j) {
+ out[i][j+8] = in[j][4*i+2];
+ }
+ for (int j = 0; j < 4; ++j) {
+ out[i][j+12] = in[j][4*i+3];
+ }
+ }
+
+ kernel.packet[0] = pload<Packet16h>(out[0]);
+ kernel.packet[1] = pload<Packet16h>(out[1]);
+ kernel.packet[2] = pload<Packet16h>(out[2]);
+ kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+
+#elif defined EIGEN_VECTORIZE_AVX
+
+typedef struct {
+ __m128i x;
+} Packet8h;
+
+
+template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+ typedef Packet8h type;
+ // There is no half-size packet for Packet8h.
+ typedef Packet8h half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 8,
+ HasHalfPacket = 0,
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasDiv = 0,
+ HasSqrt = 0,
+ HasRsqrt = 0,
+ HasExp = 0,
+ HasLog = 0,
+ HasBlend = 0
+ };
+};
+
+
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
+
+template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+ Packet8h result;
+ result.x = _mm_set1_epi16(from.x);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+ return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+ Packet8h result;
+ result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+ Packet8h result;
+ result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+ _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
+ploadquad<Packet8h>(const Eigen::half* from) {
+ Packet8h result;
+ unsigned short a = from[0].x;
+ unsigned short b = from[1].x;
+ result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
+ return result;
+}
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
+#ifdef EIGEN_HAS_FP16_C
+ return _mm256_cvtph_ps(a.x);
+#else
+ EIGEN_ALIGN32 Eigen::half aux[8];
+ pstore(aux, a);
+ float f0(aux[0]);
+ float f1(aux[1]);
+ float f2(aux[2]);
+ float f3(aux[3]);
+ float f4(aux[4]);
+ float f5(aux[5]);
+ float f6(aux[6]);
+ float f7(aux[7]);
+
+ return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
+#ifdef EIGEN_HAS_FP16_C
+ Packet8h result;
+ result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+ return result;
+#else
+ EIGEN_ALIGN32 float aux[8];
+ pstore(aux, a);
+ Eigen::half h0(aux[0]);
+ Eigen::half h1(aux[1]);
+ Eigen::half h2(aux[2]);
+ Eigen::half h3(aux[3]);
+ Eigen::half h4(aux[4]);
+ Eigen::half h5(aux[5]);
+ Eigen::half h6(aux[6]);
+ Eigen::half h7(aux[7]);
+
+ Packet8h result;
+ result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+ return result;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = padd(af, bf);
+ return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = pmul(af, bf);
+ return float2half(rf);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
+{
+ Packet8h result;
+ result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
+{
+ EIGEN_ALIGN32 Eigen::half aux[8];
+ pstore(aux, from);
+ to[stride*0].x = aux[0].x;
+ to[stride*1].x = aux[1].x;
+ to[stride*2].x = aux[2].x;
+ to[stride*3].x = aux[3].x;
+ to[stride*4].x = aux[4].x;
+ to[stride*5].x = aux[5].x;
+ to[stride*6].x = aux[6].x;
+ to[stride*7].x = aux[7].x;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
+ Packet8f af = half2float(a);
+ float reduced = predux<Packet8f>(af);
+ return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
+ Packet8f af = half2float(a);
+ float reduced = predux_max<Packet8f>(af);
+ return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
+ Packet8f af = half2float(a);
+ float reduced = predux_min<Packet8f>(af);
+ return Eigen::half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
+ Packet8f af = half2float(a);
+ float reduced = predux_mul<Packet8f>(af);
+ return Eigen::half(reduced);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,8>& kernel) {
+ __m128i a = kernel.packet[0].x;
+ __m128i b = kernel.packet[1].x;
+ __m128i c = kernel.packet[2].x;
+ __m128i d = kernel.packet[3].x;
+ __m128i e = kernel.packet[4].x;
+ __m128i f = kernel.packet[5].x;
+ __m128i g = kernel.packet[6].x;
+ __m128i h = kernel.packet[7].x;
+
+ __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+ __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+ __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+ __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+ __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+ __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+ __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+ __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+ __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+ __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+ __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+ __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+ __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+ __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+ __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+ __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+ __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+ __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+ __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+ __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+ __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+ __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+ __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+ __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+ kernel.packet[0].x = a0b0c0d0e0f0g0h0;
+ kernel.packet[1].x = a1b1c1d1e1f1g1h1;
+ kernel.packet[2].x = a2b2c2d2e2f2g2h2;
+ kernel.packet[3].x = a3b3c3d3e3f3g3h3;
+ kernel.packet[4].x = a4b4c4d4e4f4g4h4;
+ kernel.packet[5].x = a5b5c5d5e5f5g5h5;
+ kernel.packet[6].x = a6b6c6d6e6f6g6h6;
+ kernel.packet[7].x = a7b7c7d7e7f7g7h7;
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,4>& kernel) {
+ EIGEN_ALIGN32 Eigen::half in[4][8];
+ pstore<Eigen::half>(in[0], kernel.packet[0]);
+ pstore<Eigen::half>(in[1], kernel.packet[1]);
+ pstore<Eigen::half>(in[2], kernel.packet[2]);
+ pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+ EIGEN_ALIGN32 Eigen::half out[4][8];
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ out[i][j] = in[j][2*i];
+ }
+ for (int j = 0; j < 4; ++j) {
+ out[i][j+4] = in[j][2*i+1];
+ }
+ }
+
+ kernel.packet[0] = pload<Packet8h>(out[0]);
+ kernel.packet[1] = pload<Packet8h>(out[1]);
+ kernel.packet[2] = pload<Packet8h>(out[2]);
+ kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#elif 0
+
+typedef struct {
+ __m64 x;
+} Packet4h;
+
+
+template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+ typedef Packet4h type;
+ // There is no half-size packet for Packet4h.
+ typedef Packet4h half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 4,
+ HasHalfPacket = 0,
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasDiv = 0,
+ HasSqrt = 0,
+ HasRsqrt = 0,
+ HasExp = 0,
+ HasLog = 0,
+ HasBlend = 0
+ };
+};
+
+
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
+
+template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
+ Packet4h result;
+ result.x = _mm_set1_pi16(from.x);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
+ return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
+ __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+ Eigen::half h[4];
+
+ Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+ Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+ h[0] = ha + hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+ h[1] = ha + hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+ h[2] = ha + hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+ h[3] = ha + hb;
+ Packet4h result;
+ result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
+ __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+ Eigen::half h[4];
+
+ Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+ Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+ h[0] = ha * hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+ h[1] = ha * hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+ h[2] = ha * hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+ h[3] = ha * hb;
+ Packet4h result;
+ result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
+ Packet4h result;
+ result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
+ Packet4h result;
+ result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+ __int64_t r = _mm_cvtm64_si64(from.x);
+ *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+ __int64_t r = _mm_cvtm64_si64(from.x);
+ *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h
+ploadquad<Packet4h>(const Eigen::half* from) {
+ return pset1<Packet4h>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
+{
+ Packet4h result;
+ result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
+{
+ __int64_t a = _mm_cvtm64_si64(from.x);
+ to[stride*0].x = static_cast<unsigned short>(a);
+ to[stride*1].x = static_cast<unsigned short>(a >> 16);
+ to[stride*2].x = static_cast<unsigned short>(a >> 32);
+ to[stride*3].x = static_cast<unsigned short>(a >> 48);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h,4>& kernel) {
+ __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
+ __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
+ __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
+ __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
+
+ kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
+ kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
+ kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
+ kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
+}
+
+#endif
+
+}
+}
+
+#endif // EIGEN_PACKET_MATH_HALF_HIP_H
diff --git a/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h b/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h
new file mode 100644
index 000000000..915266a9d
--- /dev/null
+++ b/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h
@@ -0,0 +1,212 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_HIP_H
+#define EIGEN_TYPE_CASTING_HIP_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<>
+struct scalar_cast_op<float, Eigen::half> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+ typedef Eigen::half result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
+ #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return __float2half(a);
+ #else
+ return Eigen::half(a);
+ #endif
+ }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<float, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<int, Eigen::half> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+ typedef Eigen::half result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
+ #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return __float2half(static_cast<float>(a));
+ #else
+ return Eigen::half(static_cast<float>(a));
+ #endif
+ }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<int, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+template<>
+struct scalar_cast_op<Eigen::half, float> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+ typedef float result_type;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
+ #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+ return __half2float(a);
+ #else
+ return static_cast<float>(a);
+ #endif
+ }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<Eigen::half, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
+
+
+
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 2,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+ float2 r1 = __half22float2(a);
+ float2 r2 = __half22float2(b);
+ return make_float4(r1.x, r1.y, r2.x, r2.y);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 2
+ };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+ // Simply discard the second half of the input
+ return __floats2half2_rn(a.x, a.y);
+}
+
+#elif defined EIGEN_VECTORIZE_AVX512
+template <>
+struct type_casting_traits<half, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+ return half2float(a);
+}
+
+template <>
+struct type_casting_traits<float, half> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+ return float2half(a);
+}
+
+#elif defined EIGEN_VECTORIZE_AVX
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+ return half2float(a);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+ return float2half(a);
+}
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#elif 0
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
+ Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
+ float f1 = static_cast<float>(h);
+ h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+ float f2 = static_cast<float>(h);
+ h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+ float f3 = static_cast<float>(h);
+ h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+ float f4 = static_cast<float>(h);
+ return _mm_set_ps(f4, f3, f2, f1);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
+ EIGEN_ALIGN16 float aux[4];
+ pstore(aux, a);
+ Eigen::half h0(aux[0]);
+ Eigen::half h1(aux[1]);
+ Eigen::half h2(aux[2]);
+ Eigen::half h3(aux[3]);
+
+ Packet4h result;
+ result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
+ return result;
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_HIP_H
diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
new file mode 100644
index 000000000..25375a0a4
--- /dev/null
+++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
@@ -0,0 +1,23 @@
+/*
+ * math_constants.h -
+ * HIP equivalent of the CUDA header of the same name
+ */
+
+#ifndef __MATH_CONSTANTS_H__
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants */
+
+#define HIPRT_INF_F __int_as_float(0x7f800000)
+#define HIPRT_NAN_F __int_as_float(0x7fffffff)
+#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
+#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
+#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
+#define HIPRT_ZERO_F 0.0f
+#define HIPRT_ONE_F 1.0f
+
+/* double precision constants */
+#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
+#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
+
+#endif
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 3eae6b8ca..e269140bd 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -436,6 +436,9 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC explicit
+ #endif
bind1st_op(const first_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
@@ -455,6 +458,9 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC explicit
+ #endif
bind2nd_op(const second_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index b1791fb3a..a4cde6d95 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -163,7 +163,10 @@ class BlasLinearMapper {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 9c68ecb7d..c6e27f6af 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -1008,9 +1008,12 @@ namespace Eigen {
# define EIGEN_TRY try
# define EIGEN_CATCH(X) catch (X)
#else
-# ifdef EIGEN_CUDA_ARCH
+# if defined(EIGEN_CUDA_ARCH)
# define EIGEN_THROW_X(X) asm("trap;")
# define EIGEN_THROW asm("trap;")
+# elif defined(EIGEN_HIP_DEVICE_COMPILE)
+# define EIGEN_THROW_X(X) asm("s_trap 0")
+# define EIGEN_THROW asm("s_trap 0")
# else
# define EIGEN_THROW_X(X) std::abort()
# define EIGEN_THROW std::abort()
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 53300c388..87fcc30f5 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -70,7 +70,20 @@ inline void throw_std_bad_alloc()
throw std::bad_alloc();
#else
std::size_t huge = static_cast<std::size_t>(-1);
+ #if defined(EIGEN_HIPCC)
+ //
+ // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
+ // and as a consequence the code in the #else block triggers the hipcc warning :
+ // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+ //
+ // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+ // the same on "operator new"
+ // Reverting code back to the old version in this #if block for the hipcc compiler
+ //
+ new int[huge];
+ #else
::operator new(huge);
+ #endif
#endif
}
@@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
void *result;
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ result = aligned_malloc(size);
+ #else
result = std::malloc(size);
+ #endif
+
#if EIGEN_DEFAULT_ALIGN_BYTES==16
eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator.");
#endif
@@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
{
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ aligned_free(ptr);
+ #else
std::free(ptr);
+ #endif
+
#else
handmade_aligned_free(ptr);
#endif
@@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
{
check_that_malloc_is_allowed();
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ void *result = aligned_malloc(size);
+ #else
void *result = std::malloc(size);
+ #endif
+
if(!result && size)
throw_std_bad_alloc();
return result;
@@ -232,7 +262,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
{
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ aligned_free(ptr);
+ #else
std::free(ptr);
+ #endif
}
template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@@ -493,7 +527,11 @@ template<typename T> struct smart_copy_helper<T,true> {
IntPtr size = IntPtr(end)-IntPtr(start);
if(size==0) return;
eigen_internal_assert(start!=0 && end!=0 && target!=0);
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ ::memcpy(target, start, size);
+ #else
std::memcpy(target, start, size);
+ #endif
}
};
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 6e5af35c0..ca6fa6ce9 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -16,6 +16,12 @@
#include <math_constants.h>
#endif
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#include <cfloat>
+#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+#endif
+
+
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
#include <cstdint>
#endif
@@ -175,7 +181,7 @@ template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T>
{ typedef T type; };
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON
@@ -197,13 +203,31 @@ template<> struct numeric_limits<float>
EIGEN_DEVICE_FUNC
static float epsilon() { return __FLT_EPSILON__; }
EIGEN_DEVICE_FUNC
- static float (max)() { return CUDART_MAX_NORMAL_F; }
+ static float (max)() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_MAX_NORMAL_F;
+ #else
+ return HIPRT_MAX_NORMAL_F;
+ #endif
+ }
EIGEN_DEVICE_FUNC
static float (min)() { return FLT_MIN; }
EIGEN_DEVICE_FUNC
- static float infinity() { return CUDART_INF_F; }
+ static float infinity() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_INF_F;
+ #else
+ return HIPRT_INF_F;
+ #endif
+ }
EIGEN_DEVICE_FUNC
- static float quiet_NaN() { return CUDART_NAN_F; }
+ static float quiet_NaN() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_NAN_F;
+ #else
+ return HIPRT_NAN_F;
+ #endif
+ }
};
template<> struct numeric_limits<double>
{
@@ -214,9 +238,21 @@ template<> struct numeric_limits<double>
EIGEN_DEVICE_FUNC
static double (min)() { return DBL_MIN; }
EIGEN_DEVICE_FUNC
- static double infinity() { return CUDART_INF; }
+ static double infinity() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_INF;
+ #else
+ return HIPRT_INF;
+ #endif
+ }
EIGEN_DEVICE_FUNC
- static double quiet_NaN() { return CUDART_NAN; }
+ static double quiet_NaN() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_NAN;
+ #else
+ return HIPRT_NAN;
+ #endif
+ }
};
template<> struct numeric_limits<int>
{
@@ -529,13 +565,13 @@ template<typename T, typename U> struct scalar_product_traits
namespace numext {
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
using internal::device::numeric_limits;
#else
using std::numeric_limits;
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 040f8d3bb..bf28edc0e 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -354,6 +354,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
static const int m_maxIterations = 30;
protected:
+ EIGEN_DEVICE_FUNC
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index a24deb96a..e977b9623 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -1299,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
#endif
}//end deflation
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
/** \svd_module
*
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 7d2d63722..f818ae840 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -19,7 +19,9 @@ macro(ei_add_test_internal testname testname_with_suffix)
endif()
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
- if(EIGEN_TEST_CUDA_CLANG)
+ if(EIGEN_TEST_HIP)
+ hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP")
+ elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
if(CUDA_64_BIT_DEVICE_CODE)
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
@@ -491,6 +493,11 @@ macro(ei_testing_print_summary)
else()
message(STATUS "CUDA: OFF")
endif()
+ if(EIGEN_TEST_HIP)
+ message(STATUS "HIP: ON (using hipcc)")
+ else()
+ message(STATUS "HIP: OFF")
+ endif()
endif() # vectorization / alignment options
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e1eef086e..4a5c1d36d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -407,6 +407,48 @@ endif(CUDA_FOUND)
endif(EIGEN_TEST_CUDA)
+# HIP unit tests
+option(EIGEN_TEST_HIP "Add HIP support." OFF)
+if (EIGEN_TEST_HIP)
+
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+ if (EXISTS ${HIP_PATH})
+
+ list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+ if (${HIP_PLATFORM} STREQUAL "hcc")
+
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
+ include_directories(${HIP_PATH}/include)
+
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+ ei_add_test(hip_basic)
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+ elseif (${HIP_PLATFORM} STREQUAL "nvcc")
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
+
+ endif(HIP_FOUND)
+
+ else ()
+
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+ endif()
+
+endif(EIGEN_TEST_HIP)
+
+
+
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
diff --git a/test/hip_basic.cu b/test/hip_basic.cu
new file mode 100644
index 000000000..2e1bf94a4
--- /dev/null
+++ b/test/hip_basic.cu
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// workaround issue between gcc >= 4.7 and cuda 5.5
+#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
+ #undef _GLIBCXX_ATOMIC_BUILTINS
+ #undef _GLIBCXX_USE_INT128
+#endif
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC hip_basic
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#include <hip/hip_runtime.h>
+
+#include "main.h"
+#include "hip_common.h"
+
+// Check that dense modules can be properly parsed by hipcc
+#include <Eigen/Dense>
+
+// struct Foo{
+// EIGEN_DEVICE_FUNC
+// void operator()(int i, const float* mats, float* vecs) const {
+// using namespace Eigen;
+// // Matrix3f M(data);
+// // Vector3f x(data+9);
+// // Map<Vector3f>(data+9) = M.inverse() * x;
+// Matrix3f M(mats+i/16);
+// Vector3f x(vecs+i*3);
+// // using std::min;
+// // using std::sqrt;
+// Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x();
+// //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
+// }
+// };
+
+template<typename T>
+struct coeff_wise {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ T x1(in+i);
+ T x2(in+i+1);
+ T x3(in+i+2);
+ Map<T> res(out+i*T::MaxSizeAtCompileTime);
+
+ res.array() += (in[0] * x1 + x2).array() * x3.array();
+ }
+};
+
+template<typename T>
+struct replicate {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ T x1(in+i);
+ int step = x1.size() * 4;
+ int stride = 3 * step;
+
+ typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
+ MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
+ MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
+ MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
+ }
+};
+
+template<typename T>
+struct redux {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ int N = 10;
+ T x1(in+i);
+ out[i*N+0] = x1.minCoeff();
+ out[i*N+1] = x1.maxCoeff();
+ out[i*N+2] = x1.sum();
+ out[i*N+3] = x1.prod();
+ out[i*N+4] = x1.matrix().squaredNorm();
+ out[i*N+5] = x1.matrix().norm();
+ out[i*N+6] = x1.colwise().sum().maxCoeff();
+ out[i*N+7] = x1.rowwise().maxCoeff().sum();
+ out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
+ }
+};
+
+template<typename T1, typename T2>
+struct prod_test {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+ {
+ using namespace Eigen;
+ typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
+ T1 x1(in+i);
+ T2 x2(in+i+1);
+ Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
+ res += in[i] * x1 * x2;
+ }
+};
+
+template<typename T1, typename T2>
+struct diagonal {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
+ {
+ using namespace Eigen;
+ T1 x1(in+i);
+ Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
+ res += x1.diagonal();
+ }
+};
+
+template<typename T>
+struct eigenvalues {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
+ T M(in+i);
+ Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
+ T A = M*M.adjoint();
+ SelfAdjointEigenSolver<T> eig;
+ eig.computeDirect(M);
+ res = eig.eigenvalues();
+ }
+};
+
+void test_hip_basic()
+{
+ ei_test_init_hip();
+
+ int nthreads = 100;
+ Eigen::VectorXf in, out;
+
+ #ifndef __HIP_DEVICE_COMPILE__
+ int data_size = nthreads * 512;
+ in.setRandom(data_size);
+ out.setRandom(data_size);
+ #endif
+
+ CALL_SUBTEST( run_and_compare_to_hip(coeff_wise<Vector3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_hip(coeff_wise<Array44f>(), nthreads, in, out) );
+
+ // FIXME compile fails when we uncomment the followig two tests
+ // CALL_SUBTEST( run_and_compare_to_hip(replicate<Array4f>(), nthreads, in, out) );
+ // CALL_SUBTEST( run_and_compare_to_hip(replicate<Array33f>(), nthreads, in, out) );
+
+ CALL_SUBTEST( run_and_compare_to_hip(redux<Array4f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_hip(redux<Matrix3f>(), nthreads, in, out) );
+
+ CALL_SUBTEST( run_and_compare_to_hip(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_hip(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
+
+ CALL_SUBTEST( run_and_compare_to_hip(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_hip(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
+
+ // FIXME : Runtime failure occurs when we uncomment the following two tests
+ // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues<Matrix3f>(), nthreads, in, out) );
+ // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues<Matrix2f>(), nthreads, in, out) );
+
+}
diff --git a/test/hip_common.h b/test/hip_common.h
new file mode 100644
index 000000000..251585c52
--- /dev/null
+++ b/test/hip_common.h
@@ -0,0 +1,103 @@
+
+#ifndef EIGEN_TEST_HIP_COMMON_H
+#define EIGEN_TEST_HIP_COMMON_H
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include <iostream>
+
+#ifndef __HIPCC__
+dim3 threadIdx, blockDim, blockIdx;
+#endif
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ for(int i=0; i<n; i++)
+ ker(i, in.data(), out.data());
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+__global__ __attribute__((used))
+void run_on_hip_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
+{
+ int i = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x;
+ if(i<n) {
+ ker(i, in, out);
+ }
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_hip(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ typename Input::Scalar* d_in;
+ typename Output::Scalar* d_out;
+ std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar);
+ std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
+
+ hipMalloc((void**)(&d_in), in_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_out, out.data(), out_bytes, hipMemcpyHostToDevice);
+
+ // Simple and non-optimal 1D mapping assuming n is not too large
+ // That's only for unit testing!
+ dim3 Blocks(128);
+ dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
+
+ hipDeviceSynchronize();
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_hip_meta_kernel<Kernel,
+ typename std::decay<decltype(*d_in)>::type,
+ typename std::decay<decltype(*d_out)>::type>),
+ dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
+ hipDeviceSynchronize();
+
+ // check inputs have not been modified
+ hipMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, hipMemcpyDeviceToHost);
+ hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost);
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_and_compare_to_hip(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ Input in_ref, in_hip;
+ Output out_ref, out_hip;
+ #ifndef __HIP_DEVICE_COMPILE__
+ in_ref = in_hip = in;
+ out_ref = out_hip = out;
+ #endif
+ run_on_cpu (ker, n, in_ref, out_ref);
+ run_on_hip(ker, n, in_hip, out_hip);
+ #ifndef __HIP_DEVICE_COMPILE__
+ VERIFY_IS_APPROX(in_ref, in_hip);
+ VERIFY_IS_APPROX(out_ref, out_hip);
+ #endif
+}
+
+
+void ei_test_init_hip()
+{
+ int device = 0;
+ hipDeviceProp_t deviceProp;
+ hipGetDeviceProperties(&deviceProp, device);
+ std::cout << "HIP device info:\n";
+ std::cout << " name: " << deviceProp.name << "\n";
+ std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n";
+ std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n";
+ std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
+ std::cout << " warpSize: " << deviceProp.warpSize << "\n";
+ std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n";
+ std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n";
+ std::cout << " clockRate: " << deviceProp.clockRate << "\n";
+ std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n";
+ std::cout << " computeMode: " << deviceProp.computeMode << "\n";
+}
+
+#endif // EIGEN_TEST_HIP_COMMON_H
diff --git a/test/main.h b/test/main.h
index 0fcd6cb76..79717a532 100644
--- a/test/main.h
+++ b/test/main.h
@@ -67,11 +67,17 @@
// protected by parenthesis against macro expansion, the min()/max() macros
// are defined here and any not-parenthesized min/max call will cause a
// compiler error.
-#define min(A,B) please_protect_your_min_with_parentheses
-#define max(A,B) please_protect_your_max_with_parentheses
-#define isnan(X) please_protect_your_isnan_with_parentheses
-#define isinf(X) please_protect_your_isinf_with_parentheses
-#define isfinite(X) please_protect_your_isfinite_with_parentheses
+#if !defined(__HIPCC__)
+ // HIP headers include the <thread> header which contains not-parenthesized
+ // calls to "max", triggering the following check and causing the compile to fail
+ // so disabling the following checks for HIP
+ #define min(A,B) please_protect_your_min_with_parentheses
+ #define max(A,B) please_protect_your_max_with_parentheses
+ #define isnan(X) please_protect_your_isnan_with_parentheses
+ #define isinf(X) please_protect_your_isinf_with_parentheses
+ #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#endif
+
#ifdef M_PI
#undef M_PI
#endif
@@ -154,7 +160,7 @@ namespace Eigen
#define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "")
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
#define EIGEN_EXCEPTIONS
#endif
@@ -233,7 +239,7 @@ namespace Eigen
}
#endif //EIGEN_EXCEPTIONS
- #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS
+ #elif !defined(__CUDACC__) && !defined(__HIPCC__)// EIGEN_DEBUG_ASSERTS
// see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
#define eigen_assert(a) \
if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
@@ -290,7 +296,7 @@ namespace Eigen
std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
#endif
- #if !defined(__CUDACC__)
+ #if !defined(__CUDACC__) && !defined(__HIPCC__)
#define EIGEN_USE_CUSTOM_ASSERT
#endif
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index d243fe035..4b7c7d724 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t;
#endif
#ifdef EIGEN_USE_GPU
-#include <iostream>
-#include <cuda_runtime.h>
-#if __cplusplus >= 201103L
-#include <atomic>
-#include <unistd.h>
-#endif
+ #include <iostream>
+ #if defined(EIGEN_USE_HIP)
+ #include <hip/hip_runtime.h>
+ #else
+ #include <cuda_runtime.h>
+ #endif
+ #if __cplusplus >= 201103L
+ #include <atomic>
+ #include <unistd.h>
+ #endif
#endif
#include "src/Tensor/TensorMacros.h"
@@ -95,7 +99,11 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h"
-#include "src/Tensor/TensorDeviceCuda.h"
+#if defined(EIGEN_USE_HIP)
+ #include "src/Tensor/TensorDeviceHip.h"
+#else
+ #include "src/Tensor/TensorDeviceCuda.h"
+#endif
#include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h"
@@ -112,16 +120,28 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
-#include "src/Tensor/TensorReductionCuda.h"
+#if defined(EIGEN_USE_HIP)
+ #include "src/Tensor/TensorReductionHip.h"
+#else
+ #include "src/Tensor/TensorReductionCuda.h"
+#endif
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
-#include "src/Tensor/TensorContractionCuda.h"
+#if defined(EIGEN_USE_HIP)
+ #include "src/Tensor/TensorContractionHip.h"
+#else
+ #include "src/Tensor/TensorContractionCuda.h"
+#endif
#include "src/Tensor/TensorConversion.h"
-#include "src/Tensor/TensorConvolution.h"
+#if defined(EIGEN_USE_HIP)
+ #include "src/Tensor/TensorConvolutionHip.h"
+#else
+ #include "src/Tensor/TensorConvolution.h"
+#endif
#include "src/Tensor/TensorFFT.h"
#include "src/Tensor/TensorPatch.h"
#include "src/Tensor/TensorImagePatch.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index e72ddb4a9..979fcf4d9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -448,7 +448,10 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size;
const Index cols = m_k_size;
@@ -489,7 +492,10 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ void evalGemm(Scalar* buffer) const {
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
if (m_can_use_xsmm) {
evalGemmXSMM(buffer);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index d34f9caee..4853dd37b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -28,7 +28,10 @@ class TensorContractionBlocking {
typedef typename LhsMapper::Scalar LhsScalar;
typedef typename RhsMapper::Scalar RhsScalar;
- EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h
new file mode 100644
index 000000000..7561846a3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h
@@ -0,0 +1,1521 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+ const Index m_size, const Index n_size, const Index k_size) {
+
+ const Index m_block_idx = hipBlockIdx_x;
+ const Index n_block_idx = hipBlockIdx_y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ // declare and initialize 64 registers for output 8x8 block
+
+ // prefetch registers
+ Scalar lhs_pf0;
+ Scalar lhs_pf1;
+ Scalar lhs_pf2;
+ Scalar lhs_pf3;
+ Scalar lhs_pf4;
+ Scalar lhs_pf5;
+ Scalar lhs_pf6;
+ Scalar lhs_pf7;
+
+ Scalar rhs_pf0;
+ Scalar rhs_pf1;
+ Scalar rhs_pf2;
+ Scalar rhs_pf3;
+ Scalar rhs_pf4;
+ Scalar rhs_pf5;
+ Scalar rhs_pf6;
+ Scalar rhs_pf7;
+
+ // shared memory is formatted
+ // (contract idx in block, nocontract idx in block, block idx)
+ // where block idx is column major. This transposition limits the number of
+ // bank conflicts when reading the LHS. The core idea is that since the contracting
+ // index is shared by both sides, then the contracting index should be in hipThreadIdx_x.
+
+ // On the LHS, we pad each row inside of each block with an extra element. This makes
+ // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+ // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+ // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+ // conflicts on writes and also none on reads.
+
+ // storage indices
+ const Index lhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_x * 9 + hipThreadIdx_z;
+ const Index rhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_z * 8 + hipThreadIdx_x;
+
+ const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+ const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+ const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+ const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+ const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+ const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+ const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+ const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+ const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+ const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+ const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+ const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+ const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+ const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+ const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+ const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+ // in the loading code, the following variables are important:
+ // hipThreadIdx_x: the vertical position in an 8x8 block
+ // hipThreadIdx_y: the vertical index of the 8x8 block in the grid
+ // hipThreadIdx_z: the horizontal position in an 8x8 block
+ // k: the horizontal index of the 8x8 block in the grid
+ //
+ // The k parameter is implicit (it was the loop counter for a loop that went
+ // from 0 to <8, but now that loop is unrolled in the below code.
+
+ const Index load_idx_vert = hipThreadIdx_x + 8 * hipThreadIdx_y;
+ const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k) \
+ { \
+ lhs_pf0 = conv(0); \
+ lhs_pf1 = conv(0); \
+ lhs_pf2 = conv(0); \
+ lhs_pf3 = conv(0); \
+ lhs_pf4 = conv(0); \
+ lhs_pf5 = conv(0); \
+ lhs_pf6 = conv(0); \
+ lhs_pf7 = conv(0); \
+ \
+ rhs_pf0 = conv(0); \
+ rhs_pf1 = conv(0); \
+ rhs_pf2 = conv(0); \
+ rhs_pf3 = conv(0); \
+ rhs_pf4 = conv(0); \
+ rhs_pf5 = conv(0); \
+ rhs_pf6 = conv(0); \
+ rhs_pf7 = conv(0); \
+ \
+ if (!needs_edge_check || lhs_vert < m_size) { \
+ const Index lhs_horiz_0 = base_k + hipThreadIdx_z + 0 * 8; \
+ const Index lhs_horiz_1 = base_k + hipThreadIdx_z + 1 * 8; \
+ const Index lhs_horiz_2 = base_k + hipThreadIdx_z + 2 * 8; \
+ const Index lhs_horiz_3 = base_k + hipThreadIdx_z + 3 * 8; \
+ const Index lhs_horiz_4 = base_k + hipThreadIdx_z + 4 * 8; \
+ const Index lhs_horiz_5 = base_k + hipThreadIdx_z + 5 * 8; \
+ const Index lhs_horiz_6 = base_k + hipThreadIdx_z + 6 * 8; \
+ const Index lhs_horiz_7 = base_k + hipThreadIdx_z + 7 * 8; \
+ \
+ if (!needs_edge_check || lhs_horiz_7 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
+ } else if (lhs_horiz_6 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ } else if (lhs_horiz_5 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ } else if (lhs_horiz_4 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ } else if (lhs_horiz_3 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ } else if (lhs_horiz_2 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ } else if (lhs_horiz_1 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ } else if (lhs_horiz_0 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ } \
+ } \
+ \
+ const Index rhs_vert = base_k + load_idx_vert; \
+ if (!needs_edge_check || rhs_vert < k_size) { \
+ const Index rhs_horiz_0 = base_n + hipThreadIdx_z + 0 * 8; \
+ const Index rhs_horiz_1 = base_n + hipThreadIdx_z + 1 * 8; \
+ const Index rhs_horiz_2 = base_n + hipThreadIdx_z + 2 * 8; \
+ const Index rhs_horiz_3 = base_n + hipThreadIdx_z + 3 * 8; \
+ const Index rhs_horiz_4 = base_n + hipThreadIdx_z + 4 * 8; \
+ const Index rhs_horiz_5 = base_n + hipThreadIdx_z + 5 * 8; \
+ const Index rhs_horiz_6 = base_n + hipThreadIdx_z + 6 * 8; \
+ const Index rhs_horiz_7 = base_n + hipThreadIdx_z + 7 * 8; \
+ \
+ if (rhs_horiz_7 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
+ } else if (rhs_horiz_6 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ } else if (rhs_horiz_5 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ } else if (rhs_horiz_4 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ } else if (rhs_horiz_3 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ } else if (rhs_horiz_2 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ } else if (rhs_horiz_1 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ } else if (rhs_horiz_0 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ } \
+ } \
+ } \
+
+#define writeRegToShmem(_) \
+ lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
+ rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
+ \
+ lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
+ rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
+ \
+ lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
+ rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
+ \
+ lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
+ rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
+ \
+ lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
+ rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
+ \
+ lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
+ rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
+ \
+ lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
+ rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
+ \
+ lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
+ rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
+
+ // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i) \
+ Scalar res(i, 0) = conv(0); \
+ Scalar res(i, 1) = conv(0); \
+ Scalar res(i, 2) = conv(0); \
+ Scalar res(i, 3) = conv(0); \
+ Scalar res(i, 4) = conv(0); \
+ Scalar res(i, 5) = conv(0); \
+ Scalar res(i, 6) = conv(0); \
+ Scalar res(i, 7) = conv(0); \
+
+ internal::scalar_cast_op<int, Scalar> conv;
+ initResultRow(0);
+ initResultRow(1);
+ initResultRow(2);
+ initResultRow(3);
+ initResultRow(4);
+ initResultRow(5);
+ initResultRow(6);
+ initResultRow(7);
+#undef initResultRow
+
+ for (Index base_k = 0; base_k < k_size; base_k += 64) {
+ // wait for previous iteration to finish with shmem. Despite common sense,
+ // the code is a bit faster with this here then at bottom of loop
+ __syncthreads();
+
+ prefetchIntoRegisters(base_k);
+ writeRegToShmem();
+
+ #undef prefetchIntoRegisters
+ #undef writeRegToShmem
+
+ // wait for shared mem packing to be done before starting computation
+ __syncthreads();
+
+ // compute 8x8 matrix product by outer product. This involves packing one column
+ // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+ Scalar lcol(0);
+ Scalar lcol(1);
+ Scalar lcol(2);
+ Scalar lcol(3);
+ Scalar lcol(4);
+ Scalar lcol(5);
+ Scalar lcol(6);
+ Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+ Scalar rrow(0);
+ Scalar rrow(1);
+ Scalar rrow(2);
+ Scalar rrow(3);
+ Scalar rrow(4);
+ Scalar rrow(5);
+ Scalar rrow(6);
+ Scalar rrow(7);
+
+ // Now x corresponds to k, y to m, and z to n
+ const Scalar* lhs_block = &lhs_shmem[hipThreadIdx_x + 9 * hipThreadIdx_y];
+ const Scalar* rhs_block = &rhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j) \
+ lcol(0) = lhs_element(0, j); \
+ rrow(0) = rhs_element(i, 0); \
+ lcol(1) = lhs_element(1, j); \
+ rrow(1) = rhs_element(i, 1); \
+ lcol(2) = lhs_element(2, j); \
+ rrow(2) = rhs_element(i, 2); \
+ lcol(3) = lhs_element(3, j); \
+ rrow(3) = rhs_element(i, 3); \
+ lcol(4) = lhs_element(4, j); \
+ rrow(4) = rhs_element(i, 4); \
+ lcol(5) = lhs_element(5, j); \
+ rrow(5) = rhs_element(i, 5); \
+ lcol(6) = lhs_element(6, j); \
+ rrow(6) = rhs_element(i, 6); \
+ lcol(7) = lhs_element(7, j); \
+ rrow(7) = rhs_element(i, 7); \
+
+#define computeCol(j) \
+ res(0, j) += lcol(0) * rrow(j); \
+ res(1, j) += lcol(1) * rrow(j); \
+ res(2, j) += lcol(2) * rrow(j); \
+ res(3, j) += lcol(3) * rrow(j); \
+ res(4, j) += lcol(4) * rrow(j); \
+ res(5, j) += lcol(5) * rrow(j); \
+ res(6, j) += lcol(6) * rrow(j); \
+ res(7, j) += lcol(7) * rrow(j); \
+
+#define computePass(i) \
+ loadData(i, i); \
+ \
+ computeCol(0); \
+ computeCol(1); \
+ computeCol(2); \
+ computeCol(3); \
+ computeCol(4); \
+ computeCol(5); \
+ computeCol(6); \
+ computeCol(7); \
+
+ computePass(0);
+ computePass(1);
+ computePass(2);
+ computePass(3);
+ computePass(4);
+ computePass(5);
+ computePass(6);
+ computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+ } // end loop over k
+
+ // we've now iterated over all of the large (ie width 64) k blocks and
+ // accumulated results in registers. At this point thread (x, y, z) contains
+ // the sum across all big k blocks of the product of little k block of index (x, y)
+ // with block of index (y, z). To compute the final output, we need to reduce
+ // the 8 threads over y by summation.
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+
+#define reduceRow(i, mask) \
+ shuffleInc(i, 0, mask); \
+ shuffleInc(i, 1, mask); \
+ shuffleInc(i, 2, mask); \
+ shuffleInc(i, 3, mask); \
+ shuffleInc(i, 4, mask); \
+ shuffleInc(i, 5, mask); \
+ shuffleInc(i, 6, mask); \
+ shuffleInc(i, 7, mask); \
+
+#define reduceMatrix(mask) \
+ reduceRow(0, mask); \
+ reduceRow(1, mask); \
+ reduceRow(2, mask); \
+ reduceRow(3, mask); \
+ reduceRow(4, mask); \
+ reduceRow(5, mask); \
+ reduceRow(6, mask); \
+ reduceRow(7, mask); \
+
+ // actually perform the reduction, now each thread of index (_, y, z)
+ // contains the correct values in its registers that belong in the output
+ // block
+ reduceMatrix(1);
+ reduceMatrix(2);
+ reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+ // now we need to copy the 64 values into main memory. We can't split work
+ // among threads because all variables are in registers. There's 2 ways
+ // to do this:
+ // (1) have 1 thread do 64 writes from registers into global memory
+ // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+ // each do 8 writes into global memory. We can just overwrite the shared
+ // memory from the problem we just solved.
+ // (2) is slightly faster than (1) due to less branching and more ILP
+
+ // TODO: won't yield much gain, but could just use currently unused shared mem
+ // and then we won't have to sync
+ // wait for shared mem to be out of use
+ __syncthreads();
+
+#define writeResultShmem(i, j) \
+ lhs_shmem[i + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j] = res(i, j); \
+
+#define writeRow(i) \
+ writeResultShmem(i, 0); \
+ writeResultShmem(i, 1); \
+ writeResultShmem(i, 2); \
+ writeResultShmem(i, 3); \
+ writeResultShmem(i, 4); \
+ writeResultShmem(i, 5); \
+ writeResultShmem(i, 6); \
+ writeResultShmem(i, 7); \
+
+ if (hipThreadIdx_x == 0) {
+ writeRow(0);
+ writeRow(1);
+ writeRow(2);
+ writeRow(3);
+ writeRow(4);
+ writeRow(5);
+ writeRow(6);
+ writeRow(7);
+ }
+#undef writeResultShmem
+#undef writeRow
+
+ const int max_i_write = numext::mini((int)((m_size - base_m - hipThreadIdx_y + 7) / 8), 8);
+ const int max_j_write = numext::mini((int)((n_size - base_n - hipThreadIdx_z + 7) / 8), 8);
+
+ if (hipThreadIdx_x < max_i_write) {
+ if (max_j_write == 8) {
+ // TODO: can i trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 0];
+ Scalar val1 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 1];
+ Scalar val2 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 2];
+ Scalar val3 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 3];
+ Scalar val4 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 4];
+ Scalar val5 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 5];
+ Scalar val6 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 6];
+ Scalar val7 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 7];
+
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 0) = val0;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 1) = val1;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 2) = val2;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 3) = val3;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 4) = val4;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 5) = val5;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 6) = val6;
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 7) = val7;
+ } else {
+#pragma unroll 7
+ for (int j = 0; j < max_j_write; j++) {
+ Scalar val = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j];
+ output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * j) = val;
+ }
+ }
+ }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(512, 1)
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ Scalar lhs_shmem[72 * 64];
+ __shared__ Scalar rhs_shmem[72 * 64];
+
+ const Index m_block_idx = hipBlockIdx_x;
+ const Index n_block_idx = hipBlockIdx_y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size && base_n + 63 < n_size) {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ } else {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][16],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+
+ // prefetch registers
+ float4 lhs_pf0, rhs_pf0;
+
+ float4 results[4];
+ for (int i=0; i < 4; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ /*reg = lhs.template loadPacket<Unaligned>(row, col);*/ \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ reg.w =lhs(row + 3, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ /*reg =lhs.template loadPacket<Unaligned>(row, col);*/ \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ reg.w =lhs(row + 3, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
+
+
+ Index lhs_vert = base_m+hipThreadIdx_x*4;
+
+ for (Index k = 0; k < k_size; k += 16) {
+ //lhs_pf0 = internal::pset1<float4>(0);
+ //rhs_pf0 = internal::pset1<float4>(0);
+ lhs_pf0 = make_float4(0, 0, 0, 0);
+ rhs_pf0 = make_float4(0, 0, 0, 0);
+
+ Index lhs_horiz = hipThreadIdx_y+k;
+ prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+ Index rhs_vert = k+(hipThreadIdx_x%4)*4;
+ Index rhs_horiz0 = (hipThreadIdx_x>>2)+hipThreadIdx_y*4+base_n;
+
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ } else {
+ if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ float x1, x2 ;
+ // the following can be a bitwise operation..... some day.
+ if((hipThreadIdx_x%8) < 4) {
+ x1 = rhs_pf0.y;
+ x2 = rhs_pf0.w;
+ } else {
+ x1 = rhs_pf0.x;
+ x2 = rhs_pf0.z;
+ }
+ x1 = __shfl_xor(x1, 4);
+ x2 = __shfl_xor(x2, 4);
+ if((hipThreadIdx_x%8) < 4) {
+ rhs_pf0.y = x1;
+ rhs_pf0.w = x2;
+ } else {
+ rhs_pf0.x = x1;
+ rhs_pf0.z = x2;
+ }
+
+ // We have 64 features.
+ // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+ // ...
+ // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+ // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+ // ...
+ rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2][hipThreadIdx_x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+ rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2+32][hipThreadIdx_x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // ...
+ // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
+ // ...
+
+ lhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[hipThreadIdx_y+16][hipThreadIdx_x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+ results[0].x += fl1.x * fr1.x;\
+ results[0].y += fl1.y * fr1.x;\
+ results[0].z += fl2.x * fr1.x;\
+ results[0].w += fl2.y * fr1.x;\
+\
+ results[1].x += fl1.x * fr1.y;\
+ results[1].y += fl1.y * fr1.y;\
+ results[1].z += fl2.x * fr1.y;\
+ results[1].w += fl2.y * fr1.y;\
+\
+ results[2].x += fl1.x * fr2.x;\
+ results[2].y += fl1.y * fr2.x;\
+ results[2].z += fl2.x * fr2.x;\
+ results[2].w += fl2.y * fr2.x;\
+\
+ results[3].x += fl1.x * fr2.y;\
+ results[3].y += fl1.y * fr2.y;\
+ results[3].z += fl2.x * fr2.y;\
+ results[3].w += fl2.y * fr2.y;\
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 16; koff ++) {
+ // 32 x threads.
+ float2 fl1 = lhs_shmem2[koff][hipThreadIdx_x];
+ float2 fl2 = lhs_shmem2[koff + 16][hipThreadIdx_x];
+
+ int start_feature = hipThreadIdx_y * 4;
+ float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+ float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+ add_vals(fl1, fl2, fr1, fr2)
+ }
+ __syncthreads();
+ }
+
+#undef prefetch_lhs
+#undef add_vals
+
+ Index horiz_base = hipThreadIdx_y*4+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ // CHECK LHS
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK RHS
+ /*
+ int ncols_rem = fminf(n_size- horiz_base, 4);
+ for (int i = 0; i < ncols_rem; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }*/
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][32],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+
+ // prefetch registers
+ float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+ float4 rhs_pf0, rhs_pf1;
+
+ float4 results[8];
+ for (int i=0; i < 8; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+
+ Index lhs_vert = base_m+hipThreadIdx_x*4+(hipThreadIdx_y%4)*32;
+ for (Index k = 0; k < k_size; k += 32) {
+ /*lhs_pf0 = internal::pset1<float4>(0);
+ lhs_pf1 = internal::pset1<float4>(0);
+ lhs_pf2 = internal::pset1<float4>(0);
+ lhs_pf3 = internal::pset1<float4>(0);
+
+ rhs_pf0 = internal::pset1<float4>(0);
+ rhs_pf1 = internal::pset1<float4>(0);*/
+
+
+ lhs_pf0 = make_float4(0, 0, 0, 0);
+ lhs_pf1 = make_float4(0, 0, 0, 0);
+ lhs_pf2 = make_float4(0, 0, 0, 0);
+ lhs_pf3 = make_float4(0, 0, 0, 0);
+
+ rhs_pf0 = make_float4(0, 0, 0, 0);
+ rhs_pf1 = make_float4(0, 0, 0, 0);
+
+ if (!CHECK_LHS_BOUNDARY) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
+ //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ }
+ } else {
+ // just CHECK_LHS_BOUNDARY
+ if (lhs_vert + 3 < m_size) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
+ //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k));
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k));
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k));
+ }
+ } else if (lhs_vert < m_size) {
+ if ((hipThreadIdx_y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24));
+ } else if ((hipThreadIdx_y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16));
+ } else if ((hipThreadIdx_y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8));
+ } else if ((hipThreadIdx_y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k));
+ }
+ }
+ }
+ __syncthreads();
+ Index rhs_vert = k+hipThreadIdx_x*4;
+ Index rhs_horiz0 = hipThreadIdx_y*2+base_n;
+ Index rhs_horiz1 = hipThreadIdx_y*2+1+base_n;
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else {
+ if (rhs_horiz1 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (k+hipThreadIdx_x*4 + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (k+hipThreadIdx_x*4 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ __syncthreads();
+ // Loaded. Do computation
+ // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+ // ..
+ // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+ rhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+ // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+ // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+ // ..
+ rhs_shmem2[hipThreadIdx_y+32][hipThreadIdx_x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+ // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+ // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+ rhs_shmem2[hipThreadIdx_y+64][hipThreadIdx_x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+ // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+ // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+ rhs_shmem2[hipThreadIdx_y+96][hipThreadIdx_x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+ // LHS.
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // ...
+ // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+ // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+ results[0].x += a_feat1.x * f1.x;\
+ results[1].x += a_feat1.x * f1.y;\
+ results[2].x += a_feat1.x * f2.x;\
+ results[3].x += a_feat1.x * f2.y;\
+ results[4].x += a_feat1.x * f3.x;\
+ results[5].x += a_feat1.x * f3.y;\
+ results[6].x += a_feat1.x * f4.x;\
+ results[7].x += a_feat1.x * f4.y;\
+\
+ results[0].y += a_feat1.y * f1.x;\
+ results[1].y += a_feat1.y * f1.y;\
+ results[2].y += a_feat1.y * f2.x;\
+ results[3].y += a_feat1.y * f2.y;\
+ results[4].y += a_feat1.y * f3.x;\
+ results[5].y += a_feat1.y * f3.y;\
+ results[6].y += a_feat1.y * f4.x;\
+ results[7].y += a_feat1.y * f4.y;\
+\
+ results[0].z += a_feat2.x * f1.x;\
+ results[1].z += a_feat2.x * f1.y;\
+ results[2].z += a_feat2.x * f2.x;\
+ results[3].z += a_feat2.x * f2.y;\
+ results[4].z += a_feat2.x * f3.x;\
+ results[5].z += a_feat2.x * f3.y;\
+ results[6].z += a_feat2.x * f4.x;\
+ results[7].z += a_feat2.x * f4.y;\
+\
+ results[0].w += a_feat2.y * f1.x;\
+ results[1].w += a_feat2.y * f1.y;\
+ results[2].w += a_feat2.y * f2.x;\
+ results[3].w += a_feat2.y * f2.y;\
+ results[4].w += a_feat2.y * f3.x;\
+ results[5].w += a_feat2.y * f3.y;\
+ results[6].w += a_feat2.y * f4.x;\
+ results[7].w += a_feat2.y * f4.y;\
+
+ lhs_shmem2[hipThreadIdx_y/4][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[hipThreadIdx_y/4+8][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+ lhs_shmem2[hipThreadIdx_y/4+16][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+ lhs_shmem2[hipThreadIdx_y/4+24][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+ lhs_shmem2[hipThreadIdx_y/4 + 32][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+ lhs_shmem2[hipThreadIdx_y/4 + 40][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+ lhs_shmem2[hipThreadIdx_y/4 + 48][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+ lhs_shmem2[hipThreadIdx_y/4 + 56][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 32; koff ++) {
+ float2 a3 = lhs_shmem2[koff][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8];
+ float2 a4 = lhs_shmem2[koff + 32][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8];
+
+ // first feature is at (hipThreadIdx_y/4) * 8 last is at start + 8.
+ int start_feature = (hipThreadIdx_y / 4) * 8;
+
+ float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
+ float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+ float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+ float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+ add_vals(a3, a4, br1, br2, br3, br4)
+ }
+ __syncthreads();
+ } // end loop over k
+
+
+ __syncthreads();
+ Index horiz_base = (hipThreadIdx_y/4)*8+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK BOUNDARY_B
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256, 1)
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[64*32];
+ __shared__ float2 rhs_shmem[128*8];
+
+ typedef float2 LHS_MEM[64][32];
+ typedef float2 RHS_MEM[128][8];
+
+ typedef float2 LHS_MEM16x16[32][16];
+ typedef float2 RHS_MEM16x16[64][8];
+
+ const Index m_block_idx = hipBlockIdx_x;
+ const Index n_block_idx = hipBlockIdx_y;
+
+ const Index base_m = 128 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ bool check_rhs = (base_n + 63) >= n_size;
+ bool check_lhs128 = (base_m + 127) >= m_size;
+
+ if (!check_rhs) {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256, 1)
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[32][16];
+ __shared__ float2 rhs_shmem[64][8];
+
+ const Index m_block_idx = hipBlockIdx_x;
+ const Index n_block_idx = hipBlockIdx_y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size) {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
+
+ typedef GpuDevice Device;
+
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ // typedefs needed in evalTo
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ typedef typename LeftEvaluator::Dimensions LeftDimensions;
+ typedef typename RightEvaluator::Dimensions RightDimensions;
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
+
+ // We need to redefine this method to make hipcc happy
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+ this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+ evalTo(this->m_result);
+ return true;
+ }
+ }
+
+ void evalTo(Scalar* buffer) const {
+ if (this->m_lhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ }
+
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+ static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 8, 8);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
+ }
+ };
+
+ template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+ static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+ if (m < 768 || n < 768) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(16, 16, 1);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
+ } else {
+ const Index m_blocks = (m + 127) / 128;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 32, 1);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k);
+ }
+ }
+ };
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalTyped(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+ EIGEN_UNUSED_VARIABLE(k)
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, 4,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, 4,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+ setHipSharedMemConfig(hipSharedMemBankSizeEightByte);
+ LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and EIGEN_HIPCC
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h
new file mode 100644
index 000000000..ba9971050
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h
@@ -0,0 +1,1119 @@
+//#include "hip/hip_runtime.h"
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+namespace internal {
+
+template <typename Index, typename InputDims, int NumKernelDims, int Layout>
+class IndexMapper {
+ public:
+ IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+ const array<Index, NumKernelDims>& indices) {
+
+ array<Index, NumDims> dimensions = input_dims;
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = indices[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ dimensions[index] = result_dim;
+ }
+
+ array<Index, NumDims> inputStrides;
+ array<Index, NumDims> outputStrides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ inputStrides[0] = 1;
+ outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
+ outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
+ }
+ } else {
+ inputStrides[NumDims - 1] = 1;
+ outputStrides[NumDims - 1] = 1;
+ for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
+ inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+ outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
+ }
+ }
+
+ array<Index, NumDims> hipInputDimensions;
+ array<Index, NumDims> hipOutputDimensions;
+ array<Index, NumDims> tmp = dimensions;
+ array<Index, NumDims> ordering;
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = i + offset;
+ ordering[index] = indices[i];
+ tmp[indices[i]] = -1;
+ hipInputDimensions[index] = input_dims[indices[i]];
+ hipOutputDimensions[index] = dimensions[indices[i]];
+ }
+
+ int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? NumKernelDims
+ : 0;
+ for (int i = 0; i < NumDims; ++i) {
+ if (tmp[i] >= 0) {
+ ordering[written] = i;
+ hipInputDimensions[written] = input_dims[i];
+ hipOutputDimensions[written] = dimensions[i];
+ ++written;
+ }
+ }
+
+ for (int i = 0; i < NumDims; ++i) {
+ m_inputStrides[i] = inputStrides[ordering[i]];
+ m_outputStrides[i] = outputStrides[ordering[i]];
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i) {
+ if (i > NumKernelDims) {
+ m_hipInputStrides[i] =
+ m_hipInputStrides[i - 1] * hipInputDimensions[i - 1];
+ m_hipOutputStrides[i] =
+ m_hipOutputStrides[i - 1] * hipOutputDimensions[i - 1];
+ } else {
+ m_hipInputStrides[i] = 1;
+ m_hipOutputStrides[i] = 1;
+ }
+ }
+ } else {
+ for (int i = NumDims - 1; i >= 0; --i) {
+ if (i + 1 < offset) {
+ m_hipInputStrides[i] =
+ m_hipInputStrides[i + 1] * hipInputDimensions[i + 1];
+ m_hipOutputStrides[i] =
+ m_hipOutputStrides[i + 1] * hipOutputDimensions[i + 1];
+ } else {
+ m_hipInputStrides[i] = 1;
+ m_hipOutputStrides[i] = 1;
+ }
+ }
+ }
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputPlaneToTensorInputOffset(Index p) const {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int d = NumDims - 1; d > NumKernelDims; --d) {
+ const Index idx = p / m_hipInputStrides[d];
+ inputIndex += idx * m_inputStrides[d];
+ p -= idx * m_hipInputStrides[d];
+ }
+ inputIndex += p * m_inputStrides[NumKernelDims];
+ } else {
+ std::ptrdiff_t limit = 0;
+ if (NumKernelDims < NumDims) {
+ limit = NumDims - NumKernelDims - 1;
+ }
+ for (int d = 0; d < limit; ++d) {
+ const Index idx = p / m_hipInputStrides[d];
+ inputIndex += idx * m_inputStrides[d];
+ p -= idx * m_hipInputStrides[d];
+ }
+ inputIndex += p * m_inputStrides[limit];
+ }
+ return inputIndex;
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputPlaneToTensorOutputOffset(Index p) const {
+ Index outputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int d = NumDims - 1; d > NumKernelDims; --d) {
+ const Index idx = p / m_hipOutputStrides[d];
+ outputIndex += idx * m_outputStrides[d];
+ p -= idx * m_hipOutputStrides[d];
+ }
+ outputIndex += p * m_outputStrides[NumKernelDims];
+ } else {
+ std::ptrdiff_t limit = 0;
+ if (NumKernelDims < NumDims) {
+ limit = NumDims - NumKernelDims - 1;
+ }
+ for (int d = 0; d < limit; ++d) {
+ const Index idx = p / m_hipOutputStrides[d];
+ outputIndex += idx * m_outputStrides[d];
+ p -= idx * m_hipOutputStrides[d];
+ }
+ outputIndex += p * m_outputStrides[limit];
+ }
+ return outputIndex;
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
+ k * m_inputStrides[offset + 2];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
+ k * m_outputStrides[offset + 2];
+ }
+
+ private:
+ static const int NumDims = internal::array_size<InputDims>::value;
+ array<Index, NumDims> m_inputStrides;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_hipInputStrides;
+ array<Index, NumDims> m_hipOutputStrides;
+};
+
+
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename promote_storage_type<typename InputXprType::Scalar,
+ typename KernelXprType::Scalar>::ret Scalar;
+ typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
+ typename traits<KernelXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<InputXprType>::Index,
+ typename traits<KernelXprType>::Index>::type Index;
+ typedef typename InputXprType::Nested LhsNested;
+ typedef typename KernelXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = traits<InputXprType>::NumDimensions;
+ static const int Layout = traits<InputXprType>::Layout;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
+{
+ typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
+{
+ typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Indices, typename InputXprType, typename KernelXprType>
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
+ typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
+ : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Indices& indices() const { return m_indices; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename InputXprType::Nested>::type&
+ inputExpression() const { return m_input_xpr; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename KernelXprType::Nested>::type&
+ kernelExpression() const { return m_kernel_xpr; }
+
+ protected:
+ typename InputXprType::Nested m_input_xpr;
+ typename KernelXprType::Nested m_kernel_xpr;
+ const Indices m_indices;
+};
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+ Layout = TensorEvaluator<InputArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
+ }
+ } else {
+ m_inputStride[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
+ }
+ }
+
+ m_dimensions = m_inputImpl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ if (i > 0) {
+ m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
+ } else {
+ m_kernelStride[0] = 1;
+ }
+ m_indexStride[i] = m_inputStride[index];
+ }
+
+ m_outputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
+ }
+ } else {
+ for (int i = NumKernelDims - 1; i >= 0; --i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ if (i < NumKernelDims - 1) {
+ m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
+ } else {
+ m_kernelStride[NumKernelDims - 1] = 1;
+ }
+ m_indexStride[i] = m_inputStride[index];
+ }
+
+ m_outputStride[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ preloadKernel();
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+
+ void evalTo(typename XprType::Scalar* buffer) {
+ evalSubExprsIfNeeded(NULL);
+ for (int i = 0; i < dimensions().TotalSize(); ++i) {
+ buffer[i] += coeff(i);
+ }
+ cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ CoeffReturnType result = CoeffReturnType(0);
+ convolve(firstInput(index), 0, NumKernelDims-1, result);
+ return result;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
+ {
+ Index indices[2] = {index, index+PacketSize-1};
+ Index startInputs[2] = {0, 0};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_outputStride[i];
+ const Index idx1 = indices[1] / m_outputStride[i];
+ startInputs[0] += idx0 * m_inputStride[i];
+ startInputs[1] += idx1 * m_inputStride[i];
+ indices[0] -= idx0 * m_outputStride[i];
+ indices[1] -= idx1 * m_outputStride[i];
+ }
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / m_outputStride[i];
+ const Index idx1 = indices[1] / m_outputStride[i];
+ startInputs[0] += idx0 * m_inputStride[i];
+ startInputs[1] += idx1 * m_inputStride[i];
+ indices[0] -= idx0 * m_outputStride[i];
+ indices[1] -= idx1 * m_outputStride[i];
+ }
+ }
+ startInputs[0] += indices[0];
+ startInputs[1] += indices[1];
+
+ if (startInputs[1]-startInputs[0] == PacketSize-1) {
+ PacketReturnType result = internal::pset1<PacketReturnType>(0);
+ convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+ return result;
+ } else {
+ EIGEN_ALIGN_MAX Scalar data[PacketSize];
+ data[0] = Scalar(0);
+ convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+ for (int i = 1; i < PacketSize-1; ++i) {
+ data[i] = Scalar(0);
+ convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
+ }
+ data[PacketSize-1] = Scalar(0);
+ convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+ return internal::pload<PacketReturnType>(data);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ private:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ Index startInput = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStride[i];
+ startInput += idx * m_inputStride[i];
+ index -= idx * m_outputStride[i];
+ }
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStride[i];
+ startInput += idx * m_inputStride[i];
+ index -= idx * m_outputStride[i];
+ }
+ }
+ startInput += index;
+ return startInput;
+ }
+
+ EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolve(input, kernel, DimIndex-1, accum);
+ } else {
+ accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+ }
+ }
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolvePacket(input, kernel, DimIndex-1, accum);
+ } else {
+ accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ array<Index, NumDims> m_inputStride;
+ array<Index, NumDims> m_outputStride;
+
+ array<Index, NumKernelDims> m_indexStride;
+ array<Index, NumKernelDims> m_kernelStride;
+ TensorEvaluator<InputArgType, Device> m_inputImpl;
+ TensorEvaluator<KernelArgType, Device> m_kernelImpl;
+ Dimensions m_dimensions;
+
+ KernelArgType m_kernelArg;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+ const Device& m_device;
+};
+
+
+
+
+// Use an optimized implementation of the evaluation code for GPUs whenever possible.
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
+
+template <int StaticKernelSize>
+struct GetKernelSize {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
+ return StaticKernelSize;
+ }
+};
+template <>
+struct GetKernelSize<Dynamic> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
+ return kernelSize;
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims,
+ int StaticKernelSize>
+__global__ void EigenConvolutionKernel1D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const int numPlanes, const int numX,
+ const int maxX, const int kernelSize, float* buffer) {
+ HIP_DYNAMIC_SHARED( float, s)
+
+ const int first_x = hipBlockIdx_x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
+ const int num_x_output = last_x - first_x + 1;
+
+ const int first_plane = hipBlockIdx_y * hipBlockDim_y;
+ const int plane_stride = hipBlockDim_y * hipGridDim_y;
+
+ for (int p = first_plane + hipThreadIdx_y; p < numPlanes; p += plane_stride) {
+ // Load inputs to shared memory
+ const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = hipThreadIdx_y * num_x_input;
+ #pragma unroll
+ for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x);
+ s[i + plane_kernel_offset] = eval.coeff(tensor_index);
+ }
+
+ __syncthreads();
+
+ // Compute the convolution
+ const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
+
+ #pragma unroll
+ for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
+ const int kernel_offset = plane_kernel_offset + i;
+ float result = 0.0f;
+ #pragma unroll
+ for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
+ result += s[k + kernel_offset] * kernel[k];
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x);
+ buffer[tensor_index] = result;
+ }
+ __syncthreads();
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims,
+ int StaticKernelSizeX, int StaticKernelSizeY>
+__global__ void EigenConvolutionKernel2D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const int numPlanes, const int numX,
+ const int maxX, const int numY, const int maxY, const int kernelSizeX,
+ const int kernelSizeY, float* buffer) {
+ HIP_DYNAMIC_SHARED( float, s)
+
+ const int first_x = hipBlockIdx_x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
+ const int num_x_output = last_x - first_x + 1;
+
+ const int first_y = hipBlockIdx_y * maxY;
+ const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+ const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
+ const int num_y_output = last_y - first_y + 1;
+
+ const int first_plane = hipBlockIdx_z * hipBlockDim_z;
+ const int plane_stride = hipBlockDim_z * hipGridDim_z;
+
+ for (int p = first_plane + hipThreadIdx_z; p < numPlanes; p += plane_stride) {
+
+ const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = hipThreadIdx_z * num_y_input;
+
+ // Load inputs to shared memory
+ #pragma unroll
+ for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) {
+ const int input_offset = num_x_input * (j + plane_kernel_offset);
+ #pragma unroll
+ for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y);
+ s[i + input_offset] = eval.coeff(tensor_index);
+ }
+ }
+
+ __syncthreads();
+
+ // Convolution
+ const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
+
+ #pragma unroll
+ for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) {
+ #pragma unroll
+ for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
+ float result = 0.0f;
+ #pragma unroll
+ for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
+ const int kernel_offset = kernelSizeX * l;
+ const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
+ #pragma unroll
+ for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
+ result += s[k + input_offset] * kernel[k + kernel_offset];
+ }
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+ buffer[tensor_index] = result;
+ }
+ }
+
+ __syncthreads();
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims>
+__global__ void EigenConvolutionKernel3D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const size_t numPlanes, const size_t numX,
+ const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
+ const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
+ const size_t kernelSizeZ, float* buffer) {
+ HIP_DYNAMIC_SHARED( float, s)
+
+ // Load inputs to shared memory
+ const int first_x = hipBlockIdx_x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + kernelSizeX;
+
+ const int first_y = hipBlockIdx_y * maxY;
+ const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+ const int num_y_input = last_y - first_y + kernelSizeY;
+
+ const int first_z = hipBlockIdx_z * maxZ;
+ const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+ const int num_z_input = last_z - first_z + kernelSizeZ;
+
+ for (int p = 0; p < numPlanes; ++p) {
+
+ const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = 0;
+
+ for (int k = hipThreadIdx_z; k < num_z_input; k += hipBlockDim_z) {
+ for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) {
+ for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+ s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
+ }
+ }
+ }
+
+ __syncthreads();
+
+ // Convolution
+ const int num_z_output = last_z - first_z + 1;
+ const int num_y_output = last_y - first_y + 1;
+ const int num_x_output = last_x - first_x + 1;
+ const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p);
+
+ for (int k = hipThreadIdx_z; k < num_z_output; k += hipBlockDim_z) {
+ for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) {
+ for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) {
+ float result = 0.0f;
+ for (int n = 0; n < kernelSizeZ; ++n) {
+ for (int m = 0; m < kernelSizeY; ++m) {
+ for (int l = 0; l < kernelSizeX; ++l) {
+ result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
+ }
+ }
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+ buffer[tensor_index] = result;
+ }
+ }
+ }
+ __syncthreads();
+ }
+};
+
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+ PacketAccess = false,
+ Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ m_dimensions = m_inputImpl.dimensions();
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {}
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+ typedef typename InputArgType::Scalar Scalar;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ preloadKernel();
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ executeEval(data);
+ return false;
+ } else {
+ m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+ executeEval(m_buf);
+ return true;
+ }
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_buf) {
+ m_device.deallocate(m_buf);
+ m_buf = NULL;
+ }
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+
+ EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ static unsigned int ceil(unsigned int num, unsigned int denom) {
+ const unsigned int rounded_toward_zero = num / denom;
+ if (num > rounded_toward_zero * denom) {
+ return rounded_toward_zero + 1;
+ }
+ return rounded_toward_zero;
+ }
+
+ void executeEval(Scalar* data) const {
+ typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
+
+ const int maxSharedMem = m_device.sharedMemPerBlock();
+ const int maxThreadsPerBlock = m_device.maxHipThreadsPerBlock();
+ const int maxBlocksPerProcessor = m_device.maxHipThreadsPerMultiProcessor() / maxThreadsPerBlock;
+ const int numMultiProcessors = m_device.getNumHipMultiProcessors();
+ const int hipWarpSize = 32;
+
+ switch (NumKernelDims) {
+ case 1: {
+ const int kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+ const int numX = dimensions()[m_indices[0]];
+ const int numP = dimensions().TotalSize() / numX;
+ int maxX;
+ dim3 block_size;
+
+ const int single_stride_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : m_inputImpl.dimensions().rank() - 1;
+ if (m_indices[0] == single_stride_dim) {
+ // Maximum the reuse
+ const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
+ maxX = numext::mini<int>(inner_dim, numX);
+ const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+ block_size.x = numext::mini(maxThreadsPerBlock, maxX);
+ block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
+ }
+ else {
+ // Read as much as possible alongside the inner most dimension, that is the plane
+ const int inner_dim = maxSharedMem / ((hipWarpSize + kernel_size) * sizeof(Scalar));
+ const int maxP = numext::mini<int>(inner_dim, numP);
+ maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+
+ block_size.x = numext::mini(hipWarpSize, maxX);
+ block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
+ }
+
+ const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ const int num_x_blocks = ceil(numX, maxX);
+ const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
+
+ dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
+
+
+ //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+ const array<Index, 1> indices(m_indices[0]);
+ const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
+ internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+ switch(kernel_size) {
+ case 4: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+ break;
+ }
+ case 7: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+ break;
+ }
+ default: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+ }
+ }
+ break;
+ }
+
+ case 2: {
+ const int idxX =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
+ const int idxY =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
+ const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+ const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+
+ const int numX = dimensions()[m_indices[idxX]];
+ const int numY = dimensions()[m_indices[idxY]];
+ const int numP = dimensions().TotalSize() / (numX*numY);
+
+ const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
+
+ // Snap maxX to warp size
+ int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
+ const int maxX = numext::mini<int>(inner_dim, numX);
+ const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+ const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+
+ dim3 block_size;
+ block_size.x = numext::mini(1024, maxX);
+ block_size.y = numext::mini<int>(1024/block_size.x, maxY);
+ block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
+
+ const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ const int num_x_blocks = ceil(numX, maxX);
+ const int num_y_blocks = ceil(numY, maxY);
+ const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
+
+ dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
+
+
+ //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+ const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
+ const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
+ m_kernelImpl.dimensions()[idxY]);
+ internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+ switch (kernel_size_x) {
+ case 4: {
+ switch (kernel_size_y) {
+ case 7: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+ break;
+ }
+ default: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+ case 7: {
+ switch (kernel_size_y) {
+ case 4: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+ break;
+ }
+ default: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+ default: {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+
+ case 3: {
+ const int idxX =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
+ const int idxY =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
+ const int idxZ =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
+
+ const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+ const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+ const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
+
+ const int numX = dimensions()[m_indices[idxX]];
+ const int numY = dimensions()[m_indices[idxY]];
+ const int numZ = dimensions()[m_indices[idxZ]];
+ const int numP = dimensions().TotalSize() / (numX*numY*numZ);
+
+ const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+ const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+ const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+
+ dim3 block_size;
+ block_size.x = numext::mini(32, maxX);
+ block_size.y = numext::mini(32, maxY);
+ block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
+ dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
+
+ const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
+ assert(shared_mem <= maxSharedMem);
+
+ //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+ const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
+ m_indices[idxZ]);
+ const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
+ m_kernelImpl.dimensions()[idxY],
+ m_kernelImpl.dimensions()[idxZ]);
+ internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
+ dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel,
+ numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+ break;
+ }
+
+ default: {
+ EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return m_buf[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+ // model.
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
+ private:
+ // No assignment (copies are needed by the kernels)
+ TensorEvaluator& operator = (const TensorEvaluator&);
+
+ TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
+ TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
+ KernelArgType m_kernelArg;
+ Indices m_indices;
+ Dimensions m_dimensions;
+ Scalar* m_buf;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+
+ const GpuDevice& m_device;
+};
+#endif
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 341889e88..e94e577fc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -35,9 +35,12 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#ifndef EIGEN_CUDA_ARCH
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on the host CPU
return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ return 64;
#else
// Running on a CUDA device
return 32;
@@ -45,7 +48,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on the host CPU
return l1CacheSize();
#else
@@ -55,7 +58,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running single threaded on the host CPU
return l3CacheSize();
#else
@@ -65,10 +68,14 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef EIGEN_CUDA_ARCH
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ // return 1 as major for HIP
+ return 1;
#else
// Running on a CUDA device
return EIGEN_CUDA_ARCH / 100;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h
new file mode 100644
index 000000000..c0e110987
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h
@@ -0,0 +1,352 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
+
+#if defined(EIGEN_HIPCC)
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#endif
+#include <unistd.h> //for sleep function
+
+namespace Eigen {
+
+static const int kHipScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP streams underneath.
+class StreamInterface {
+ public:
+ virtual ~StreamInterface() {}
+
+ virtual const hipStream_t& stream() const = 0;
+ virtual const hipDeviceProp_t& deviceProperties() const = 0;
+
+ // Allocate memory on the actual device where the computation will run
+ virtual void* allocate(size_t num_bytes) const = 0;
+ virtual void deallocate(void* buffer) const = 0;
+
+ // Return a scratchpad buffer of size 1k
+ virtual void* scratchpad() const = 0;
+
+ // Return a semaphore. The semaphore is initially initialized to 0, and
+ // each kernel using it is responsible for resetting to 0 upon completion
+ // to maintain the invariant that the semaphore is always equal to 0 upon
+ // each kernel start.
+ virtual unsigned int* semaphore() const = 0;
+};
+
+static hipDeviceProp_t* m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+ if (!m_devicePropInitialized) {
+ // Attempts to ensure proper behavior in the case of multiple threads
+ // calling this function simultaneously. This would be trivial to
+ // implement if we could use std::mutex, but unfortunately mutex don't
+ // compile with nvcc, so we resort to atomics and thread fences instead.
+ // Note that if the caller uses a compiler that doesn't support c++11 we
+ // can't ensure that the initialization is thread safe.
+#if 0 && __cplusplus >= 201103L
+ static std::atomic<bool> first(true);
+ if (first.exchange(false)) {
+#else
+ static bool first = true;
+ if (first) {
+ first = false;
+#endif
+ // We're the first thread to reach this point.
+ int num_devices;
+ hipError_t status = hipGetDeviceCount(&num_devices);
+ if (status != hipSuccess) {
+ std::cerr << "Failed to get the number of HIP devices: "
+ << hipGetErrorString(status)
+ << std::endl;
+ assert(status == hipSuccess);
+ }
+ m_deviceProperties = new hipDeviceProp_t[num_devices];
+ for (int i = 0; i < num_devices; ++i) {
+ status = hipGetDeviceProperties(&m_deviceProperties[i], i);
+ if (status != hipSuccess) {
+ std::cerr << "Failed to initialize HIP device #"
+ << i
+ << ": "
+ << hipGetErrorString(status)
+ << std::endl;
+ assert(status == hipSuccess);
+ }
+ }
+
+#if 0 && __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_release);
+#endif
+ m_devicePropInitialized = true;
+ } else {
+ // Wait for the other thread to inititialize the properties.
+ while (!m_devicePropInitialized) {
+#if 0 && __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+ sleep(1);
+ }
+ }
+ }
+}
+
+static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00;
+
+class HipStreamDevice : public StreamInterface {
+ public:
+ // Use the default stream on the current device
+ HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+ hipGetDevice(&device_);
+ initializeDeviceProp();
+ }
+ // Use the default stream on the specified device
+ HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+ initializeDeviceProp();
+ }
+ // Use the specified stream. Note that it's the
+ // caller responsibility to ensure that the stream can run on
+ // the specified device. If no device is specified the code
+ // assumes that the stream is associated to the current gpu device.
+ HipStreamDevice(const hipStream_t* stream, int device = -1)
+ : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+ if (device < 0) {
+ hipGetDevice(&device_);
+ } else {
+ int num_devices;
+ hipError_t err = hipGetDeviceCount(&num_devices);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ assert(device < num_devices);
+ device_ = device;
+ }
+ initializeDeviceProp();
+ }
+
+ virtual ~HipStreamDevice() {
+ if (scratch_) {
+ deallocate(scratch_);
+ }
+ }
+
+ const hipStream_t& stream() const { return *stream_; }
+ const hipDeviceProp_t& deviceProperties() const {
+ return m_deviceProperties[device_];
+ }
+ virtual void* allocate(size_t num_bytes) const {
+ hipError_t err = hipSetDevice(device_);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ void* result;
+ err = hipMalloc(&result, num_bytes);
+ assert(err == hipSuccess);
+ assert(result != NULL);
+ return result;
+ }
+ virtual void deallocate(void* buffer) const {
+ hipError_t err = hipSetDevice(device_);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ assert(buffer != NULL);
+ err = hipFree(buffer);
+ assert(err == hipSuccess);
+ }
+
+ virtual void* scratchpad() const {
+ if (scratch_ == NULL) {
+ scratch_ = allocate(kHipScratchSize + sizeof(unsigned int));
+ }
+ return scratch_;
+ }
+
+ virtual unsigned int* semaphore() const {
+ if (semaphore_ == NULL) {
+ char* scratch = static_cast<char*>(scratchpad()) + kHipScratchSize;
+ semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+ //hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+ hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int));
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ }
+ return semaphore_;
+ }
+
+ private:
+ const hipStream_t* stream_;
+ int device_;
+ mutable void* scratch_;
+ mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+ // The StreamInterface is not owned: the caller is
+ // responsible for its initialization and eventual destruction.
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+ eigen_assert(stream);
+ }
+ explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+ eigen_assert(stream);
+ }
+ // TODO(bsteiner): This is an internal API, we should not expose it.
+ EIGEN_STRONG_INLINE const hipStream_t& stream() const {
+ return stream_->stream();
+ }
+
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return stream_->allocate(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ stream_->deallocate(buffer);
+ }
+
+ EIGEN_STRONG_INLINE void* scratchpad() const {
+ return stream_->scratchpad();
+ }
+
+ EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+ return stream_->semaphore();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#if !defined(EIGEN_HIP_DEVICE_COMPILE)
+ hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice,
+ stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+ hipError_t err =
+ hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ }
+
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+ hipError_t err =
+ hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#if !defined(EIGEN_HIP_DEVICE_COMPILE)
+ //TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream());
+ hipError_t err = hipMemset(buffer, c, n);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == hipSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ // FIXME
+ return 32;
+ }
+
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ // FIXME
+ return 48*1024;
+ }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // We won't try to take advantage of the l2 cache for the time being, and
+ // there is no l3 cache on hip devices.
+ return firstLevelCacheSize();
+ }
+
+// FIXME - this will move into HIP
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#undef assert
+#define assert(COND)
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#if defined(EIGEN_HIPCC) && \
+ !defined(EIGEN_HIP_DEVICE_COMPILE)
+ hipError_t err = hipStreamSynchronize(stream_->stream());
+ if (err != hipSuccess) {
+ std::cerr << "Error detected in HIP stream: "
+ << hipGetErrorString(err)
+ << std::endl;
+ assert(err == hipSuccess);
+ }
+#else
+ assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const {
+ return stream_->deviceProperties().multiProcessorCount;
+ }
+ EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const {
+ return stream_->deviceProperties().maxThreadsPerBlock;
+ }
+ EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const {
+ return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+ }
+ EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+ return stream_->deviceProperties().sharedMemPerBlock;
+ }
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return stream_->deviceProperties().major;
+ }
+ EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+ return stream_->deviceProperties().minor;
+ }
+
+ EIGEN_STRONG_INLINE int maxBlocks() const {
+ return max_blocks_;
+ }
+
+ // This function checks if the HIP runtime recorded an error for the
+ // underlying stream device.
+ inline bool ok() const {
+#if defined(EIGEN_HIPCC)
+ hipError_t error = hipStreamQuery(stream_->stream());
+ return (error == hipSuccess) || (error == hipErrorNotReady);
+#else
+ return false;
+#endif
+ }
+
+ private:
+ const StreamInterface* stream_;
+ int max_blocks_;
+};
+
+#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \
+ assert(hipGetLastError() == hipSuccess);
+
+
+// FIXME: Should be device and kernel specific.
+#if defined(EIGEN_HIPCC)
+static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) {
+#if !defined(EIGEN_HIP_DEVICE_COMPILE)
+ hipError_t status = hipDeviceSetSharedMemConfig(config);
+ EIGEN_UNUSED_VARIABLE(status)
+ assert(status == hipSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 0ffe68ab3..24a57970a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
};
-#if defined(EIGEN_CUDACC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
@@ -250,6 +250,17 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
+#if defined(EIGEN_HIPCC)
+ const int block_size = device.maxHipThreadsPerBlock();
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / block_size;
+ const Index size = array_prod(evaluator.dimensions());
+ // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size);
+#else
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
@@ -260,11 +271,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size);
+#endif
}
evaluator.cleanup();
}
-#endif // EIGEN_CUDACC
+#endif // EIGEN_CUDACC || EIGEN_HIPCC
#endif // EIGEN_USE_GPU
// SYCL Executor policy
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c015ce196..b8f0bc798 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -109,7 +109,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 3209fecd3..835efbf72 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -350,7 +350,11 @@ struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
namespace internal {
-template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+template<typename FirstType, typename... OtherTypes>
+ #if defined(EIGEN_HIPCC)
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+ #endif
+ size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
size_t result = 1;
for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
result *= sizes[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index c9e61f359..8e1ba486d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
*/
// SFINAE requires variadic templates
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 5431eb740..de1075cc1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16)))
template <>
struct PacketType<half, GpuDevice> {
typedef half2 type;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index e59074506..2a979845b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -858,7 +858,10 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
return inputIndex;
}
-
+
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 230915db2..71536a4b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -16,7 +16,7 @@ namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
assert(threadIdx.z == 0);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index da0ffe728..d2fb3fd32 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -334,12 +334,12 @@ struct OuterReducer {
};
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
@@ -495,7 +495,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
+ EIGEN_STRONG_INLINE
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
@@ -694,9 +698,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
@@ -774,14 +778,22 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
// Indexed by reduced dimensions.
array<Index, NumReducedDims> m_reducedDims;
+#if defined(EIGEN_HIPCC)
+ public:
+#endif
+
// Evaluator for the input expression.
TensorEvaluator<ArgType, Device> m_impl;
+#if defined(EIGEN_HIPCC)
+ private:
+#endif
+
// Operation to apply for computing the reduction.
Op m_reducer;
// For full reductions
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h
new file mode 100644
index 000000000..5304a22c5
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h
@@ -0,0 +1,815 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+#endif
+
+#if defined(EIGEN_HIPCC)
+#define HIP_WARP_SIZE 64
+#endif
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple hip thread to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another hip thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+ if (sizeof(T) == 4)
+ {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else if (sizeof(T) == 8) {
+ unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+ unsigned long long newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned long long readback;
+ while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else {
+ assert(0 && "Wordsize not supported");
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+}
+#endif
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
+ defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+ atomicAdd(output, accum);
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = val;
+ }
+}
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernel(const Self input, Index num_coeffs,
+ typename Self::CoeffReturnType* output, unsigned int* semaphore, Reducer reducer) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
+ defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+ // Initialize the output value
+ const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + hipThreadIdx_x;
+ if (hipGridDim_x == 1) {
+ if (first_index == 0) {
+ *output = reducer.initialize();
+ }
+ }
+ else {
+ if (hipThreadIdx_x == 0) {
+ unsigned int block = atomicCAS(semaphore, 0u, 1u);
+ if (block == 0) {
+ // We're the first block to run, initialize the output value
+ atomicExchCustom(output, reducer.initialize());
+ __threadfence();
+ atomicExch(semaphore, 2u);
+ }
+ else {
+ // Wait for the first block to initialize the output value.
+ // Use atomicCAS here to ensure that the reads aren't cached
+ unsigned int val;
+ do {
+ val = atomicCAS(semaphore, 2u, 2u);
+ }
+ while (val < 2u);
+ }
+ }
+ }
+
+ __syncthreads();
+
+ eigen_assert(hipGridDim_x == 1 || *semaphore >= 2u);
+
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+ for (Index i = 0; i < max_iter; i+=BlockSize) {
+ const Index index = first_index + i;
+ eigen_assert(index < num_coeffs);
+ typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+ reducer.reduce(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
+ // XXX use std::is_floating_point to determine the type of accum
+ if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
+ reducer.reduce(__shfl_down(static_cast<float>(accum), offset, HIP_WARP_SIZE), &accum);
+ } else {
+ reducer.reduce(__shfl_down(static_cast<int>(accum), offset, HIP_WARP_SIZE), &accum);
+ }
+ }
+
+ if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
+ atomicReduce(output, accum, reducer);
+ }
+
+ if (hipGridDim_x > 1 && hipThreadIdx_x == 0) {
+ // Let the last block reset the semaphore
+ atomicInc(semaphore, hipGridDim_x + 1);
+ __threadfence_system();
+ }
+
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+ eigen_assert(hipBlockDim_x == 1);
+ eigen_assert(hipGridDim_x == 1);
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
+}
+
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ const Index num_packets = num_coeffs / 2;
+ for (Index i = thread_id; i < num_packets; i += num_threads) {
+ ((half2*)output)[i] = reducer.template initializePacket<half2>();
+ }
+
+ if (thread_id == 0 && num_coeffs % 2 != 0) {
+ output[num_coeffs-1] = reducer.initialize();
+ }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+ half* output, half2* scratch) {
+ eigen_assert(NumPerThread % 2 == 0);
+
+ const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + 2*hipThreadIdx_x;
+
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+ if (hipGridDim_x == 1 && first_index == 0) {
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
+ __syncthreads();
+ }
+
+ half2 accum = reducer.template initializePacket<half2>();
+ const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+ for (Index i = 0; i < max_iter; i += BlockSize) {
+ const Index index = first_index + 2*i;
+ eigen_assert(index + 1 < num_coeffs);
+ half2 val = input.m_impl.template packet<Unaligned>(index);
+ reducer.reducePacket(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in, wka_out;
+ wka_in.h = accum;
+ wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
+ reducer.reducePacket(wka_out.h, &accum);
+ }
+
+ if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
+ atomicReduce(scratch, accum, reducer);
+ }
+
+ __syncthreads();
+
+ if (hipGridDim_x == 1 && first_index == 0) {
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+ }
+}
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+ eigen_assert(hipThreadIdx_x == 1);
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+ static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+ assert(false && "Should only be called on doubles, floats and half floats");
+ }
+};
+
+namespace {
+ std::mutex __eigen_reduction_hip_mutex;
+}
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+ // guard FullReductionLauncher with a mutex so only 1 FullReductionKernel
+ // is dispatched at a time
+ std::lock_guard<std::mutex> lock(__eigen_reduction_hip_mutex);
+
+ typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType Scalar;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+ unsigned int* semaphore = NULL;
+ if (num_blocks > 1) {
+ semaphore = device.semaphore();
+
+ unsigned int semaphore_host = 0xFF;
+ hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
+ if (semaphore_host != 0) {
+ std::cerr << "[WARN][EIGEN][FullReductionLauncher] incorrect semaphore value: "
+ << semaphore_host << "\n";
+ // wait for all commands on the device to complete so semaphore value
+ // is reset to 0
+ hipDeviceSynchronize();
+
+ // read again
+ hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
+ if (semaphore_host != 0) {
+ std::cerr << "[ERROR][EIGEN][FullReductionLauncher] CRITICAL incorrect semaphore value: "
+ << semaphore_host << ", apply manual override to 0\n";
+
+ // force set semaphore value to be 0
+ semaphore_host = 0;
+ hipMemcpy(semaphore, &semaphore_host, sizeof(unsigned int), hipMemcpyHostToDevice);
+ }
+ }
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), self, num_coeffs, output, semaphore, reducer);
+ }
+};
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+ static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+ typedef typename Self::Index Index;
+
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ half2* scratch = static_cast<half2*>(device.scratchpad());
+
+ if (num_blocks > 1) {
+ // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
+ // won't be a race conditions between multiple thread blocks.
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+ dim3(1), dim3(1), 0, device.stream(), reducer, self, num_coeffs, scratch);
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs, output, scratch);
+
+ if (num_blocks > 1) {
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionCleanupKernelHalfFloat<Op>),
+ dim3(1), dim3(1), 0, device.stream(), reducer, output, scratch);
+ }
+ }
+};
+#endif
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
+ // so reduce the scope of the optimized version of the code to the simple cases
+ // of doubles, floats and half floats
+#if defined(EIGEN_HAS_HIP_FP16)
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+ template <typename OutputType>
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+ }
+};
+
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
+ defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+ typedef typename Self::CoeffReturnType Type;
+ eigen_assert(hipBlockDim_y == 1);
+ eigen_assert(hipBlockDim_z == 1);
+ eigen_assert(hipGridDim_y == 1);
+ eigen_assert(hipGridDim_z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread);
+ const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (hipGridDim_x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
+ const Index row = i / input_col_blocks;
+
+ if (row < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x;
+
+ Type reduced_val = reducer.initialize();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1);
+ if (last_col >= num_coeffs_to_reduce) {
+ for (Index col = col_begin + hipBlockDim_x * j; col < num_coeffs_to_reduce; col += hipBlockDim_x) {
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ reducer.reduce(val, &reduced_val);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + hipBlockDim_x * (j + k);
+ reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
+ // XXX use std::is_floating_point to determine the type of reduced_val
+ if (std::is_floating_point<Type>::value) {
+ reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
+ } else {
+ reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
+ }
+ }
+
+ if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
+ atomicReduce(&(output[row]), reduced_val, reducer);
+ }
+ }
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+#if defined(EIGEN_HAS_HIP_FP16)
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ half* output) {
+ eigen_assert(hipBlockDim_y == 1);
+ eigen_assert(hipBlockDim_z == 1);
+ eigen_assert(hipGridDim_y == 1);
+ eigen_assert(hipGridDim_z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+ eigen_assert(unroll_times % 2 == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread * 2);
+ const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (hipGridDim_x == 1) {
+ Index i = 2*thread_id;
+ for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
+ half* loc = output + i;
+ *((half2*)loc) = reducer.template initializePacket<half2>();
+ }
+ if (i < num_preserved_coeffs) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
+ const Index row = 2 * (i / input_col_blocks);
+
+ if (row + 1 < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = 2 * (col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x);
+
+ half2 reduced_val1 = reducer.template initializePacket<half2>();
+ half2 reduced_val2 = reducer.template initializePacket<half2>();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1) * 2;
+ if (last_col >= num_coeffs_to_reduce) {
+ Index col = col_begin + hipBlockDim_x * j;
+ for (; col + 1 < num_coeffs_to_reduce; col += hipBlockDim_x) {
+ const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val1, &reduced_val1);
+ const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ if (col < num_coeffs_to_reduce) {
+ // Peel;
+ const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const half2 val1 = __halves2half2(last1, reducer.initialize());
+ reducer.reducePacket(val1, &reduced_val1);
+ const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
+ const half2 val2 = __halves2half2(last2, reducer.initialize());
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + hipBlockDim_x * (j + k) * 2;
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in, wka_out;
+
+ wka_in.h = reduced_val1;
+ wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
+ reducer.reducePacket(wka_out.h, &reduced_val1);
+
+ wka_in.h = reduced_val2;
+ wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
+ reducer.reducePacket(wka_out.h, &reduced_val2);
+ }
+
+ half val1 = __low2half(reduced_val1);
+ reducer.reduce(__high2half(reduced_val1), &val1);
+ half val2 = __low2half(reduced_val2);
+ reducer.reduce(__high2half(reduced_val2), &val2);
+ half2 val = __halves2half2(val1, val2);
+
+ if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
+ half* loc = output + row;
+ atomicReduce((half2*)loc, val, reducer);
+ }
+ }
+ }
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+ static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+ return true;
+ }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be a race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<OutputType, Index>),
+ dim3(num_blocks), dim3(1024), 0, device.stream(),
+ reducer.initialize(), num_preserved_vals, output);
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernel<num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self,
+ num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+ static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ return true;
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ if (num_preserved_vals % 2 != 0) {
+ // Not supported yet, revert to the slower code path
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = /*256*/128;
+ const int num_per_thread = /*128*/64;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be a race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernelHalfFloat<Self, Op, Index>),
+ dim3(1), dim3(1), 0, device.stream(), reducer, self, num_preserved_vals, output);
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+#endif
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
+ // so reduce the scope of the optimized version of the code to the simple case
+ // of floats and half floats.
+#if defined(EIGEN_HAS_HIP_FP16)
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+ template <typename OutputType>
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return true;
+ }
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 128) {
+ return true;
+ }
+
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ }
+};
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+ const Index num_threads = hipBlockDim_x * hipGridDim_x;
+ const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (hipGridDim_x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ // Do the reduction.
+ const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+ for (Index i = thread_id; i < max_iter; i += num_threads) {
+ const Index input_col = i % num_preserved_coeffs;
+ const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+ typename Self::CoeffReturnType reduced_val = reducer.initialize();
+ const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+ for (Index j = input_row; j < max_row; j++) {
+ typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+ reducer.reduce(val, &reduced_val);
+ }
+ atomicReduce(&(output[input_col]), reduced_val, reducer);
+ }
+}
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
+ // so reduce the scope of the optimized version of the code to the simple case
+ // of floats.
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+ template <typename Device, typename OutputType>
+ static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+ return true;
+ }
+
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 32) {
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 16;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs in the reduction kernel itself when we don't have to worry
+ // about race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumHipMultiProcessors() *
+ device.maxHipThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<float, Index>),
+ dim3(num_blocks), dim3(1024), 0, device.stream(),
+ reducer.initialize(), num_preserved_vals, output);
+ }
+
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(OuterReductionKernel<num_per_thread, Self, Op, Index>),
+ dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+
+#endif
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 1f545ef1a..174a6a064 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -242,7 +242,7 @@ struct ScanLauncher {
}
};
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
@@ -278,10 +278,15 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64;
Index block_size = 64;
+#if defined(EIGEN_HIPCC)
+ hipLaunchKernelGGL(HIP_KERNEL_NAME(ScanKernel<Self, Reducer>), dim3(num_blocks),
+ dim3(block_size), 0, self.device().stream(), self, total_size, data);
+#else
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+#endif
}
};
-#endif // EIGEN_USE_GPU && EIGEN_CUDACC
+#endif // EIGEN_USE_GPU && (EIGEN_CUDACC || EIGEN_HIPCC)
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 49d315a66..bb584e3f9 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -268,6 +268,9 @@ template<
typename Reducer
> struct reduce<Reducer>
{
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
constexpr static inline int run() { return Reducer::Identity; }
};
@@ -276,6 +279,9 @@ template<
typename A
> struct reduce<Reducer, A>
{
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
constexpr static inline A run(A a) { return a; }
};
@@ -285,6 +291,9 @@ template<
typename... Ts
> struct reduce<Reducer, A, Ts...>
{
+ #if defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
}
@@ -324,6 +333,9 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
// does...
template<typename... Ts>
+#if defined(EIGEN_HIPCC)
+EIGEN_DEVICE_FUNC
+#endif
constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
{
return reduce<product_op, Ts...>::run(ts...);
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 96b3a8261..18b76350b 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen {
template <typename T, size_t n> class array {
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 6c7ac3f3b..5ab67dc1f 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -190,7 +190,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
int dummy;
return ::lgammaf_r(x, &dummy);
#else
@@ -203,7 +203,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
int dummy;
return ::lgamma_r(x, &dummy);
#else
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 42b790d55..c3dba5f2a 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -297,3 +297,55 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+ if (EXISTS ${HIP_PATH})
+
+ list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+ if (${HIP_PLATFORM} STREQUAL "hcc")
+
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
+ include_directories(${HIP_PATH}/include)
+
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+
+ # ei_add_test(cxx11_tensor_complex_hip)
+ # ei_add_test(cxx11_tensor_complex_cwise_ops_hip)
+ ei_add_test(cxx11_tensor_reduction_hip)
+ ei_add_test(cxx11_tensor_argmax_hip)
+ ei_add_test(cxx11_tensor_cast_float16_hip)
+ ei_add_test(cxx11_tensor_scan_hip)
+ ei_add_test(cxx11_tensor_device_hip)
+ ei_add_test(cxx11_tensor_hip)
+ ei_add_test(cxx11_tensor_contract_hip)
+ ei_add_test(cxx11_tensor_of_float16_hip)
+ ei_add_test(cxx11_tensor_random_hip)
+
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+ elseif (${HIP_PLATFORM} STREQUAL "nvcc")
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
+
+ endif(HIP_FOUND)
+
+ else ()
+
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+ endif()
+
+endif(EIGEN_TEST_HIP)
+
diff --git a/unsupported/test/cxx11_tensor_argmax_hip.cu b/unsupported/test/cxx11_tensor_argmax_hip.cu
new file mode 100644
index 000000000..57d6ca000
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_hip.cu
@@ -0,0 +1,251 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_hip
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int Layout>
+void test_hip_simple_argmax()
+{
+ Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
+ Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
+ Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
+ in.setRandom();
+ in *= in.constant(100.0);
+ in(0, 0, 0) = -1000.0;
+ in(71, 52, 96) = 1000.0;
+
+ std::size_t in_bytes = in.size() * sizeof(double);
+ std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+ double* d_in;
+ DenseIndex* d_out_max;
+ DenseIndex* d_out_min;
+ hipMalloc((void**)(&d_in), in_bytes);
+ hipMalloc((void**)(&d_out_max), out_bytes);
+ hipMalloc((void**)(&d_out_min), out_bytes);
+
+ hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
+
+ gpu_out_max.device(gpu_device) = gpu_in.argmax();
+ gpu_out_min.device(gpu_device) = gpu_in.argmin();
+
+ assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
+ VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+
+ hipFree(d_in);
+ hipFree(d_out_max);
+ hipFree(d_out_min);
+}
+
+template <int DataLayout>
+void test_hip_argmax_dim()
+{
+ Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+ std::vector<int> dims;
+ dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+ for (int dim = 0; dim < 4; ++dim) {
+ tensor.setRandom();
+ tensor = (tensor + tensor.constant(0.5)).log();
+
+ array<DenseIndex, 3> out_shape;
+ for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+ Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+ array<DenseIndex, 4> ix;
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ if (ix[dim] != 0) continue;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+ tensor(ix) = 10.0;
+ }
+ }
+ }
+ }
+
+ std::size_t in_bytes = tensor.size() * sizeof(float);
+ std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+ float* d_in;
+ DenseIndex* d_out;
+ hipMalloc((void**)(&d_in), in_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+ gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+ assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ VERIFY_IS_EQUAL(tensor_arg.size(),
+ size_t(2*3*5*7 / tensor.dimension(dim)));
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the first index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ if (ix[dim] != tensor.dimension(dim) - 1) continue;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+ tensor(ix) = 20.0;
+ }
+ }
+ }
+ }
+
+ hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
+
+ gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+ assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the last index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+ }
+}
+
+template <int DataLayout>
+void test_hip_argmin_dim()
+{
+ Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+ std::vector<int> dims;
+ dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+ for (int dim = 0; dim < 4; ++dim) {
+ tensor.setRandom();
+ tensor = (tensor + tensor.constant(0.5)).log();
+
+ array<DenseIndex, 3> out_shape;
+ for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
+
+ Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+ array<DenseIndex, 4> ix;
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ if (ix[dim] != 0) continue;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+ tensor(ix) = -10.0;
+ }
+ }
+ }
+ }
+
+ std::size_t in_bytes = tensor.size() * sizeof(float);
+ std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+ float* d_in;
+ DenseIndex* d_out;
+ hipMalloc((void**)(&d_in), in_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+ gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+ assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ VERIFY_IS_EQUAL(tensor_arg.size(),
+ 2*3*5*7 / tensor.dimension(dim));
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect min to be in the first index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+ if (ix[dim] != tensor.dimension(dim) - 1) continue;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+ tensor(ix) = -20.0;
+ }
+ }
+ }
+ }
+
+ hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
+
+ gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+ assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the last index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+ }
+}
+
+void test_cxx11_tensor_hip()
+{
+ CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
+ CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
+ CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
+ CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
+ CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
+ CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_cast_float16_hip.cu b/unsupported/test/cxx11_tensor_cast_float16_hip.cu
new file mode 100644
index 000000000..bf6a49df4
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_cast_float16_hip.cu
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_hip_conversion() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ Tensor<float, 1> floats(num_elem);
+ floats.setRandom();
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+ d_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+ d_conv, num_elem);
+
+ gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
+
+ gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+ gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+ Tensor<float, 1> initial(num_elem);
+ Tensor<float, 1> final(num_elem);
+ gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ VERIFY_IS_APPROX(initial(i), final(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_half);
+ gpu_device.deallocate(d_conv);
+}
+
+
+void test_fallback_conversion() {
+ int num_elem = 101;
+ Tensor<float, 1> floats(num_elem);
+ floats.setRandom();
+
+ Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
+ Eigen::Tensor<float, 1> conv = halfs.cast<float>();
+
+ for (int i = 0; i < num_elem; ++i) {
+ VERIFY_IS_APPROX(floats(i), conv(i));
+ }
+}
+
+
+void test_cxx11_tensor_cast_float16_hip()
+{
+ CALL_SUBTEST(test_hip_conversion());
+ CALL_SUBTEST(test_fallback_conversion());
+}
diff --git a/unsupported/test/cxx11_tensor_contract_hip.cu b/unsupported/test/cxx11_tensor_contract_hip.cu
new file mode 100644
index 000000000..652af0ab0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_hip.cu
@@ -0,0 +1,215 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_hip_contraction(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+ // with these dimensions, the output has 300 * 140 elements, which is
+ // more than 30 * 1024, which is the number of threads in blocks on
+ // a 15 SM GK110 GPU
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+ Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
+ Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_left), t_left_bytes);
+ hipMalloc((void**)(&d_t_right), t_right_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+ t_result = t_left.contract(t_right, dims);
+
+ hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+ for (DenseIndex i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
+ }
+
+ hipFree((void*)d_t_left);
+ hipFree((void*)d_t_right);
+ hipFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+ // with these dimensions, the output has 300 * 140 elements, which is
+ // more than 30 * 1024, which is the number of threads in blocks on
+ // a 15 SM GK110 GPU
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 0, DataLayout> t_result;
+ Tensor<float, 0, DataLayout> t_result_gpu;
+ Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_left), t_left_bytes);
+ hipMalloc((void**)(&d_t_right), t_right_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_left(d_t_left, m_size, k_size);
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_right(d_t_right, k_size, n_size);
+ Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+ gpu_t_result(d_t_result);
+
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+ t_result = t_left.contract(t_right, dims);
+
+ hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+ if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+ !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+ std::cout << "mismatch detected: " << t_result()
+ << " vs " << t_result_gpu() << std::endl;
+ assert(false);
+ }
+
+ hipFree((void*)d_t_left);
+ hipFree((void*)d_t_right);
+ hipFree((void*)d_t_result);
+}
+
+
+template<int DataLayout>
+void test_hip_contraction_m() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(k, 128, 128);
+ test_hip_contraction<RowMajor>(k, 128, 128);
+ }
+}
+
+template<int DataLayout>
+void test_hip_contraction_k() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(128, k, 128);
+ test_hip_contraction<RowMajor>(128, k, 128);
+ }
+}
+
+template<int DataLayout>
+void test_hip_contraction_n() {
+ for (int k = 32; k < 256; k++) {
+ test_hip_contraction<ColMajor>(128, 128, k);
+ test_hip_contraction<RowMajor>(128, 128, k);
+ }
+}
+
+
+template<int DataLayout>
+void test_hip_contraction_sizes() {
+ int m_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257 , 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int n_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257, 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int k_sizes[] = { 31, 39, 63, 64, 65,
+ 95, 96, 127, 129, 255,
+ 257, 511, 512, 513, 1023,
+ 1024, 1025};
+
+ for (int i = 0; i < 15; i++) {
+ for (int j = 0; j < 15; j++) {
+ for (int k = 0; k < 17; k++) {
+ test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+ }
+ }
+ }
+}
+
+void test_cxx11_tensor_hip()
+{
+ CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
+ CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
+
+ CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
+ CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
+
+ CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
+
+ CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
+
+ CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
+ CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
+
+ // Commenting out these tests due to long runtimes
+ // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
+ // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_device_hip.cu b/unsupported/test/cxx11_tensor_device_hip.cu
new file mode 100644
index 000000000..b98c481ff
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_device_hip.cu
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_device
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+// Context for evaluation on cpu
+struct CPUContext {
+ CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
+ kernel_1d_(0) = 3.14f;
+ kernel_1d_(1) = 2.7f;
+
+ kernel_2d_(0,0) = 3.14f;
+ kernel_2d_(1,0) = 2.7f;
+ kernel_2d_(0,1) = 0.2f;
+ kernel_2d_(1,1) = 7.0f;
+
+ kernel_3d_(0,0,0) = 3.14f;
+ kernel_3d_(0,1,0) = 2.7f;
+ kernel_3d_(0,0,1) = 0.2f;
+ kernel_3d_(0,1,1) = 7.0f;
+ kernel_3d_(1,0,0) = -1.0f;
+ kernel_3d_(1,1,0) = -0.3f;
+ kernel_3d_(1,0,1) = -0.7f;
+ kernel_3d_(1,1,1) = -0.5f;
+ }
+
+ const Eigen::DefaultDevice& device() const { return cpu_device_; }
+
+ const Eigen::Tensor<float, 3>& in1() const { return in1_; }
+ const Eigen::Tensor<float, 3>& in2() const { return in2_; }
+ Eigen::Tensor<float, 3>& out() { return out_; }
+ const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
+ const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
+ const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
+
+ private:
+ const Eigen::Tensor<float, 3>& in1_;
+ const Eigen::Tensor<float, 3>& in2_;
+ Eigen::Tensor<float, 3>& out_;
+
+ Eigen::Tensor<float, 1> kernel_1d_;
+ Eigen::Tensor<float, 2> kernel_2d_;
+ Eigen::Tensor<float, 3> kernel_3d_;
+
+ Eigen::DefaultDevice cpu_device_;
+};
+
+
+// Context for evaluation on GPU
+struct GPUContext {
+ GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
+ assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
+ float kernel_1d_val[] = {3.14f, 2.7f};
+ assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
+
+ assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
+ float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
+ assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
+
+ assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
+ float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
+ assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
+ }
+ ~GPUContext() {
+ assert(hipFree(kernel_1d_) == hipSuccess);
+ assert(hipFree(kernel_2d_) == hipSuccess);
+ assert(hipFree(kernel_3d_) == hipSuccess);
+ }
+
+ const Eigen::GpuDevice& device() const { return gpu_device_; }
+
+ const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
+ const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
+ Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
+ Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
+ Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
+
+ private:
+ const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
+ const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
+ Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
+
+ float* kernel_1d_;
+ float* kernel_2d_;
+ float* kernel_3d_;
+
+ Eigen::HipStreamDevice stream_;
+ Eigen::GpuDevice gpu_device_;
+};
+
+
+// The actual expression to evaluate
+template <typename Context>
+void test_contextual_eval(Context* context)
+{
+ context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_forced_contextual_eval(Context* context)
+{
+ context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_compound_assignment(Context* context)
+{
+ context->out().device(context->device()) = context->in1().constant(2.718f);
+ context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
+}
+
+
+template <typename Context>
+void test_contraction(Context* context)
+{
+ Eigen::array<std::pair<int, int>, 2> dims;
+ dims[0] = std::make_pair(1, 1);
+ dims[1] = std::make_pair(2, 2);
+
+ Eigen::array<int, 2> shape(40, 50*70);
+
+ Eigen::DSizes<int, 2> indices(0,0);
+ Eigen::DSizes<int, 2> sizes(40,40);
+
+ context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
+}
+
+
+template <typename Context>
+void test_1d_convolution(Context* context)
+{
+ Eigen::DSizes<int, 3> indices(0,0,0);
+ Eigen::DSizes<int, 3> sizes(40,49,70);
+
+ Eigen::array<int, 1> dims(1);
+ context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
+}
+
+template <typename Context>
+void test_2d_convolution(Context* context)
+{
+ Eigen::DSizes<int, 3> indices(0,0,0);
+ Eigen::DSizes<int, 3> sizes(40,49,69);
+
+ Eigen::array<int, 2> dims(1,2);
+ context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
+}
+
+template <typename Context>
+void test_3d_convolution(Context* context)
+{
+ Eigen::DSizes<int, 3> indices(0,0,0);
+ Eigen::DSizes<int, 3> sizes(39,49,69);
+
+ Eigen::array<int, 3> dims(0,1,2);
+ context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
+}
+
+
+void test_cpu() {
+ Eigen::Tensor<float, 3> in1(40,50,70);
+ Eigen::Tensor<float, 3> in2(40,50,70);
+ Eigen::Tensor<float, 3> out(40,50,70);
+
+ in1 = in1.random() + in1.constant(10.0f);
+ in2 = in2.random() + in2.constant(10.0f);
+
+ CPUContext context(in1, in2, out);
+ test_contextual_eval(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_forced_contextual_eval(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_compound_assignment(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_contraction(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 40; ++j) {
+ const float result = out(i,j,0);
+ float expected = 0;
+ for (int k = 0; k < 50; ++k) {
+ for (int l = 0; l < 70; ++l) {
+ expected += in1(i, k, l) * in2(j, k, l);
+ }
+ }
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+
+ test_1d_convolution(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+ }
+ }
+ }
+
+ test_2d_convolution(&context);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 69; ++k) {
+ const float result = out(i,j,k);
+ const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
+ (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+ if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
+ continue;
+ }
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+ }
+
+ test_3d_convolution(&context);
+ for (int i = 0; i < 39; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 69; ++k) {
+ const float result = out(i,j,k);
+ const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+ in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
+ (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+ in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+ if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
+ continue;
+ }
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+ }
+}
+
+void test_gpu() {
+ Eigen::Tensor<float, 3> in1(40,50,70);
+ Eigen::Tensor<float, 3> in2(40,50,70);
+ Eigen::Tensor<float, 3> out(40,50,70);
+ in1 = in1.random() + in1.constant(10.0f);
+ in2 = in2.random() + in2.constant(10.0f);
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t in2_bytes = in2.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ float* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_in2), in2_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
+
+ GPUContext context(gpu_in1, gpu_in2, gpu_out);
+ test_contextual_eval(&context);
+ assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_forced_contextual_eval(&context);
+ assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_compound_assignment(&context);
+ assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 50; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+ }
+ }
+ }
+
+ test_contraction(&context);
+ assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 40; ++j) {
+ const float result = out(i,j,0);
+ float expected = 0;
+ for (int k = 0; k < 50; ++k) {
+ for (int l = 0; l < 70; ++l) {
+ expected += in1(i, k, l) * in2(j, k, l);
+ }
+ }
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+
+ test_1d_convolution(&context);
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
+ assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+ }
+ }
+ }
+
+ test_2d_convolution(&context);
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
+ assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
+ for (int i = 0; i < 40; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 69; ++k) {
+ const float result = out(i,j,k);
+ const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+ in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+ }
+
+ /*
+ test_3d_convolution(&context);
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
+ assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
+ for (int i = 0; i < 39; ++i) {
+ for (int j = 0; j < 49; ++j) {
+ for (int k = 0; k < 69; ++k) {
+ const float result = out(i,j,k);
+ const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+ in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
+ in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+ in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+ VERIFY_IS_APPROX(expected, result);
+ }
+ }
+ }
+ */
+}
+
+
+void test_cxx11_tensor_device()
+{
+ CALL_SUBTEST(test_cpu());
+ CALL_SUBTEST(test_gpu());
+}
diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu
new file mode 100644
index 000000000..f1794aa87
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_hip.cu
@@ -0,0 +1,1296 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_hip
+#define EIGEN_USE_GPU
+
+#ifdef __NVCC__
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_hip_nullary() {
+ Tensor<float, 1, 0, int> in1(2);
+ Tensor<float, 1, 0, int> in2(2);
+ in1.setRandom();
+ in2.setRandom();
+
+ std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ hipMalloc((void**)(&d_in1), tensor_bytes);
+ hipMalloc((void**)(&d_in2), tensor_bytes);
+ hipMemcpy(d_in1, in1.data(), tensor_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in2, in2.data(), tensor_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
+ d_in1, 2);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
+ d_in2, 2);
+
+ gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+ gpu_in2.device(gpu_device) = gpu_in2.random();
+
+ Tensor<float, 1, 0, int> new1(2);
+ Tensor<float, 1, 0, int> new2(2);
+
+ assert(hipMemcpyAsync(new1.data(), d_in1, tensor_bytes, hipMemcpyDeviceToHost,
+ gpu_device.stream()) == hipSuccess);
+ assert(hipMemcpyAsync(new2.data(), d_in2, tensor_bytes, hipMemcpyDeviceToHost,
+ gpu_device.stream()) == hipSuccess);
+
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 2; ++i) {
+ VERIFY_IS_APPROX(new1(i), 3.14f);
+ VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+ }
+
+ hipFree(d_in1);
+ hipFree(d_in2);
+}
+
+void test_hip_elementwise_small() {
+ Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>{2});
+ Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>{2});
+ Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>{2});
+ in1.setRandom();
+ in2.setRandom();
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t in2_bytes = in2.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ float* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_in2), in2_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+ d_in1, Eigen::array<Eigen::DenseIndex, 1>{2});
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
+ d_in2, Eigen::array<Eigen::DenseIndex, 1>{2});
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
+ d_out, Eigen::array<Eigen::DenseIndex, 1>{2});
+
+ gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
+ gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 2; ++i) {
+ VERIFY_IS_APPROX(
+ out(Eigen::array<Eigen::DenseIndex, 1>{i}),
+ in1(Eigen::array<Eigen::DenseIndex, 1>{i}) + in2(Eigen::array<Eigen::DenseIndex, 1>{i}));
+ }
+
+ hipFree(d_in1);
+ hipFree(d_in2);
+ hipFree(d_out);
+}
+
+void test_hip_elementwise()
+{
+ Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ in1.setRandom();
+ in2.setRandom();
+ in3.setRandom();
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t in2_bytes = in2.size() * sizeof(float);
+ std::size_t in3_bytes = in3.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_in2;
+ float* d_in3;
+ float* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_in2), in2_bytes);
+ hipMalloc((void**)(&d_in3), in3_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in3, in3.data(), in3_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+ Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
+
+ gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 53; ++j) {
+ for (int k = 0; k < 97; ++k) {
+ VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}), in1(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) + in2(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) * in3(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}));
+ }
+ }
+ }
+
+ hipFree(d_in1);
+ hipFree(d_in2);
+ hipFree(d_in3);
+ hipFree(d_out);
+}
+
+void test_hip_props() {
+ Tensor<float, 1> in1(200);
+ Tensor<bool, 1> out(200);
+ in1.setRandom();
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(bool);
+
+ float* d_in1;
+ bool* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
+ d_in1, 200);
+ Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
+ d_out, 200);
+
+ gpu_out.device(gpu_device) = (gpu_in1.isnan)();
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
+ gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 200; ++i) {
+ VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
+ }
+
+ hipFree(d_in1);
+ hipFree(d_out);
+}
+
+void test_hip_reduction()
+{
+ Tensor<float, 4> in1(72,53,97,113);
+ Tensor<float, 2> out(72,97);
+ in1.setRandom();
+
+ std::size_t in1_bytes = in1.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_in1;
+ float* d_out;
+ hipMalloc((void**)(&d_in1), in1_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
+ Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+ array<Eigen::DenseIndex, 2> reduction_axis;
+ reduction_axis[0] = 1;
+ reduction_axis[1] = 3;
+
+ gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 97; ++j) {
+ float expected = 0;
+ for (int k = 0; k < 53; ++k) {
+ for (int l = 0; l < 113; ++l) {
+ expected =
+ std::max<float>(expected, in1(i, k, j, l));
+ }
+ }
+ VERIFY_IS_APPROX(out(i,j), expected);
+ }
+ }
+
+ hipFree(d_in1);
+ hipFree(d_out);
+}
+
+template<int DataLayout>
+void test_hip_contraction()
+{
+ // with these dimensions, the output has 300 * 140 elements, which is
+ // more than 30 * 1024, which is the number of threads in blocks on
+ // a 15 SM GK110 GPU
+ Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
+ Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>{3, 31, 7, 20, 1});
+ Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>{6, 50, 7, 20, 1});
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_left), t_left_bytes);
+ hipMalloc((void**)(&d_t_right), t_right_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
+ Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
+ Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
+
+ typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
+ MapXf m_left(t_left.data(), 300, 93);
+ MapXf m_right(t_right.data(), 93, 140);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
+
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+ Eigen::array<DimPair, 2> dims;
+ dims[0] = DimPair(2, 0);
+ dims[1] = DimPair(3, 1);
+
+ m_result = m_left * m_right;
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+
+ hipMemcpy(t_result.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+
+ for (DenseIndex i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+ std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
+ assert(false);
+ }
+ }
+
+ hipFree(d_t_left);
+ hipFree(d_t_right);
+ hipFree(d_t_result);
+}
+
+
+template<int DataLayout>
+void test_hip_convolution_1d()
+{
+ Tensor<float, 4, DataLayout> input(74,37,11,137);
+ Tensor<float, 1, DataLayout> kernel(4);
+ Tensor<float, 4, DataLayout> out(74,34,11,137);
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
+
+ Eigen::array<Eigen::DenseIndex, 1> dims{1};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 74; ++i) {
+ for (int j = 0; j < 34; ++j) {
+ for (int k = 0; k < 11; ++k) {
+ for (int l = 0; l < 137; ++l) {
+ const float result = out(i,j,k,l);
+ const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
+ input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+void test_hip_convolution_inner_dim_col_major_1d()
+{
+ Tensor<float, 4, ColMajor> input(74,9,11,7);
+ Tensor<float, 1, ColMajor> kernel(4);
+ Tensor<float, 4, ColMajor> out(71,9,11,7);
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
+ Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
+
+ Eigen::array<Eigen::DenseIndex, 1> dims{0};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 71; ++i) {
+ for (int j = 0; j < 9; ++j) {
+ for (int k = 0; k < 11; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ const float result = out(i,j,k,l);
+ const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
+ input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+void test_hip_convolution_inner_dim_row_major_1d()
+{
+ Tensor<float, 4, RowMajor> input(7,9,11,74);
+ Tensor<float, 1, RowMajor> kernel(4);
+ Tensor<float, 4, RowMajor> out(7,9,11,71);
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
+ Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
+ Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
+
+ Eigen::array<Eigen::DenseIndex, 1> dims{3};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 9; ++j) {
+ for (int k = 0; k < 11; ++k) {
+ for (int l = 0; l < 71; ++l) {
+ const float result = out(i,j,k,l);
+ const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
+ input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+template<int DataLayout>
+void test_hip_convolution_2d()
+{
+ Tensor<float, 4, DataLayout> input(74,37,11,137);
+ Tensor<float, 2, DataLayout> kernel(3,4);
+ Tensor<float, 4, DataLayout> out(74,35,8,137);
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
+ Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
+
+ Eigen::array<Eigen::DenseIndex, 2> dims{1,2};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 74; ++i) {
+ for (int j = 0; j < 35; ++j) {
+ for (int k = 0; k < 8; ++k) {
+ for (int l = 0; l < 137; ++l) {
+ const float result = out(i,j,k,l);
+ const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+ input(i,j+1,k+0,l) * kernel(1,0) +
+ input(i,j+2,k+0,l) * kernel(2,0) +
+ input(i,j+0,k+1,l) * kernel(0,1) +
+ input(i,j+1,k+1,l) * kernel(1,1) +
+ input(i,j+2,k+1,l) * kernel(2,1) +
+ input(i,j+0,k+2,l) * kernel(0,2) +
+ input(i,j+1,k+2,l) * kernel(1,2) +
+ input(i,j+2,k+2,l) * kernel(2,2) +
+ input(i,j+0,k+3,l) * kernel(0,3) +
+ input(i,j+1,k+3,l) * kernel(1,3) +
+ input(i,j+2,k+3,l) * kernel(2,3);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+template<int DataLayout>
+void test_hip_convolution_3d()
+{
+ Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>{74,37,11,137,17});
+ Tensor<float, 3, DataLayout> kernel(3,4,2);
+ Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>{74,35,8,136,17});
+ input = input.constant(10.0f) + input.random();
+ kernel = kernel.constant(7.0f) + kernel.random();
+
+ std::size_t input_bytes = input.size() * sizeof(float);
+ std::size_t kernel_bytes = kernel.size() * sizeof(float);
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_input;
+ float* d_kernel;
+ float* d_out;
+ hipMalloc((void**)(&d_input), input_bytes);
+ hipMalloc((void**)(&d_kernel), kernel_bytes);
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
+ Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
+ Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
+
+ Eigen::array<Eigen::DenseIndex, 3> dims{1,2,3};
+ gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 74; ++i) {
+ for (int j = 0; j < 35; ++j) {
+ for (int k = 0; k < 8; ++k) {
+ for (int l = 0; l < 136; ++l) {
+ for (int m = 0; m < 17; ++m) {
+ const float result = out(i,j,k,l,m);
+ const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
+ input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
+ input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
+ input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
+ input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
+ input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
+ input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
+ input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
+ input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
+ input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
+ input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
+ input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
+ input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
+ input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
+ input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
+ input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
+ input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
+ input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
+ input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
+ input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
+ input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
+ input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
+ input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
+ input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+ }
+
+ hipFree(d_input);
+ hipFree(d_kernel);
+ hipFree(d_out);
+}
+
+
+template <typename Scalar>
+void test_hip_lgamma(const Scalar stddev)
+{
+ Tensor<Scalar, 2> in(72,97);
+ in.setRandom();
+ in *= in.constant(stddev);
+ Tensor<Scalar, 2> out(72,97);
+ out.setZero();
+
+ std::size_t bytes = in.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+ gpu_out.device(gpu_device) = gpu_in.lgamma();
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 97; ++j) {
+ VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
+ }
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_digamma()
+{
+ Tensor<Scalar, 1> in(7);
+ Tensor<Scalar, 1> out(7);
+ Tensor<Scalar, 1> expected_out(7);
+ out.setZero();
+
+ in(0) = Scalar(1);
+ in(1) = Scalar(1.5);
+ in(2) = Scalar(4);
+ in(3) = Scalar(-10.5);
+ in(4) = Scalar(10000.5);
+ in(5) = Scalar(0);
+ in(6) = Scalar(-1);
+
+ expected_out(0) = Scalar(-0.5772156649015329);
+ expected_out(1) = Scalar(0.03648997397857645);
+ expected_out(2) = Scalar(1.2561176684318);
+ expected_out(3) = Scalar(2.398239129535781);
+ expected_out(4) = Scalar(9.210340372392849);
+ expected_out(5) = std::numeric_limits<Scalar>::infinity();
+ expected_out(6) = std::numeric_limits<Scalar>::infinity();
+
+ std::size_t bytes = in.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+ gpu_out.device(gpu_device) = gpu_in.digamma();
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 5; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+ for (int i = 5; i < 7; ++i) {
+ VERIFY_IS_EQUAL(out(i), expected_out(i));
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_zeta()
+{
+ Tensor<Scalar, 1> in_x(6);
+ Tensor<Scalar, 1> in_q(6);
+ Tensor<Scalar, 1> out(6);
+ Tensor<Scalar, 1> expected_out(6);
+ out.setZero();
+
+ in_x(0) = Scalar(1);
+ in_x(1) = Scalar(1.5);
+ in_x(2) = Scalar(4);
+ in_x(3) = Scalar(-10.5);
+ in_x(4) = Scalar(10000.5);
+ in_x(5) = Scalar(3);
+
+ in_q(0) = Scalar(1.2345);
+ in_q(1) = Scalar(2);
+ in_q(2) = Scalar(1.5);
+ in_q(3) = Scalar(3);
+ in_q(4) = Scalar(1.0001);
+ in_q(5) = Scalar(-2.5);
+
+ expected_out(0) = std::numeric_limits<Scalar>::infinity();
+ expected_out(1) = Scalar(1.61237534869);
+ expected_out(2) = Scalar(0.234848505667);
+ expected_out(3) = Scalar(1.03086757337e-5);
+ expected_out(4) = Scalar(0.367879440865);
+ expected_out(5) = Scalar(0.054102025820864097);
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in_x;
+ Scalar* d_in_q;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in_x), bytes);
+ hipMalloc((void**)(&d_in_q), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in_q, in_q.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+ gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ VERIFY_IS_EQUAL(out(0), expected_out(0));
+ VERIFY((std::isnan)(out(3)));
+
+ for (int i = 1; i < 6; ++i) {
+ if (i != 3) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+ }
+
+ hipFree(d_in_x);
+ hipFree(d_in_q);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_polygamma()
+{
+ Tensor<Scalar, 1> in_x(7);
+ Tensor<Scalar, 1> in_n(7);
+ Tensor<Scalar, 1> out(7);
+ Tensor<Scalar, 1> expected_out(7);
+ out.setZero();
+
+ in_n(0) = Scalar(1);
+ in_n(1) = Scalar(1);
+ in_n(2) = Scalar(1);
+ in_n(3) = Scalar(17);
+ in_n(4) = Scalar(31);
+ in_n(5) = Scalar(28);
+ in_n(6) = Scalar(8);
+
+ in_x(0) = Scalar(2);
+ in_x(1) = Scalar(3);
+ in_x(2) = Scalar(25.5);
+ in_x(3) = Scalar(4.7);
+ in_x(4) = Scalar(11.8);
+ in_x(5) = Scalar(17.7);
+ in_x(6) = Scalar(30.2);
+
+ expected_out(0) = Scalar(0.644934066848);
+ expected_out(1) = Scalar(0.394934066848);
+ expected_out(2) = Scalar(0.0399946696496);
+ expected_out(3) = Scalar(293.334565435);
+ expected_out(4) = Scalar(0.445487887616);
+ expected_out(5) = Scalar(-2.47810300902e-07);
+ expected_out(6) = Scalar(-8.29668781082e-09);
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in_x;
+ Scalar* d_in_n;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in_x), bytes);
+ hipMalloc((void**)(&d_in_n), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in_n, in_n.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+ gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 7; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ hipFree(d_in_x);
+ hipFree(d_in_n);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_igamma()
+{
+ Tensor<Scalar, 2> a(6, 6);
+ Tensor<Scalar, 2> x(6, 6);
+ Tensor<Scalar, 2> out(6, 6);
+ out.setZero();
+
+ Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+ Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ a(i, j) = a_s[i];
+ x(i, j) = x_s[j];
+ }
+ }
+
+ Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+ Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+ {0.0, 0.6321205588285578, 0.7768698398515702,
+ 0.9816843611112658, 9.999500016666262e-05, 1.0},
+ {0.0, 0.4275932955291202, 0.608374823728911,
+ 0.9539882943107686, 7.522076445089201e-07, 1.0},
+ {0.0, 0.01898815687615381, 0.06564245437845008,
+ 0.5665298796332909, 4.166333347221828e-18, 1.0},
+ {0.0, 0.9999780593618628, 0.9999899967080838,
+ 0.9999996219837988, 0.9991370418689945, 1.0},
+ {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+
+
+
+ std::size_t bytes = a.size() * sizeof(Scalar);
+
+ Scalar* d_a;
+ Scalar* d_x;
+ Scalar* d_out;
+ assert(hipMalloc((void**)(&d_a), bytes) == hipSuccess);
+ assert(hipMalloc((void**)(&d_x), bytes) == hipSuccess);
+ assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
+
+ hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+ gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ if ((std::isnan)(igamma_s[i][j])) {
+ VERIFY((std::isnan)(out(i, j)));
+ } else {
+ VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+ }
+ }
+ }
+
+ hipFree(d_a);
+ hipFree(d_x);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_igammac()
+{
+ Tensor<Scalar, 2> a(6, 6);
+ Tensor<Scalar, 2> x(6, 6);
+ Tensor<Scalar, 2> out(6, 6);
+ out.setZero();
+
+ Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+ Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ a(i, j) = a_s[i];
+ x(i, j) = x_s[j];
+ }
+ }
+
+ Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+ Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+ {1.0, 0.36787944117144233, 0.22313016014842982,
+ 0.018315638888734182, 0.9999000049998333, 0.0},
+ {1.0, 0.5724067044708798, 0.3916251762710878,
+ 0.04601170568923136, 0.9999992477923555, 0.0},
+ {1.0, 0.9810118431238462, 0.9343575456215499,
+ 0.4334701203667089, 1.0, 0.0},
+ {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+ 3.7801620118431334e-07, 0.0008629581310054535,
+ 0.0},
+ {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+
+ std::size_t bytes = a.size() * sizeof(Scalar);
+
+ Scalar* d_a;
+ Scalar* d_x;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_a), bytes);
+ hipMalloc((void**)(&d_x), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+ gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ if ((std::isnan)(igammac_s[i][j])) {
+ VERIFY((std::isnan)(out(i, j)));
+ } else {
+ VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+ }
+ }
+ }
+
+ hipFree(d_a);
+ hipFree(d_x);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_erf(const Scalar stddev)
+{
+ Tensor<Scalar, 2> in(72,97);
+ in.setRandom();
+ in *= in.constant(stddev);
+ Tensor<Scalar, 2> out(72,97);
+ out.setZero();
+
+ std::size_t bytes = in.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ assert(hipMalloc((void**)(&d_in), bytes) == hipSuccess);
+ assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
+
+ hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+ gpu_out.device(gpu_device) = gpu_in.erf();
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 97; ++j) {
+ VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
+ }
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_erfc(const Scalar stddev)
+{
+ Tensor<Scalar, 2> in(72,97);
+ in.setRandom();
+ in *= in.constant(stddev);
+ Tensor<Scalar, 2> out(72,97);
+ out.setZero();
+
+ std::size_t bytes = in.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+ gpu_out.device(gpu_device) = gpu_in.erfc();
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 0; i < 72; ++i) {
+ for (int j = 0; j < 97; ++j) {
+ VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
+ }
+ }
+
+ hipFree(d_in);
+ hipFree(d_out);
+}
+
+template <typename Scalar>
+void test_hip_betainc()
+{
+ Tensor<Scalar, 1> in_x(125);
+ Tensor<Scalar, 1> in_a(125);
+ Tensor<Scalar, 1> in_b(125);
+ Tensor<Scalar, 1> out(125);
+ Tensor<Scalar, 1> expected_out(125);
+ out.setZero();
+
+ Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+ Array<Scalar, 1, Dynamic> x(125);
+ Array<Scalar, 1, Dynamic> a(125);
+ Array<Scalar, 1, Dynamic> b(125);
+ Array<Scalar, 1, Dynamic> v(125);
+
+ a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
+
+ b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999;
+
+ x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+ 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+ 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+ 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+ -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+ 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+ 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
+
+ v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+ nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+ nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+ 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+ 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+ 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
+ nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+ 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+ 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+ 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+ 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+ 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
+ nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
+ nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
+ nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
+ nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
+ 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
+ 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
+ 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
+ 0.5000000000004947, 0.9999999999999999, nan;
+
+ for (int i = 0; i < 125; ++i) {
+ in_x(i) = x(i);
+ in_a(i) = a(i);
+ in_b(i) = b(i);
+ expected_out(i) = v(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in_x;
+ Scalar* d_in_a;
+ Scalar* d_in_b;
+ Scalar* d_out;
+ hipMalloc((void**)(&d_in_x), bytes);
+ hipMalloc((void**)(&d_in_a), bytes);
+ hipMalloc((void**)(&d_in_b), bytes);
+ hipMalloc((void**)(&d_out), bytes);
+
+ hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in_a, in_a.data(), bytes, hipMemcpyHostToDevice);
+ hipMemcpy(d_in_b, in_b.data(), bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
+
+ gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
+
+ assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ for (int i = 1; i < 125; ++i) {
+ if ((std::isnan)(expected_out(i))) {
+ VERIFY((std::isnan)(out(i)));
+ } else {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+ }
+
+ hipFree(d_in_x);
+ hipFree(d_in_a);
+ hipFree(d_in_b);
+ hipFree(d_out);
+}
+
+
+void test_cxx11_tensor_hip()
+{
+ CALL_SUBTEST(test_hip_nullary());
+ CALL_SUBTEST(test_hip_elementwise_small());
+ CALL_SUBTEST(test_hip_elementwise());
+ CALL_SUBTEST(test_hip_props());
+ CALL_SUBTEST(test_hip_reduction());
+ // FIXME : uncommenting following tests results in compile failure
+ // CALL_SUBTEST(test_hip_contraction<ColMajor>());
+ // CALL_SUBTEST(test_hip_contraction<RowMajor>());
+ CALL_SUBTEST(test_hip_convolution_1d<ColMajor>());
+ CALL_SUBTEST(test_hip_convolution_1d<RowMajor>());
+ CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
+ CALL_SUBTEST(test_hip_convolution_inner_dim_row_major_1d());
+ CALL_SUBTEST(test_hip_convolution_2d<ColMajor>());
+ CALL_SUBTEST(test_hip_convolution_2d<RowMajor>());
+ // The following two tests commented out due to long runtime
+ // CALL_SUBTEST(test_hip_convolution_3d<ColMajor>());
+ // CALL_SUBTEST(test_hip_convolution_3d<RowMajor>());
+
+#if __cplusplus > 199711L
+ // std::erf, std::erfc, and so on where only added in c++11. We use them
+ // as a golden reference to validate the results produced by Eigen. Therefore
+ // we can only run these tests if we use a c++11 compiler.
+
+ CALL_SUBTEST(test_hip_lgamma<float>(1.0f));
+ CALL_SUBTEST(test_hip_lgamma<float>(100.0f));
+ CALL_SUBTEST(test_hip_lgamma<float>(0.01f));
+ CALL_SUBTEST(test_hip_lgamma<float>(0.001f));
+
+ CALL_SUBTEST(test_hip_lgamma<double>(1.0));
+ CALL_SUBTEST(test_hip_lgamma<double>(100.0));
+ CALL_SUBTEST(test_hip_lgamma<double>(0.01));
+ CALL_SUBTEST(test_hip_lgamma<double>(0.001));
+
+ CALL_SUBTEST(test_hip_erf<float>(1.0f));
+ CALL_SUBTEST(test_hip_erf<float>(100.0f));
+ CALL_SUBTEST(test_hip_erf<float>(0.01f));
+ CALL_SUBTEST(test_hip_erf<float>(0.001f));
+
+ CALL_SUBTEST(test_hip_erfc<float>(1.0f));
+ // CALL_SUBTEST(test_hip_erfc<float>(100.0f)); // HIP erfc lacks precision for large inputs
+ CALL_SUBTEST(test_hip_erfc<float>(5.0f));
+ CALL_SUBTEST(test_hip_erfc<float>(0.01f));
+ CALL_SUBTEST(test_hip_erfc<float>(0.001f));
+
+ CALL_SUBTEST(test_hip_erf<double>(1.0));
+ CALL_SUBTEST(test_hip_erf<double>(100.0));
+ CALL_SUBTEST(test_hip_erf<double>(0.01));
+ CALL_SUBTEST(test_hip_erf<double>(0.001));
+
+ CALL_SUBTEST(test_hip_erfc<double>(1.0));
+ // CALL_SUBTEST(test_hip_erfc<double>(100.0)); // HIP erfc lacks precision for large inputs
+ CALL_SUBTEST(test_hip_erfc<double>(5.0));
+ CALL_SUBTEST(test_hip_erfc<double>(0.01));
+ CALL_SUBTEST(test_hip_erfc<double>(0.001));
+
+ // Following tests have functional failures on some seeds
+ // CALL_SUBTEST(test_hip_digamma<float>());
+ // CALL_SUBTEST(test_hip_digamma<double>());
+
+ // Following tests have functional failures on some seeds
+ // CALL_SUBTEST(test_hip_polygamma<float>());
+ // CALL_SUBTEST(test_hip_polygamma<double>());
+
+ // Following tests have functional failures on some seeds
+ // CALL_SUBTEST(test_hip_zeta<float>());
+ // CALL_SUBTEST(test_hip_zeta<double>());
+
+ CALL_SUBTEST(test_hip_igamma<float>());
+ CALL_SUBTEST(test_hip_igammac<float>());
+
+ CALL_SUBTEST(test_hip_igamma<double>());
+ CALL_SUBTEST(test_hip_igammac<double>());
+
+ CALL_SUBTEST(test_hip_betainc<float>());
+ CALL_SUBTEST(test_hip_betainc<double>());
+#endif
+}
diff --git a/unsupported/test/cxx11_tensor_of_float16_hip.cu b/unsupported/test/cxx11_tensor_of_float16_hip.cu
new file mode 100644
index 000000000..6c1a401bf
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_hip.cu
@@ -0,0 +1,498 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+using Eigen::Tensor;
+
+template<typename>
+void test_hip_numext() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+ bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+ Tensor<bool, 1> half_prec(num_elem);
+ Tensor<bool, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking numext " << i << std::endl;
+ VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+
+#ifdef EIGEN_HAS_HIP_FP16
+
+template<typename>
+void test_hip_conversion() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+ d_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+ d_conv, num_elem);
+
+ gpu_float.device(gpu_device) = gpu_float.random();
+ gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+ gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+ Tensor<float, 1> initial(num_elem);
+ Tensor<float, 1> final(num_elem);
+ gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+
+ for (int i = 0; i < num_elem; ++i) {
+ VERIFY_IS_APPROX(initial(i), final(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_half);
+ gpu_device.deallocate(d_conv);
+}
+
+template<typename>
+void test_hip_unary() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(gpu_device) = gpu_float.abs();
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+
+ Tensor<float, 1> half_prec(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking unary " << i << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_elementwise() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
+ d_float1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
+ d_float2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random();
+ gpu_float2.device(gpu_device) = gpu_float2.random();
+ gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+ gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+
+ Tensor<float, 1> half_prec(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+ VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
+ }
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_trancendental() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+ gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+ gpu_float3.device(gpu_device) = gpu_float3.random();
+ gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
+ gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
+ gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+ gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
+
+ gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+ gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+ gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+ gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+ gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
+
+ gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
+
+ Tensor<float, 1> input1(num_elem);
+ Tensor<Eigen::half, 1> half_prec1(num_elem);
+ Tensor<Eigen::half, 1> full_prec1(num_elem);
+ Tensor<float, 1> input2(num_elem);
+ Tensor<Eigen::half, 1> half_prec2(num_elem);
+ Tensor<Eigen::half, 1> full_prec2(num_elem);
+ Tensor<float, 1> input3(num_elem);
+ Tensor<Eigen::half, 1> half_prec3(num_elem);
+ Tensor<Eigen::half, 1> full_prec3(num_elem);
+ gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+ }
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+ if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
+ VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+ else
+ VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+ }
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+ }
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_float3);
+ gpu_device.deallocate(d_res1_half);
+ gpu_device.deallocate(d_res1_float);
+ gpu_device.deallocate(d_res2_half);
+ gpu_device.deallocate(d_res2_float);
+ gpu_device.deallocate(d_res3_float);
+ gpu_device.deallocate(d_res3_half);
+}
+
+template<typename>
+void test_hip_contractions() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int rows = 23;
+ int cols = 23;
+ int num_elem = rows*cols;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+ d_float1, rows, cols);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+ d_float2, rows, cols);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+ d_res_half, rows, cols);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+ d_res_float, rows, cols);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+ gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+
+ typedef Tensor<float, 2>::DimensionPair DimPair;
+ Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+ gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+
+ Tensor<Eigen::half, 2> half_prec(rows, cols);
+ Tensor<Eigen::half, 2> full_prec(rows, cols);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < rows; ++i) {
+ for (int j = 0; j < cols; ++j) {
+ std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+ if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+ VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+ }
+ }
+ }
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_reductions(int size1, int size2, int redux) {
+
+ std::cout << "Reducing " << size1 << " by " << size2
+ << " tensor along dim " << redux << std::endl;
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = size1*size2;
+ int result_size = (redux == 1 ? size1 : size2);
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+ d_float1, size1, size2);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+ d_float2, size1, size2);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, result_size);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, result_size);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
+ gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
+
+ Eigen::array<int, 1> redux_dim(redux);
+ gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
+
+ Tensor<Eigen::half, 1> half_prec(result_size);
+ Tensor<Eigen::half, 1> full_prec(result_size);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < result_size; ++i) {
+ std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_reductions() {
+ test_hip_reductions<void>(13, 13, 0);
+ test_hip_reductions<void>(13, 13, 1);
+
+ test_hip_reductions<void>(35, 36, 0);
+ test_hip_reductions<void>(35, 36, 1);
+
+ test_hip_reductions<void>(36, 35, 0);
+ test_hip_reductions<void>(36, 35, 1);
+}
+
+template<typename>
+void test_hip_full_reductions() {
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int size = 13;
+ int num_elem = size*size;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+ d_float1, size, size);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+ d_float2, size, size);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+ d_res_half);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+ d_res_float);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random();
+ gpu_float2.device(gpu_device) = gpu_float2.random();
+
+ gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+
+ Tensor<Eigen::half, 0> half_prec;
+ Tensor<Eigen::half, 0> full_prec;
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ VERIFY_IS_APPROX(full_prec(), half_prec());
+
+ gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ VERIFY_IS_APPROX(full_prec(), half_prec());
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+template<typename>
+void test_hip_forced_evals() {
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+ d_res_half1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+ d_res_half2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ Eigen::array<int, 1> no_bcast;
+ no_bcast[0] = 1;
+
+ gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(gpu_device) = gpu_float.abs();
+ gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+ gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+
+ Tensor<float, 1> half_prec1(num_elem);
+ Tensor<float, 1> half_prec2(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+ VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_res_half1);
+ gpu_device.deallocate(d_res_half2);
+ gpu_device.deallocate(d_res_float);
+}
+#endif
+
+
+void test_cxx11_tensor_of_float16_hip()
+{
+ CALL_SUBTEST(test_hip_numext<void>());
+
+#ifdef EIGEN_HAS_HIP_FP16
+ CALL_SUBTEST(test_hip_conversion<void>());
+ CALL_SUBTEST(test_hip_unary<void>());
+ CALL_SUBTEST(test_hip_elementwise<void>());
+ CALL_SUBTEST(test_hip_trancendental<void>());
+ CALL_SUBTEST(test_hip_contractions<void>());
+ CALL_SUBTEST(test_hip_reductions<void>());
+ CALL_SUBTEST(test_hip_full_reductions<void>());
+ CALL_SUBTEST(test_hip_forced_evals<void>());
+#else
+ std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
+#endif
+}
diff --git a/unsupported/test/cxx11_tensor_random_hip.cu b/unsupported/test/cxx11_tensor_random_hip.cu
new file mode 100644
index 000000000..7d7e72ca2
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_hip.cu
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+
+void test_hip_random_uniform()
+{
+ Tensor<float, 2> out(72,97);
+ out.setZero();
+
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_out;
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+ gpu_out.device(gpu_device) = gpu_out.random();
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+
+ // For now we just check thes code doesn't crash.
+ // TODO: come up with a valid test of randomness
+}
+
+
+void test_hip_random_normal()
+{
+ Tensor<float, 2> out(72,97);
+ out.setZero();
+
+ std::size_t out_bytes = out.size() * sizeof(float);
+
+ float* d_out;
+ hipMalloc((void**)(&d_out), out_bytes);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+ Eigen::internal::NormalRandomGenerator<float> gen(true);
+ gpu_out.device(gpu_device) = gpu_out.random(gen);
+
+ assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
+ assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
+}
+
+static void test_complex()
+{
+ Tensor<std::complex<float>, 1> vec(6);
+ vec.setRandom();
+
+ // Fixme: we should check that the generated numbers follow a uniform
+ // distribution instead.
+ for (int i = 1; i < 6; ++i) {
+ VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+ }
+}
+
+
+void test_cxx11_tensor_random_hip()
+{
+ CALL_SUBTEST(test_hip_random_uniform());
+ CALL_SUBTEST(test_hip_random_normal());
+ CALL_SUBTEST(test_complex());
+}
diff --git a/unsupported/test/cxx11_tensor_reduction_hip.cu b/unsupported/test/cxx11_tensor_reduction_hip.cu
new file mode 100644
index 000000000..c5aad05be
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction_hip.cu
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+template<typename Type, int DataLayout>
+static void test_full_reductions() {
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ const int num_rows = internal::random<int>(1024, 5*1024);
+ const int num_cols = internal::random<int>(1024, 5*1024);
+
+ Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
+ in.setRandom();
+
+ Tensor<Type, 0, DataLayout> full_redux;
+ full_redux = in.sum();
+
+ std::size_t in_bytes = in.size() * sizeof(Type);
+ std::size_t out_bytes = full_redux.size() * sizeof(Type);
+ Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+ Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
+ gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+ TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+ TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
+
+ out_gpu.device(gpu_device) = in_gpu.sum();
+
+ Tensor<Type, 0, DataLayout> full_redux_gpu;
+ gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+ gpu_device.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+ gpu_device.deallocate(gpu_in_ptr);
+ gpu_device.deallocate(gpu_out_ptr);
+}
+
+template<typename Type, int DataLayout>
+static void test_first_dim_reductions() {
+ int dim_x = 33;
+ int dim_y = 1;
+ int dim_z = 128;
+
+ Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+ in.setRandom();
+
+ Eigen::array<int, 1> red_axis;
+ red_axis[0] = 0;
+ Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+ // Create device
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice dev(&stream);
+
+ // Create data(T)
+ Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+ Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
+ Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+ Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
+
+ // Perform operation
+ dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+ gpu_out.device(dev) = gpu_in.sum(red_axis);
+ gpu_out.device(dev) += gpu_in.sum(red_axis);
+ Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
+ dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+ dev.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (int i = 0; i < gpu_out.size(); ++i) {
+ VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+ }
+
+ dev.deallocate(in_data);
+ dev.deallocate(out_data);
+}
+
+template<typename Type, int DataLayout>
+static void test_last_dim_reductions() {
+ int dim_x = 128;
+ int dim_y = 1;
+ int dim_z = 33;
+
+ Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+ in.setRandom();
+
+ Eigen::array<int, 1> red_axis;
+ red_axis[0] = 2;
+ Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+ // Create device
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice dev(&stream);
+
+ // Create data
+ Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+ Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
+ Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+ Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
+
+ // Perform operation
+ dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+ gpu_out.device(dev) = gpu_in.sum(red_axis);
+ gpu_out.device(dev) += gpu_in.sum(red_axis);
+ Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
+ dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+ dev.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (int i = 0; i < gpu_out.size(); ++i) {
+ VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+ }
+
+ dev.deallocate(in_data);
+ dev.deallocate(out_data);
+}
+
+
+void test_cxx11_tensor_reduction_hip() {
+ CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
+ CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
+ CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
+ CALL_SUBTEST((test_full_reductions<double, RowMajor>()));
+
+ CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
+ CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
+ CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+// CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()))
+
+ CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+// CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
+ CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
+ CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
+}
diff --git a/unsupported/test/cxx11_tensor_scan_hip.cu b/unsupported/test/cxx11_tensor_scan_hip.cu
new file mode 100644
index 000000000..f4c4f59b9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_hip.cu
@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_hip_cumsum(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+ Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
+ Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
+ Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
+
+ t_input.setRandom();
+
+ std::size_t t_input_bytes = t_input.size() * sizeof(float);
+ std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+ float* d_t_input;
+ float* d_t_result;
+
+ hipMalloc((void**)(&d_t_input), t_input_bytes);
+ hipMalloc((void**)(&d_t_result), t_result_bytes);
+
+ hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);
+
+ Eigen::HipStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+ gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+ gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
+
+ gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
+ t_result = t_input.cumsum(1);
+
+ hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
+ for (DenseIndex i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
+ }
+
+ hipFree((void*)d_t_input);
+ hipFree((void*)d_t_result);
+}
+
+
+void test_cxx11_tensor_scan_hip()
+{
+ CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
+ CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
+}