diff options
50 files changed, 9527 insertions, 94 deletions
diff --git a/Eigen/Core b/Eigen/Core index f6bc18a08..c72d5468a 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -22,6 +22,17 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif +#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) + // analogous to EIGEN_CUDACC, but for HIP + #define EIGEN_HIPCC __HIPCC__ +#endif + +// NVCC is not supported as the target platform for HIPCC +// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive +#if defined(__NVCC__) && defined(__HIPCC__) + #error "NVCC as the target platform for HIPCC is currently not supported." +#endif + // Starting with CUDA 9 the composite __CUDACC_VER__ is not available. #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) @@ -32,8 +43,8 @@ #endif // Handle NVCC/CUDA/SYCL -#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) - // Do not try asserts on CUDA and SYCL! +#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC) + // Do not try asserts on CUDA, HIP and SYCL! #ifndef EIGEN_NO_DEBUG #define EIGEN_NO_DEBUG #endif @@ -57,6 +68,26 @@ // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro // works properly on the device side #include <cuda_runtime.h> + + #elif defined(EIGEN_HIPCC) + // Do not try to vectorize on HIP + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif + + #define EIGEN_DEVICE_FUNC __host__ __device__ + // We need hip_runtime.h to ensure that that EIGEN_USING_STD_MATH macro + // works properly on the device side + #include <hip/hip_runtime.h> + + #if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP) + // analogous to EIGEN_CUDA_ARCH, but for HIP + #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ + // Note this check needs to come after we include hip_runtime.h since + // hip_runtime.h includes hip_common.h which in turn has the define + // for __HIP_DEVICE_COMPILE__ + #endif + #else #define EIGEN_DEVICE_FUNC #endif @@ -68,16 +99,16 @@ #define EIGEN_DONT_VECTORIZE #endif -// When compiling CUDA device code with NVCC, pull in math functions from the -// global namespace. In host mode, and when device doee with clang, use the -// std versions. -#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__) +// When compiling CUDA device code with NVCC, or HIP device code with HIPCC +// pull in math functions from the global namespace. In host mode, and when +// device doee with clang, use the std versions. +#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIPCC__)) #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; #else #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; #endif -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE) #define EIGEN_EXCEPTIONS #endif @@ -270,6 +301,17 @@ #include <cuda_fp16.h> #endif +#if defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_HAS_HIP_FP16 + #include <hip/hip_fp16.h> + #define HIP_PATCH_WITH_NEW_FP16 18215 + #if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16) + #define EIGEN_HAS_OLD_HIP_FP16 + // Old HIP implementation does not have a explicit typedef for "half2" + typedef __half2 half2; + #endif +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -390,7 +432,6 @@ using std::ptrdiff_t; #include "src/Core/util/IntegralConstant.h" #include "src/Core/util/SymbolicIndex.h" - #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" @@ -434,9 +475,15 @@ using std::ptrdiff_t; #endif // Half float support -#include "src/Core/arch/CUDA/Half.h" -#include "src/Core/arch/CUDA/PacketMathHalf.h" -#include "src/Core/arch/CUDA/TypeCasting.h" +#if defined EIGEN_USE_HIP + #include "src/Core/arch/HIP/hcc/Half.h" + #include "src/Core/arch/HIP/hcc/PacketMathHalf.h" + #include "src/Core/arch/HIP/hcc/TypeCasting.h" +#else + #include "src/Core/arch/CUDA/Half.h" + #include "src/Core/arch/CUDA/PacketMathHalf.h" + #include "src/Core/arch/CUDA/TypeCasting.h" +#endif #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 888a3f7ea..0903c3a6e 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -299,7 +299,11 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ -template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) +template<typename Scalar> + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + inline void prefetch(const Scalar* addr) { #ifdef EIGEN_CUDA_ARCH #if defined(__LP64__) @@ -528,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes. -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b) { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 05462c5e1..6beef5def 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true> template<typename Scalar> struct real_impl : real_default_impl<Scalar> {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template<typename T> struct real_impl<std::complex<T> > { @@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true> template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template<typename T> struct imag_impl<std::complex<T> > { @@ -260,7 +260,7 @@ struct conj_default_impl<Scalar,true> template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template<typename T> struct conj_impl<std::complex<T> > { @@ -435,7 +435,12 @@ struct round_retval struct arg_impl { static inline Scalar run(const Scalar& x) { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // HIP does not seem to have a native device side implementation for the math routine "arg" + using std::arg; + #else EIGEN_USING_STD_MATH(arg); + #endif return arg(x); } }; @@ -768,7 +773,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isfinite(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -783,7 +790,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isinf(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -798,7 +807,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isnan(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -864,7 +875,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) && !defined(__SYCL_DEVICE_ONLY__) template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -1078,7 +1089,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1136,7 +1147,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1157,7 +1168,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1215,7 +1226,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1243,7 +1254,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1273,7 +1284,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } @@ -1309,7 +1320,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1329,7 +1340,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1349,7 +1360,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1369,7 +1380,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1400,7 +1411,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1431,7 +1442,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1462,7 +1473,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); } EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1483,7 +1494,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1503,7 +1514,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1521,12 +1532,12 @@ T tanh(const T &x) { #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH +#elif (!defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1546,7 +1557,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y) EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index fb637191d..cc75fbce3 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -137,6 +137,9 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product<Lhs,Rhs,Options> SrcXprType; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &) { @@ -390,6 +393,9 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> typedef typename Product<Lhs,Rhs>::Scalar Scalar; template<typename Dst> + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); diff --git a/Eigen/src/Core/arch/HIP/hcc/Half.h b/Eigen/src/Core/arch/HIP/hcc/Half.h new file mode 100644 index 000000000..2ce8a412c --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/Half.h @@ -0,0 +1,705 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// type Eigen::half (inheriting from HIP's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. + + +#ifndef EIGEN_HALF_HIP_H +#define EIGEN_HALF_HIP_H + +#if __cplusplus > 199711L +#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() +#else +#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() +#endif + + +namespace Eigen { + +struct half; + +namespace half_impl { + +#if !defined(EIGEN_HAS_HIP_FP16) +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + unsigned short x; +}; +#elif defined(EIGEN_HAS_OLD_HIP_FP16) +// Make a __half_raw definition that is +// ++ compatible with that of Eigen and +// ++ add a implcit conversion to the native __half of the old HIP implementation. +// +// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation. +// +// In the old HIP implementation, +// ++ __half is a typedef of __fp16 +// ++ the "__h*" routines take "__half" arguments +// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make +// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + union { + unsigned short x; + __half data; + }; + operator __half(void) const { return data; } +}; +#endif + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); + +struct half_base : public __half_raw { + EIGEN_DEVICE_FUNC half_base() {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {} + #else + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + #endif +#endif +}; + +} // namespace half_impl + +// Class definition. +struct half : public half_impl::half_base { + #if !defined(EIGEN_HAS_HIP_FP16) || defined(EIGEN_HAS_OLD_HIP_FP16) + typedef half_impl::__half_raw __half_raw; + #endif + + EIGEN_DEVICE_FUNC half() {} + + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} +#endif + + explicit EIGEN_DEVICE_FUNC half(bool b) + : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + template<class T> + explicit EIGEN_DEVICE_FUNC half(const T& val) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {} + explicit EIGEN_DEVICE_FUNC half(float f) + : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} + + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { + // +0.0 and -0.0 become false, everything else becomes true. + return (x & 0x7fff) != 0; + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { + return static_cast<signed char>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { + return static_cast<unsigned char>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { + return static_cast<short>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { + return static_cast<unsigned short>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { + return static_cast<int>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { + return static_cast<unsigned int>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { + return static_cast<long>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { + return static_cast<unsigned long>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { + return static_cast<long long>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { + return static_cast<unsigned long long>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { + return half_impl::half_to_float(*this); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { + return static_cast<double>(half_impl::half_to_float(*this)); + } + + EIGEN_DEVICE_FUNC half& operator=(const half& other) { + x = other.x; + return *this; + } +}; + +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits<Eigen::half> { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } +}; + +// If std::numeric_limits<T> is specialized, should also specialize +// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and +// std::numeric_limits<const volatile T> +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {}; +template<> +struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {}; +template<> +struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {}; +} // end namespace std + +namespace Eigen { + +namespace half_impl { + +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + +// Intrinsics for native fp16 support. Note that on current hardware, +// these are no faster than fp32 arithmetic (you need to use the half2 +// versions to get the ALU speed increased), but you do save the +// conversion steps back and forth. + +EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { + return __hadd(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { + return __hneg(a); +} +EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { + a = a + b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { + a = a * b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { + a = a - b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { + a = a / b; + return a; +} +EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { + return __heq(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { + return __hne(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { + return __hle(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { + return __hgt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { + return __hge(a, b); +} + +#else // Emulate support for half floats + +// Definitions for CPUs mostly working through conversion to/from fp32. + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return numext::equal_strict(float(a),float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return numext::not_equal_strict(float(a), float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return float(a) >= float(b); +} + +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { + return half(static_cast<float>(a) / static_cast<float>(b)); +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; + h.x = x; + return h; +} + +union FP32 { + unsigned int u; + float f; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + __half tmp_ff = __float2half(ff); + #if defined(EIGEN_HAS_OLD_HIP_FP16) + __half_raw h; + h.data = tmp_ff; + return h; + #else + return *(__half_raw*)&tmp_ff; + #endif + +#elif defined(EIGEN_HAS_FP16_C) + __half_raw h; + h.x = _cvtss_sh(ff, 0); + return h; + +#else + FP32 f; f.f = ff; + + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + __half_raw o; + o.x = static_cast<unsigned short>(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast<unsigned short>(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast<unsigned short>(f.u >> 13); + } + } + + o.x |= static_cast<unsigned short>(sign >> 16); + return o; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __half2float(h); + +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); + +#else + const FP32 magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { + return (a.x & 0x7fff) == 0x7c00; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __hisnan(a); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { + half result; + result.x = a.x & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { + return half(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hlog(a)); +#else + return half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { + return half(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { + return half(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { + return half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { + return half(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { + return half(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { + return half(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { + return half(::tanhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hfloor(a)); +#else + return half(::floorf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hceil(a)); +#else + return half(::ceilf(float(a))); +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast<float>(a); + const float f2 = static_cast<float>(b); + return f2 < f1 ? b : a; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast<float>(a); + const float f2 = static_cast<float>(b); + return f1 < f2 ? b : a; +#endif +} + +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { + os << static_cast<float>(v); + return os; +} + +} // end namespace half_impl + +// import Eigen::half_impl::half into Eigen namespace +// using half_impl::half; + +namespace internal { + +template<> +struct random_default_impl<half, false, false> +{ + static inline half run(const half& x, const half& y) + { + return x + (y-x) * half(float(std::rand()) / float(RAND_MAX)); + } + static inline half run() + { + return run(half(-1.f), half(1.f)); + } +}; + +template<> struct is_arithmetic<half> { enum { value = true }; }; + +} // end namespace internal + +template<> struct NumTraits<Eigen::half> + : GenericNumTraits<Eigen::half> +{ + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { + return half_impl::raw_uint16_to_half(0x0800); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { + return half_impl::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { + return half_impl::raw_uint16_to_half(0xfbff); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() { + return half_impl::raw_uint16_to_half(0x7c00); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + return half_impl::raw_uint16_to_half(0x7c01); + } +}; + +} // end namespace Eigen + +// C-like standard mathematical functions and trancendentals. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { + Eigen::half result; + result.x = a.x & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { + return Eigen::half(::expf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return Eigen::half(hlog(a)); +#else + return Eigen::half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { + return Eigen::half(::sqrtf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { + return Eigen::half(::floorf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { + return Eigen::half(::ceilf(float(a))); +} + +namespace std { + +#if __cplusplus > 199711L +template <> +struct hash<Eigen::half> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { + return static_cast<std::size_t>(a.x); + } +}; +#endif + +} // end namespace std + + +// Add the missing shfl_xor intrinsic +#if defined(EIGEN_HAS_HIP_FP16) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__) +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + // FIXME + //return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); + return var; +} +#endif + +// ldg() has an overload for __half, but we also need one for Eigen::half. +#if defined(EIGEN_HAS_HIP_FP16) && \ + defined(__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__) && defined(__HIP_ARCH_HAS_DYNAMIC_PARALLEL__) +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { + // FIXME + //return Eigen::half_impl::raw_uint16_to_half( + // __ldg(reinterpret_cast<const unsigned short*>(ptr))); + return *ptr; +} +#endif + + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +namespace Eigen { +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isnan)(const Eigen::half& h) { + return (half_impl::isnan)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isinf)(const Eigen::half& h) { + return (half_impl::isinf)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isfinite)(const Eigen::half& h) { + return (half_impl::isfinite)(h); +} + +} // namespace Eigen +} // namespace numext +#endif + +#endif // EIGEN_HALF_HIP_H diff --git a/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h b/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h new file mode 100644 index 000000000..29c3f4671 --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h @@ -0,0 +1,1019 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_HALF_HIP_H +#define EIGEN_PACKET_MATH_HALF_HIP_H + + +namespace Eigen { +namespace internal { + +// Most of the following operations require arch >= 3.0 +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + +template<> struct is_arithmetic<half2> { enum { value = true }; }; + +template<> struct packet_traits<Eigen::half> : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasAdd = 1, + HasMul = 1, + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 1, + HasLog = 1, + HasLog1p = 1 + }; +}; + +template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; + +template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return half2half2(from); +#else + return __half2half2(from); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) { + return *reinterpret_cast<const half2*>(from); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) { + *reinterpret_cast<half2*>(to) = from; +} + +template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> + __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __ldg((const half2*)from); +#endif +} + +template<> +__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) { + return __low2half(a); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { + __half x = __ushort_as_half(__half_as_ushort(__low2half(a)) & 0x7FFF); + __half y = __ushort_as_half(__half_as_ushort(__high2half(a)) & 0x7FFF); + return __halves2half2(x, y); +} + + +__device__ EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<half2,2>& kernel) { + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) { + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { + return __hadd2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) { + return __hsub2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { + return __hneg2(a); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) { + return __hmul2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { + return __hfma2(a, b, c); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return h2div(a, b); +#else + return __h2div(a, b); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) { + return __hadd(__low2half(a), __high2half(a)); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) { + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) { + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) { + return __hmul(__low2half(a), __high2half(a)); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expm1f(a1); + float r2 = expm1f(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 plog<half2>(const half2& a) { + return h2log(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 pexp<half2>(const half2& a) { + return h2exp(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 psqrt<half2>(const half2& a) { + return h2sqrt(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 prsqrt<half2>(const half2& a) { + return h2rsqrt(a); +} + +#elif defined EIGEN_VECTORIZE_AVX512 + +typedef struct { + __m256i x; +} Packet16h; + + +template<> struct is_arithmetic<Packet16h> { enum { value = true }; }; + +template <> +struct packet_traits<half> : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. + typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; + +template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) { + Packet16h result; + result.x = _mm256_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) { + _mm256_store_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) { + _mm256_storeu_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a.x); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); + + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet16h result; + result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + Packet16h result; + result.x = _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) +{ + Packet16h result; + result.x = _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; + to[stride*8].x = aux[8].x; + to[stride*9].x = aux[9].x; + to[stride*10].x = aux[10].x; + to[stride*11].x = aux[11].x; + to[stride*12].x = aux[12].x; + to[stride*13].x = aux[13].x; + to[stride*14].x = aux[14].x; + to[stride*15].x = aux[15].x; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet16h,16>& kernel) { + __m256i a = kernel.packet[0].x; + __m256i b = kernel.packet[1].x; + __m256i c = kernel.packet[2].x; + __m256i d = kernel.packet[3].x; + __m256i e = kernel.packet[4].x; + __m256i f = kernel.packet[5].x; + __m256i g = kernel.packet[6].x; + __m256i h = kernel.packet[7].x; + __m256i i = kernel.packet[8].x; + __m256i j = kernel.packet[9].x; + __m256i k = kernel.packet[10].x; + __m256i l = kernel.packet[11].x; + __m256i m = kernel.packet[12].x; + __m256i n = kernel.packet[13].x; + __m256i o = kernel.packet[14].x; + __m256i p = kernel.packet[15].x; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0].x = a_p_0; + kernel.packet[1].x = a_p_1; + kernel.packet[2].x = a_p_2; + kernel.packet[3].x = a_p_3; + kernel.packet[4].x = a_p_4; + kernel.packet[5].x = a_p_5; + kernel.packet[6].x = a_p_6; + kernel.packet[7].x = a_p_7; + kernel.packet[8].x = a_p_8; + kernel.packet[9].x = a_p_9; + kernel.packet[10].x = a_p_a; + kernel.packet[11].x = a_p_b; + kernel.packet[12].x = a_p_c; + kernel.packet[13].x = a_p_d; + kernel.packet[14].x = a_p_e; + kernel.packet[15].x = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet16h,8>& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore<half>(in[0], kernel.packet[0]); + pstore<half>(in[1], kernel.packet[1]); + pstore<half>(in[2], kernel.packet[2]); + pstore<half>(in[3], kernel.packet[3]); + pstore<half>(in[4], kernel.packet[4]); + pstore<half>(in[5], kernel.packet[5]); + pstore<half>(in[6], kernel.packet[6]); + pstore<half>(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload<Packet16h>(out[0]); + kernel.packet[1] = pload<Packet16h>(out[1]); + kernel.packet[2] = pload<Packet16h>(out[2]); + kernel.packet[3] = pload<Packet16h>(out[3]); + kernel.packet[4] = pload<Packet16h>(out[4]); + kernel.packet[5] = pload<Packet16h>(out[5]); + kernel.packet[6] = pload<Packet16h>(out[6]); + kernel.packet[7] = pload<Packet16h>(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet16h,4>& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore<half>(in[0], kernel.packet[0]); + pstore<half>(in[1], kernel.packet[1]); + pstore<half>(in[2], kernel.packet[2]); + pstore<half>(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; + } + } + + kernel.packet[0] = pload<Packet16h>(out[0]); + kernel.packet[1] = pload<Packet16h>(out[1]); + kernel.packet[2] = pload<Packet16h>(out[2]); + kernel.packet[3] = pload<Packet16h>(out[3]); +} + + +#elif defined EIGEN_VECTORIZE_AVX + +typedef struct { + __m128i x; +} Packet8h; + + +template<> struct is_arithmetic<Packet8h> { enum { value = true }; }; + +template <> +struct packet_traits<Eigen::half> : default_packet_traits { + typedef Packet8h type; + // There is no half-size packet for Packet8h. + typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) { + Packet8h result; + result.x = _mm_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) { + return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) { + Packet8h result; + result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) { + Packet8h result; + result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad<Packet8h>(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a.x); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet8h result; + result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + Eigen::half h4(aux[4]); + Eigen::half h5(aux[5]); + Eigen::half h6(aux[6]); + Eigen::half h7(aux[7]); + + Packet8h result; + result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) +{ + Packet8h result; + result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) +{ + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux<Packet8f>(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max<Packet8f>(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min<Packet8f>(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul<Packet8f>(af); + return Eigen::half(reduced); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet8h,8>& kernel) { + __m128i a = kernel.packet[0].x; + __m128i b = kernel.packet[1].x; + __m128i c = kernel.packet[2].x; + __m128i d = kernel.packet[3].x; + __m128i e = kernel.packet[4].x; + __m128i f = kernel.packet[5].x; + __m128i g = kernel.packet[6].x; + __m128i h = kernel.packet[7].x; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0].x = a0b0c0d0e0f0g0h0; + kernel.packet[1].x = a1b1c1d1e1f1g1h1; + kernel.packet[2].x = a2b2c2d2e2f2g2h2; + kernel.packet[3].x = a3b3c3d3e3f3g3h3; + kernel.packet[4].x = a4b4c4d4e4f4g4h4; + kernel.packet[5].x = a5b5c5d5e5f5g5h5; + kernel.packet[6].x = a6b6c6d6e6f6g6h6; + kernel.packet[7].x = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet8h,4>& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore<Eigen::half>(in[0], kernel.packet[0]); + pstore<Eigen::half>(in[1], kernel.packet[1]); + pstore<Eigen::half>(in[2], kernel.packet[2]); + pstore<Eigen::half>(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload<Packet8h>(out[0]); + kernel.packet[1] = pload<Packet8h>(out[1]); + kernel.packet[2] = pload<Packet8h>(out[2]); + kernel.packet[3] = pload<Packet8h>(out[3]); +} + + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic<Packet4h> { enum { value = true }; }; + +template <> +struct packet_traits<Eigen::half> : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. + typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x))); +} + +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16)); + h[1] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48)); + h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad<Packet4h>(const Eigen::half* from) { + return pset1<Packet4h>(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride) +{ + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast<unsigned short>(a); + to[stride*1].x = static_cast<unsigned short>(a >> 16); + to[stride*2].x = static_cast<unsigned short>(a >> 32); + to[stride*3].x = static_cast<unsigned short>(a >> 48); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet4h,4>& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); +} + +#endif + +} +} + +#endif // EIGEN_PACKET_MATH_HALF_HIP_H diff --git a/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h b/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h new file mode 100644 index 000000000..915266a9d --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_HIP_H +#define EIGEN_TYPE_CASTING_HIP_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op<float, Eigen::half> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<float, Eigen::half> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op<int, Eigen::half> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __float2half(static_cast<float>(a)); + #else + return Eigen::half(static_cast<float>(a)); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<int, Eigen::half> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op<Eigen::half, float> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __half2float(a); + #else + return static_cast<float>(a); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<Eigen::half, float> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + + +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + +template <> +struct type_casting_traits<Eigen::half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +} + +template <> +struct type_casting_traits<float, Eigen::half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) { + // Simply discard the second half of the input + return __floats2half2_rn(a.x, a.y); +} + +#elif defined EIGEN_VECTORIZE_AVX512 +template <> +struct type_casting_traits<half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits<float, half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) { + return float2half(a); +} + +#elif defined EIGEN_VECTORIZE_AVX + +template <> +struct type_casting_traits<Eigen::half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits<float, Eigen::half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) { + return float2half(a); +} + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +template <> +struct type_casting_traits<Eigen::half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64)); + float f1 = static_cast<float>(h); + h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); + float f2 = static_cast<float>(h); + h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); + float f3 = static_cast<float>(h); + h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); + float f4 = static_cast<float>(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits<float, Eigen::half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_HIP_H diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h new file mode 100644 index 000000000..25375a0a4 --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h @@ -0,0 +1,23 @@ +/* + * math_constants.h - + * HIP equivalent of the CUDA header of the same name + */ + +#ifndef __MATH_CONSTANTS_H__ +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ + +#define HIPRT_INF_F __int_as_float(0x7f800000) +#define HIPRT_NAN_F __int_as_float(0x7fffffff) +#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001) +#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000) +#define HIPRT_ZERO_F 0.0f +#define HIPRT_ONE_F 1.0f + +/* double precision constants */ +#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000) +#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000) + +#endif diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 3eae6b8ca..e269140bd 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -436,6 +436,9 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC explicit + #endif bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -455,6 +458,9 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC explicit + #endif bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index b1791fb3a..a4cde6d95 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -163,7 +163,10 @@ class BlasLinearMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 9c68ecb7d..c6e27f6af 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1008,9 +1008,12 @@ namespace Eigen { # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# ifdef EIGEN_CUDA_ARCH +# if defined(EIGEN_CUDA_ARCH) # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") +# elif defined(EIGEN_HIP_DEVICE_COMPILE) +# define EIGEN_THROW_X(X) asm("s_trap 0") +# define EIGEN_THROW asm("s_trap 0") # else # define EIGEN_THROW_X(X) std::abort() # define EIGEN_THROW std::abort() diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 53300c388..87fcc30f5 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -70,7 +70,20 @@ inline void throw_std_bad_alloc() throw std::bad_alloc(); #else std::size_t huge = static_cast<std::size_t>(-1); + #if defined(EIGEN_HIPCC) + // + // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining), + // and as a consequence the code in the #else block triggers the hipcc warning : + // "no overloaded function has restriction specifiers that are compatible with the ambient context" + // + // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects + // the same on "operator new" + // Reverting code back to the old version in this #if block for the hipcc compiler + // + new int[huge]; + #else ::operator new(huge); + #endif #endif } @@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + result = aligned_malloc(size); + #else result = std::malloc(size); + #endif + #if EIGEN_DEFAULT_ALIGN_BYTES==16 eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); #endif @@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + aligned_free(ptr); + #else std::free(ptr); + #endif + #else handmade_aligned_free(ptr); #endif @@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std: { check_that_malloc_is_allowed(); + #if defined(EIGEN_HIP_DEVICE_COMPILE) + void *result = aligned_malloc(size); + #else void *result = std::malloc(size); + #endif + if(!result && size) throw_std_bad_alloc(); return result; @@ -232,7 +262,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr) { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + aligned_free(ptr); + #else std::free(ptr); + #endif } template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) @@ -493,7 +527,11 @@ template<typename T> struct smart_copy_helper<T,true> { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); + #if defined(EIGEN_HIP_DEVICE_COMPILE) + ::memcpy(target, start, size); + #else std::memcpy(target, start, size); + #endif } }; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 6e5af35c0..ca6fa6ce9 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -16,6 +16,12 @@ #include <math_constants.h> #endif +#if defined(EIGEN_HIP_DEVICE_COMPILE) +#include <cfloat> +#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" +#endif + + #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L #include <cstdint> #endif @@ -175,7 +181,7 @@ template<bool Condition, typename T=void> struct enable_if; template<typename T> struct enable_if<true,T> { typedef T type; }; -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -197,13 +203,31 @@ template<> struct numeric_limits<float> EIGEN_DEVICE_FUNC static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { return CUDART_MAX_NORMAL_F; } + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } EIGEN_DEVICE_FUNC static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF_F; } + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } EIGEN_DEVICE_FUNC - static float quiet_NaN() { return CUDART_NAN_F; } + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } }; template<> struct numeric_limits<double> { @@ -214,9 +238,21 @@ template<> struct numeric_limits<double> EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { return CUDART_INF; } + static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } EIGEN_DEVICE_FUNC - static double quiet_NaN() { return CUDART_NAN; } + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } }; template<> struct numeric_limits<int> { @@ -529,13 +565,13 @@ template<typename T, typename U> struct scalar_product_traits namespace numext { -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) using internal::device::numeric_limits; #else using std::numeric_limits; diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 040f8d3bb..bf28edc0e 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -354,6 +354,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: + EIGEN_DEVICE_FUNC static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index a24deb96a..e977b9623 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1299,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif }//end deflation -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 7d2d63722..f818ae840 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -19,7 +19,9 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) - if(EIGEN_TEST_CUDA_CLANG) + if(EIGEN_TEST_HIP) + hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP") + elseif(EIGEN_TEST_CUDA_CLANG) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) if(CUDA_64_BIT_DEVICE_CODE) link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") @@ -491,6 +493,11 @@ macro(ei_testing_print_summary) else() message(STATUS "CUDA: OFF") endif() + if(EIGEN_TEST_HIP) + message(STATUS "HIP: ON (using hipcc)") + else() + message(STATUS "HIP: OFF") + endif() endif() # vectorization / alignment options diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e1eef086e..4a5c1d36d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -407,6 +407,48 @@ endif(CUDA_FOUND) endif(EIGEN_TEST_CUDA) +# HIP unit tests +option(EIGEN_TEST_HIP "Add HIP support." OFF) +if (EIGEN_TEST_HIP) + + set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") + + if (EXISTS ${HIP_PATH}) + + list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) + + find_package(HIP REQUIRED) + if (HIP_FOUND) + + execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) + + if (${HIP_PLATFORM} STREQUAL "hcc") + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${HIP_PATH}/include) + + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + ei_add_test(hip_basic) + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) + + elseif (${HIP_PLATFORM} STREQUAL "nvcc") + message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") + else () + message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") + endif() + + endif(HIP_FOUND) + + else () + + message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") + + endif() + +endif(EIGEN_TEST_HIP) + + + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests) add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON) diff --git a/test/hip_basic.cu b/test/hip_basic.cu new file mode 100644 index 000000000..2e1bf94a4 --- /dev/null +++ b/test/hip_basic.cu @@ -0,0 +1,172 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// workaround issue between gcc >= 4.7 and cuda 5.5 +#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7) + #undef _GLIBCXX_ATOMIC_BUILTINS + #undef _GLIBCXX_USE_INT128 +#endif + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC hip_basic +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#include <hip/hip_runtime.h> + +#include "main.h" +#include "hip_common.h" + +// Check that dense modules can be properly parsed by hipcc +#include <Eigen/Dense> + +// struct Foo{ +// EIGEN_DEVICE_FUNC +// void operator()(int i, const float* mats, float* vecs) const { +// using namespace Eigen; +// // Matrix3f M(data); +// // Vector3f x(data+9); +// // Map<Vector3f>(data+9) = M.inverse() * x; +// Matrix3f M(mats+i/16); +// Vector3f x(vecs+i*3); +// // using std::min; +// // using std::sqrt; +// Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x(); +// //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum(); +// } +// }; + +template<typename T> +struct coeff_wise { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T x1(in+i); + T x2(in+i+1); + T x3(in+i+2); + Map<T> res(out+i*T::MaxSizeAtCompileTime); + + res.array() += (in[0] * x1 + x2).array() * x3.array(); + } +}; + +template<typename T> +struct replicate { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T x1(in+i); + int step = x1.size() * 4; + int stride = 3 * step; + + typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType; + MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2); + MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3); + MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3); + } +}; + +template<typename T> +struct redux { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + int N = 10; + T x1(in+i); + out[i*N+0] = x1.minCoeff(); + out[i*N+1] = x1.maxCoeff(); + out[i*N+2] = x1.sum(); + out[i*N+3] = x1.prod(); + out[i*N+4] = x1.matrix().squaredNorm(); + out[i*N+5] = x1.matrix().norm(); + out[i*N+6] = x1.colwise().sum().maxCoeff(); + out[i*N+7] = x1.rowwise().maxCoeff().sum(); + out[i*N+8] = x1.matrix().colwise().squaredNorm().sum(); + } +}; + +template<typename T1, typename T2> +struct prod_test { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const + { + using namespace Eigen; + typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3; + T1 x1(in+i); + T2 x2(in+i+1); + Map<T3> res(out+i*T3::MaxSizeAtCompileTime); + res += in[i] * x1 * x2; + } +}; + +template<typename T1, typename T2> +struct diagonal { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const + { + using namespace Eigen; + T1 x1(in+i); + Map<T2> res(out+i*T2::MaxSizeAtCompileTime); + res += x1.diagonal(); + } +}; + +template<typename T> +struct eigenvalues { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec; + T M(in+i); + Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime); + T A = M*M.adjoint(); + SelfAdjointEigenSolver<T> eig; + eig.computeDirect(M); + res = eig.eigenvalues(); + } +}; + +void test_hip_basic() +{ + ei_test_init_hip(); + + int nthreads = 100; + Eigen::VectorXf in, out; + + #ifndef __HIP_DEVICE_COMPILE__ + int data_size = nthreads * 512; + in.setRandom(data_size); + out.setRandom(data_size); + #endif + + CALL_SUBTEST( run_and_compare_to_hip(coeff_wise<Vector3f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(coeff_wise<Array44f>(), nthreads, in, out) ); + + // FIXME compile fails when we uncomment the followig two tests + // CALL_SUBTEST( run_and_compare_to_hip(replicate<Array4f>(), nthreads, in, out) ); + // CALL_SUBTEST( run_and_compare_to_hip(replicate<Array33f>(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(redux<Array4f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(redux<Matrix3f>(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) ); + + // FIXME : Runtime failure occurs when we uncomment the following two tests + // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues<Matrix3f>(), nthreads, in, out) ); + // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues<Matrix2f>(), nthreads, in, out) ); + +} diff --git a/test/hip_common.h b/test/hip_common.h new file mode 100644 index 000000000..251585c52 --- /dev/null +++ b/test/hip_common.h @@ -0,0 +1,103 @@ + +#ifndef EIGEN_TEST_HIP_COMMON_H +#define EIGEN_TEST_HIP_COMMON_H + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include <iostream> + +#ifndef __HIPCC__ +dim3 threadIdx, blockDim, blockIdx; +#endif + +template<typename Kernel, typename Input, typename Output> +void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out) +{ + for(int i=0; i<n; i++) + ker(i, in.data(), out.data()); +} + + +template<typename Kernel, typename Input, typename Output> +__global__ __attribute__((used)) +void run_on_hip_meta_kernel(const Kernel ker, int n, const Input* in, Output* out) +{ + int i = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x; + if(i<n) { + ker(i, in, out); + } +} + + +template<typename Kernel, typename Input, typename Output> +void run_on_hip(const Kernel& ker, int n, const Input& in, Output& out) +{ + typename Input::Scalar* d_in; + typename Output::Scalar* d_out; + std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar); + std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar); + + hipMalloc((void**)(&d_in), in_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_out, out.data(), out_bytes, hipMemcpyHostToDevice); + + // Simple and non-optimal 1D mapping assuming n is not too large + // That's only for unit testing! + dim3 Blocks(128); + dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) ); + + hipDeviceSynchronize(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_hip_meta_kernel<Kernel, + typename std::decay<decltype(*d_in)>::type, + typename std::decay<decltype(*d_out)>::type>), + dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out); + hipDeviceSynchronize(); + + // check inputs have not been modified + hipMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, hipMemcpyDeviceToHost); + hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost); + + hipFree(d_in); + hipFree(d_out); +} + + +template<typename Kernel, typename Input, typename Output> +void run_and_compare_to_hip(const Kernel& ker, int n, const Input& in, Output& out) +{ + Input in_ref, in_hip; + Output out_ref, out_hip; + #ifndef __HIP_DEVICE_COMPILE__ + in_ref = in_hip = in; + out_ref = out_hip = out; + #endif + run_on_cpu (ker, n, in_ref, out_ref); + run_on_hip(ker, n, in_hip, out_hip); + #ifndef __HIP_DEVICE_COMPILE__ + VERIFY_IS_APPROX(in_ref, in_hip); + VERIFY_IS_APPROX(out_ref, out_hip); + #endif +} + + +void ei_test_init_hip() +{ + int device = 0; + hipDeviceProp_t deviceProp; + hipGetDeviceProperties(&deviceProp, device); + std::cout << "HIP device info:\n"; + std::cout << " name: " << deviceProp.name << "\n"; + std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n"; + std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n"; + std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n"; + std::cout << " warpSize: " << deviceProp.warpSize << "\n"; + std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n"; + std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n"; + std::cout << " clockRate: " << deviceProp.clockRate << "\n"; + std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n"; + std::cout << " computeMode: " << deviceProp.computeMode << "\n"; +} + +#endif // EIGEN_TEST_HIP_COMMON_H diff --git a/test/main.h b/test/main.h index 0fcd6cb76..79717a532 100644 --- a/test/main.h +++ b/test/main.h @@ -67,11 +67,17 @@ // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. -#define min(A,B) please_protect_your_min_with_parentheses -#define max(A,B) please_protect_your_max_with_parentheses -#define isnan(X) please_protect_your_isnan_with_parentheses -#define isinf(X) please_protect_your_isinf_with_parentheses -#define isfinite(X) please_protect_your_isfinite_with_parentheses +#if !defined(__HIPCC__) + // HIP headers include the <thread> header which contains not-parenthesized + // calls to "max", triggering the following check and causing the compile to fail + // so disabling the following checks for HIP + #define min(A,B) please_protect_your_min_with_parentheses + #define max(A,B) please_protect_your_max_with_parentheses + #define isnan(X) please_protect_your_isnan_with_parentheses + #define isinf(X) please_protect_your_isinf_with_parentheses + #define isfinite(X) please_protect_your_isfinite_with_parentheses +#endif + #ifdef M_PI #undef M_PI #endif @@ -154,7 +160,7 @@ namespace Eigen #define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "") -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) #define EIGEN_EXCEPTIONS #endif @@ -233,7 +239,7 @@ namespace Eigen } #endif //EIGEN_EXCEPTIONS - #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS + #elif !defined(__CUDACC__) && !defined(__HIPCC__)// EIGEN_DEBUG_ASSERTS // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3 #define eigen_assert(a) \ if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ @@ -290,7 +296,7 @@ namespace Eigen std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n"; #endif - #if !defined(__CUDACC__) + #if !defined(__CUDACC__) && !defined(__HIPCC__) #define EIGEN_USE_CUSTOM_ASSERT #endif diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index d243fe035..4b7c7d724 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t; #endif #ifdef EIGEN_USE_GPU -#include <iostream> -#include <cuda_runtime.h> -#if __cplusplus >= 201103L -#include <atomic> -#include <unistd.h> -#endif + #include <iostream> + #if defined(EIGEN_USE_HIP) + #include <hip/hip_runtime.h> + #else + #include <cuda_runtime.h> + #endif + #if __cplusplus >= 201103L + #include <atomic> + #include <unistd.h> + #endif #endif #include "src/Tensor/TensorMacros.h" @@ -95,7 +99,11 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceCuda.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorDeviceHip.h" +#else + #include "src/Tensor/TensorDeviceCuda.h" +#endif #include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorDimensionList.h" @@ -112,16 +120,28 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionCuda.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorReductionHip.h" +#else + #include "src/Tensor/TensorReductionCuda.h" +#endif #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionCuda.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorContractionHip.h" +#else + #include "src/Tensor/TensorContractionCuda.h" +#endif #include "src/Tensor/TensorConversion.h" -#include "src/Tensor/TensorConvolution.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorConvolutionHip.h" +#else + #include "src/Tensor/TensorConvolution.h" +#endif #include "src/Tensor/TensorFFT.h" #include "src/Tensor/TensorPatch.h" #include "src/Tensor/TensorImagePatch.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e72ddb4a9..979fcf4d9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -448,7 +448,10 @@ struct TensorContractionEvaluatorBase } template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const Index cols = m_k_size; @@ -489,7 +492,10 @@ struct TensorContractionEvaluatorBase } template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemm(Scalar* buffer) const { #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) if (m_can_use_xsmm) { evalGemmXSMM(buffer); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index d34f9caee..4853dd37b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -28,7 +28,10 @@ class TensorContractionBlocking { typedef typename LhsMapper::Scalar LhsScalar; typedef typename RhsMapper::Scalar RhsScalar; - EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h new file mode 100644 index 000000000..7561846a3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h @@ -0,0 +1,1521 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com> +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) + +namespace Eigen { + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool needs_edge_check> +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in hipThreadIdx_x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_x * 9 + hipThreadIdx_z; + const Index rhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_z * 8 + hipThreadIdx_x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // hipThreadIdx_x: the vertical position in an 8x8 block + // hipThreadIdx_y: the vertical index of the 8x8 block in the grid + // hipThreadIdx_z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = hipThreadIdx_x + 8 * hipThreadIdx_y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + hipThreadIdx_z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + hipThreadIdx_z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + hipThreadIdx_z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + hipThreadIdx_z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + hipThreadIdx_z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + hipThreadIdx_z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + hipThreadIdx_z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + hipThreadIdx_z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + hipThreadIdx_z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + hipThreadIdx_z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + hipThreadIdx_z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + hipThreadIdx_z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + hipThreadIdx_z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + hipThreadIdx_z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + hipThreadIdx_z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + hipThreadIdx_z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op<int, Scalar> conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[hipThreadIdx_x + 9 * hipThreadIdx_y]; + const Scalar* rhs_block = &rhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (hipThreadIdx_x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - hipThreadIdx_y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - hipThreadIdx_z + 7) / 8), 8); + + if (hipThreadIdx_x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? + Scalar val0 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 0]; + Scalar val1 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 1]; + Scalar val2 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 2]; + Scalar val3 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 3]; + Scalar val4 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 4]; + Scalar val5 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 5]; + Scalar val6 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 6]; + Scalar val7 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 7]; + + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 0) = val0; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 1) = val1; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 2) = val2; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 3) = val3; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 4) = val4; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 5) = val5; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 6) = val6; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j]; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * j) = val; + } + } + } +#undef res +} + + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(512, 1) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + /*reg = lhs.template loadPacket<Unaligned>(row, col);*/ \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row + 3, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + /*reg =lhs.template loadPacket<Unaligned>(row, col);*/ \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row + 3, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+hipThreadIdx_x*4; + + for (Index k = 0; k < k_size; k += 16) { + //lhs_pf0 = internal::pset1<float4>(0); + //rhs_pf0 = internal::pset1<float4>(0); + lhs_pf0 = make_float4(0, 0, 0, 0); + rhs_pf0 = make_float4(0, 0, 0, 0); + + Index lhs_horiz = hipThreadIdx_y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(hipThreadIdx_x%4)*4; + Index rhs_horiz0 = (hipThreadIdx_x>>2)+hipThreadIdx_y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((hipThreadIdx_x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((hipThreadIdx_x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2][hipThreadIdx_x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2+32][hipThreadIdx_x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[hipThreadIdx_y+16][hipThreadIdx_x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][hipThreadIdx_x]; + float2 fl2 = lhs_shmem2[koff + 16][hipThreadIdx_x]; + + int start_feature = hipThreadIdx_y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = hipThreadIdx_y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+hipThreadIdx_x*4+(hipThreadIdx_y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + /*lhs_pf0 = internal::pset1<float4>(0); + lhs_pf1 = internal::pset1<float4>(0); + lhs_pf2 = internal::pset1<float4>(0); + lhs_pf3 = internal::pset1<float4>(0); + + rhs_pf0 = internal::pset1<float4>(0); + rhs_pf1 = internal::pset1<float4>(0);*/ + + + lhs_pf0 = make_float4(0, 0, 0, 0); + lhs_pf1 = make_float4(0, 0, 0, 0); + lhs_pf2 = make_float4(0, 0, 0, 0); + lhs_pf3 = make_float4(0, 0, 0, 0); + + rhs_pf0 = make_float4(0, 0, 0, 0); + rhs_pf1 = make_float4(0, 0, 0, 0); + + if (!CHECK_LHS_BOUNDARY) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16)); + //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16)); + //lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+24)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+16)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k+8)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + //lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (hipThreadIdx_y/4+k)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+hipThreadIdx_x*4; + Index rhs_horiz0 = hipThreadIdx_y*2+base_n; + Index rhs_horiz1 = hipThreadIdx_y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + //rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+hipThreadIdx_x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+hipThreadIdx_x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[hipThreadIdx_y+32][hipThreadIdx_x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[hipThreadIdx_y+64][hipThreadIdx_x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[hipThreadIdx_y+96][hipThreadIdx_x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[hipThreadIdx_y/4][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[hipThreadIdx_y/4+8][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[hipThreadIdx_y/4+16][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[hipThreadIdx_y/4+24][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[hipThreadIdx_y/4 + 32][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[hipThreadIdx_y/4 + 40][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[hipThreadIdx_y/4 + 48][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[hipThreadIdx_y/4 + 56][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; + + // first feature is at (hipThreadIdx_y/4) * 8 last is at start + 8. + int start_feature = (hipThreadIdx_y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (hipThreadIdx_y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256, 1) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256, 1) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + // We need to redefine this method to make hipcc happy + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, true, true, Unaligned>(buffer); + } + else { + evalTyped<true, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, false, true, Unaligned>(buffer); + } + else { + evalTyped<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, true, true, Unaligned>(buffer); + } + else { + evalTyped<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, false, true, Unaligned>(buffer); + } + else { + evalTyped<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } + }; + + template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } + } + }; + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, 4, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, 4, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setHipSharedMemConfig(hipSharedMemBankSizeEightByte); + LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_HIPCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h new file mode 100644 index 000000000..ba9971050 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h @@ -0,0 +1,1119 @@ +//#include "hip/hip_runtime.h" +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { + +template <typename Index, typename InputDims, int NumKernelDims, int Layout> +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, + const array<Index, NumKernelDims>& indices) { + + array<Index, NumDims> dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array<Index, NumDims> inputStrides; + array<Index, NumDims> outputStrides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array<Index, NumDims> hipInputDimensions; + array<Index, NumDims> hipOutputDimensions; + array<Index, NumDims> tmp = dimensions; + array<Index, NumDims> ordering; + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + hipInputDimensions[index] = input_dims[indices[i]]; + hipOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? NumKernelDims + : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + hipInputDimensions[written] = input_dims[i]; + hipOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_hipInputStrides[i] = + m_hipInputStrides[i - 1] * hipInputDimensions[i - 1]; + m_hipOutputStrides[i] = + m_hipOutputStrides[i - 1] * hipOutputDimensions[i - 1]; + } else { + m_hipInputStrides[i] = 1; + m_hipOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_hipInputStrides[i] = + m_hipInputStrides[i + 1] * hipInputDimensions[i + 1]; + m_hipOutputStrides[i] = + m_hipOutputStrides[i + 1] * hipOutputDimensions[i + 1]; + } else { + m_hipInputStrides[i] = 1; + m_hipOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_hipInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_hipInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_hipInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_hipInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_hipOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_hipOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_hipOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_hipOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; + } + + private: + static const int NumDims = internal::array_size<InputDims>::value; + array<Index, NumDims> m_inputStrides; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_hipInputStrides; + array<Index, NumDims> m_hipOutputStrides; +}; + + + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename InputXprType::Scalar, + typename KernelXprType::Scalar>::ret Scalar; + typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, + typename traits<KernelXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<InputXprType>::Index, + typename traits<KernelXprType>::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<InputXprType>::NumDimensions; + static const int Layout = traits<InputXprType>::Layout; + + enum { + Flags = 0 + }; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> +{ + typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> +{ + typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type; +}; + +} // end namespace internal + + + +template<typename Indices, typename InputXprType, typename KernelXprType> +class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType, + typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename InputXprType::Nested>::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename KernelXprType::Nested>::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + + enum { + IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess, + Layout = TensorEvaluator<InputArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1<PacketReturnType>(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload<PacketReturnType>(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template <typename Packet> + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value; + internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array<Index, NumDims> m_inputStride; + array<Index, NumDims> m_outputStride; + + array<Index, NumKernelDims> m_indexStride; + array<Index, NumKernelDims> m_kernelStride; + TensorEvaluator<InputArgType, Device> m_inputImpl; + TensorEvaluator<KernelArgType, Device> m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) + +template <int StaticKernelSize> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize<Dynamic> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSize> +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = hipBlockIdx_y * hipBlockDim_y; + const int plane_stride = hipBlockDim_y * hipGridDim_y; + + for (int p = first_plane + hipThreadIdx_y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = hipThreadIdx_y * num_x_input; + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims, + int StaticKernelSizeX, int StaticKernelSizeY> +__global__ void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = hipBlockIdx_y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = hipBlockIdx_z * hipBlockDim_z; + const int plane_stride = hipBlockDim_z * hipGridDim_z; + + for (int p = first_plane + hipThreadIdx_z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = hipThreadIdx_z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template <typename InputEvaluator, typename Index, typename InputDims> +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + // Load inputs to shared memory + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = hipBlockIdx_y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = hipBlockIdx_z * maxZ; + const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = hipThreadIdx_z; k < num_z_input; k += hipBlockDim_z) { + for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + for (int k = hipThreadIdx_z; k < num_z_output; k += hipBlockDim_z) { + for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template<typename Indices, typename InputArgType, typename KernelArgType> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value; + internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxHipThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxHipThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumHipMultiProcessors(); + const int hipWarpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast<int>(Layout) == static_cast<int>(ColMajor) + ? 0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini<int>(inner_dim, numX); + const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((hipWarpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini<int>(inner_dim, numP); + maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(hipWarpSize, maxX); + block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 1> indices(m_indices[0]); + const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini<int>(inner_dim, numX); + const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini<int>(1024/block_size.x, maxY); + block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]); + const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; + const int idxY = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; + const int idxZ = + static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, + numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator<InputArgType, GpuDevice> m_inputImpl; + TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 341889e88..e94e577fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -35,9 +35,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef EIGEN_CUDA_ARCH +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running on the host CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; #else // Running on a CUDA device return 32; @@ -45,7 +48,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running on the host CPU return l1CacheSize(); #else @@ -55,7 +58,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running single threaded on the host CPU return l3CacheSize(); #else @@ -65,10 +68,14 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef EIGEN_CUDA_ARCH +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; #else // Running on a CUDA device return EIGEN_CUDA_ARCH / 100; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h new file mode 100644 index 000000000..c0e110987 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h @@ -0,0 +1,352 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H + +#if defined(EIGEN_HIPCC) +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#endif +#include <unistd.h> //for sleep function + +namespace Eigen { + +static const int kHipScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// HIP streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const hipStream_t& stream() const = 0; + virtual const hipDeviceProp_t& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +static hipDeviceProp_t* m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. +#if 0 && __cplusplus >= 201103L + static std::atomic<bool> first(true); + if (first.exchange(false)) { +#else + static bool first = true; + if (first) { + first = false; +#endif + // We're the first thread to reach this point. + int num_devices; + hipError_t status = hipGetDeviceCount(&num_devices); + if (status != hipSuccess) { + std::cerr << "Failed to get the number of HIP devices: " + << hipGetErrorString(status) + << std::endl; + assert(status == hipSuccess); + } + m_deviceProperties = new hipDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = hipGetDeviceProperties(&m_deviceProperties[i], i); + if (status != hipSuccess) { + std::cerr << "Failed to initialize HIP device #" + << i + << ": " + << hipGetErrorString(status) + << std::endl; + assert(status == hipSuccess); + } + } + +#if 0 && __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_release); +#endif + m_devicePropInitialized = true; + } else { + // Wait for the other thread to inititialize the properties. + while (!m_devicePropInitialized) { +#if 0 && __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_acquire); +#endif + sleep(1); + } + } + } +} + +static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00; + +class HipStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + hipGetDevice(&device_); + initializeDeviceProp(); + } + // Use the default stream on the specified device + HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { + initializeDeviceProp(); + } + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + HipStreamDevice(const hipStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + hipGetDevice(&device_); + } else { + int num_devices; + hipError_t err = hipGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + assert(device < num_devices); + device_ = device; + } + initializeDeviceProp(); + } + + virtual ~HipStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const hipStream_t& stream() const { return *stream_; } + const hipDeviceProp_t& deviceProperties() const { + return m_deviceProperties[device_]; + } + virtual void* allocate(size_t num_bytes) const { + hipError_t err = hipSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + void* result; + err = hipMalloc(&result, num_bytes); + assert(err == hipSuccess); + assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + hipError_t err = hipSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + assert(buffer != NULL); + err = hipFree(buffer); + assert(err == hipSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kHipScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast<char*>(scratchpad()) + kHipScratchSize; + semaphore_ = reinterpret_cast<unsigned int*>(scratch); + //hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int)); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + } + return semaphore_; + } + + private: + const hipStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const hipStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#if !defined(EIGEN_HIP_DEVICE_COMPILE) + hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + hipError_t err = + hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + hipError_t err = + hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#if !defined(EIGEN_HIP_DEVICE_COMPILE) + //TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream()); + hipError_t err = hipMemset(buffer, c, n); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip devices. + return firstLevelCacheSize(); + } + +// FIXME - this will move into HIP +#if defined(EIGEN_HIP_DEVICE_COMPILE) +#undef assert +#define assert(COND) +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#if defined(EIGEN_HIPCC) && \ + !defined(EIGEN_HIP_DEVICE_COMPILE) + hipError_t err = hipStreamSynchronize(stream_->stream()); + if (err != hipSuccess) { + std::cerr << "Error detected in HIP stream: " + << hipGetErrorString(err) + << std::endl; + assert(err == hipSuccess); + } +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { + return stream_->deviceProperties().minor; + } + + EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + + // This function checks if the HIP runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#if defined(EIGEN_HIPCC) + hipError_t error = hipStreamQuery(stream_->stream()); + return (error == hipSuccess) || (error == hipErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \ + assert(hipGetLastError() == hipSuccess); + + +// FIXME: Should be device and kernel specific. +#if defined(EIGEN_HIPCC) +static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) { +#if !defined(EIGEN_HIP_DEVICE_COMPILE) + hipError_t status = hipDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + assert(status == hipSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 0ffe68ab3..24a57970a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> { }; -#if defined(EIGEN_CUDACC) +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template <typename Evaluator, typename Index, bool Vectorizable> struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE @@ -250,6 +250,17 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { +#if defined(EIGEN_HIPCC) + const int block_size = device.maxHipThreadsPerBlock(); + const int max_blocks = device.getNumHipMultiProcessors() * + device.maxHipThreadsPerMultiProcessor() / block_size; + const Index size = array_prod(evaluator.dimensions()); + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size); +#else const int block_size = device.maxCudaThreadsPerBlock(); const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; @@ -260,11 +271,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( LAUNCH_CUDA_KERNEL( (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); +#endif } evaluator.cleanup(); } -#endif // EIGEN_CUDACC +#endif // EIGEN_CUDACC || EIGEN_HIPCC #endif // EIGEN_USE_GPU // SYCL Executor policy diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c015ce196..b8f0bc798 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -109,7 +109,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 3209fecd3..835efbf72 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -350,7 +350,11 @@ struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> { namespace internal { -template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { +template<typename FirstType, typename... OtherTypes> + #if defined(EIGEN_HIPCC) + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + #endif + size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { size_t result = 1; for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) { result *= sizes[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index c9e61f359..8e1ba486d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 5431eb740..de1075cc1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16) +#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16))) template <> struct PacketType<half, GpuDevice> { typedef half2 type; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index e59074506..2a979845b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -858,7 +858,10 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, } return inputIndex; } - + + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { #ifndef __SYCL_DEVICE_ONLY__ return numext::maxi(min, numext::mini(max,value)); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 230915db2..71536a4b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -16,7 +16,7 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) // We don't support 3d kernels since we currently only use 1 and // 2d kernels. assert(threadIdx.z == 0); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index da0ffe728..d2fb3fd32 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -334,12 +334,12 @@ struct OuterReducer { }; -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) template <int B, int N, typename S, typename R, typename I> __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) template <typename S, typename R, typename I> __global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template <int B, int N, typename S, typename R, typename I> @@ -495,7 +495,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) { + EIGEN_STRONG_INLINE + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. @@ -694,9 +698,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, #ifdef EIGEN_USE_THREADS template <typename S, typename O, bool V> friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); @@ -774,14 +778,22 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, // Indexed by reduced dimensions. array<Index, NumReducedDims> m_reducedDims; +#if defined(EIGEN_HIPCC) + public: +#endif + // Evaluator for the input expression. TensorEvaluator<ArgType, Device> m_impl; +#if defined(EIGEN_HIPCC) + private: +#endif + // Operation to apply for computing the reduction. Op m_reducer; // For full reductions -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h new file mode 100644 index 000000000..5304a22c5 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h @@ -0,0 +1,815 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" +#endif + +#if defined(EIGEN_HIPCC) +#define HIP_WARP_SIZE 64 +#endif + +namespace Eigen { +namespace internal { + + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple hip thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another hip thread +// updated the content of the output address it will try again. +template <typename T, typename R> +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__) + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast<unsigned int*>(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast<T*>(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + assert(0 && "Wordsize not supported"); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +// We extend atomicExch to support extra data types +template <typename Type> +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#if defined(EIGEN_HAS_HIP_FP16) +template <template <typename T> class R> +__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) { + unsigned int oldval = *reinterpret_cast<unsigned int*>(output); + unsigned int newval = oldval; + reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); + if (newval == oldval) { + return; + } + } +} +#endif + +template <> +__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\ + defined(__HIP_ARCH_HAS_WARP_SHUFFLE__) + atomicAdd(output, accum); +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + + +template <typename CoeffType, typename Index> +__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { + const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const Index num_threads = hipBlockDim_x * hipGridDim_x; + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = val; + } +} + + +template <int BlockSize, int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void FullReductionKernel(const Self input, Index num_coeffs, + typename Self::CoeffReturnType* output, unsigned int* semaphore, Reducer reducer) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\ + defined(__HIP_ARCH_HAS_WARP_SHUFFLE__) + // Initialize the output value + const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + hipThreadIdx_x; + if (hipGridDim_x == 1) { + if (first_index == 0) { + *output = reducer.initialize(); + } + } + else { + if (hipThreadIdx_x == 0) { + unsigned int block = atomicCAS(semaphore, 0u, 1u); + if (block == 0) { + // We're the first block to run, initialize the output value + atomicExchCustom(output, reducer.initialize()); + __threadfence(); + atomicExch(semaphore, 2u); + } + else { + // Wait for the first block to initialize the output value. + // Use atomicCAS here to ensure that the reads aren't cached + unsigned int val; + do { + val = atomicCAS(semaphore, 2u, 2u); + } + while (val < 2u); + } + } + } + + __syncthreads(); + + eigen_assert(hipGridDim_x == 1 || *semaphore >= 2u); + + typename Self::CoeffReturnType accum = reducer.initialize(); + Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize); + for (Index i = 0; i < max_iter; i+=BlockSize) { + const Index index = first_index + i; + eigen_assert(index < num_coeffs); + typename Self::CoeffReturnType val = input.m_impl.coeff(index); + reducer.reduce(val, &accum); + } + +#pragma unroll + for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) { + // XXX use std::is_floating_point to determine the type of accum + if (std::is_floating_point<typename Self::CoeffReturnType>::value) { + reducer.reduce(__shfl_down(static_cast<float>(accum), offset, HIP_WARP_SIZE), &accum); + } else { + reducer.reduce(__shfl_down(static_cast<int>(accum), offset, HIP_WARP_SIZE), &accum); + } + } + + if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) { + atomicReduce(output, accum, reducer); + } + + if (hipGridDim_x > 1 && hipThreadIdx_x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, hipGridDim_x + 1); + __threadfence_system(); + } + +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + + +#if defined(EIGEN_HAS_HIP_FP16) +template <typename Self, + typename Reducer, typename Index> +__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) { + eigen_assert(hipBlockDim_x == 1); + eigen_assert(hipGridDim_x == 1); + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } +} + +template <typename Self, + typename Reducer, typename Index> +__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) { + const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const Index num_threads = hipBlockDim_x * hipGridDim_x; + const Index num_packets = num_coeffs / 2; + for (Index i = thread_id; i < num_packets; i += num_threads) { + ((half2*)output)[i] = reducer.template initializePacket<half2>(); + } + + if (thread_id == 0 && num_coeffs % 2 != 0) { + output[num_coeffs-1] = reducer.initialize(); + } +} + +template <int BlockSize, int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, + half* output, half2* scratch) { + eigen_assert(NumPerThread % 2 == 0); + + const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + 2*hipThreadIdx_x; + + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + if (hipGridDim_x == 1 && first_index == 0) { + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } + __syncthreads(); + } + + half2 accum = reducer.template initializePacket<half2>(); + const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + 2*i; + eigen_assert(index + 1 < num_coeffs); + half2 val = input.m_impl.template packet<Unaligned>(index); + reducer.reducePacket(val, &accum); + } + +#pragma unroll + for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { int i; half2 h; } wka_in, wka_out; + wka_in.h = accum; + wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE); + reducer.reducePacket(wka_out.h, &accum); + } + + if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) { + atomicReduce(scratch, accum, reducer); + } + + __syncthreads(); + + if (hipGridDim_x == 1 && first_index == 0) { + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; + } +} + +template <typename Op> +__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) { + eigen_assert(hipThreadIdx_x == 1); + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; +} + +#endif + +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { + assert(false && "Should only be called on doubles, floats and half floats"); + } +}; + +namespace { + std::mutex __eigen_reduction_hip_mutex; +} + +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { + // guard FullReductionLauncher with a mutex so only 1 FullReductionKernel + // is dispatched at a time + std::lock_guard<std::mutex> lock(__eigen_reduction_hip_mutex); + + typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType Scalar; + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + + unsigned int* semaphore = NULL; + if (num_blocks > 1) { + semaphore = device.semaphore(); + + unsigned int semaphore_host = 0xFF; + hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost); + if (semaphore_host != 0) { + std::cerr << "[WARN][EIGEN][FullReductionLauncher] incorrect semaphore value: " + << semaphore_host << "\n"; + // wait for all commands on the device to complete so semaphore value + // is reset to 0 + hipDeviceSynchronize(); + + // read again + hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost); + if (semaphore_host != 0) { + std::cerr << "[ERROR][EIGEN][FullReductionLauncher] CRITICAL incorrect semaphore value: " + << semaphore_host << ", apply manual override to 0\n"; + + // force set semaphore value to be 0 + semaphore_host = 0; + hipMemcpy(semaphore, &semaphore_host, sizeof(unsigned int), hipMemcpyHostToDevice); + } + } + } + + hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), self, num_coeffs, output, semaphore, reducer); + } +}; + +#if defined(EIGEN_HAS_HIP_FP16) +template <typename Self, typename Op> +struct FullReductionLauncher<Self, Op, Eigen::half, false> { + static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { + assert(false && "Should not be called since there is no packet accessor"); + } +}; + +template <typename Self, typename Op> +struct FullReductionLauncher<Self, Op, Eigen::half, true> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + half2* scratch = static_cast<half2*>(device.scratchpad()); + + if (num_blocks > 1) { + // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), + dim3(1), dim3(1), 0, device.stream(), reducer, self, num_coeffs, scratch); + } + + hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs, output, scratch); + + if (num_blocks > 1) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionCleanupKernelHalfFloat<Op>), + dim3(1), dim3(1), 0, device.stream(), reducer, output, scratch); + } + } +}; +#endif + + +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, GpuDevice, Vectorizable> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple cases + // of doubles, floats and half floats +#if defined(EIGEN_HAS_HIP_FP16) + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); +#else + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); +#endif + + template <typename OutputType> + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return; + } + + FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs); + } +}; + + +template <int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\ + defined(__HIP_ARCH_HAS_WARP_SHUFFLE__) + typedef typename Self::CoeffReturnType Type; + eigen_assert(hipBlockDim_y == 1); + eigen_assert(hipBlockDim_z == 1); + eigen_assert(hipGridDim_y == 1); + eigen_assert(hipGridDim_z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + + const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread); + const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; + + const Index num_threads = hipBlockDim_x * hipGridDim_x; + const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (hipGridDim_x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) { + const Index row = i / input_col_blocks; + + if (row < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x; + + Type reduced_val = reducer.initialize(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1); + if (last_col >= num_coeffs_to_reduce) { + for (Index col = col_begin + hipBlockDim_x * j; col < num_coeffs_to_reduce; col += hipBlockDim_x) { + const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + reducer.reduce(val, &reduced_val); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + hipBlockDim_x * (j + k); + reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); + } + } + } + +#pragma unroll + for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) { + // XXX use std::is_floating_point to determine the type of reduced_val + if (std::is_floating_point<Type>::value) { + reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val); + } else { + reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val); + } + } + + if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); + } + } + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +#if defined(EIGEN_HAS_HIP_FP16) + +template <int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + half* output) { + eigen_assert(hipBlockDim_y == 1); + eigen_assert(hipBlockDim_z == 1); + eigen_assert(hipGridDim_y == 1); + eigen_assert(hipGridDim_z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + eigen_assert(unroll_times % 2 == 0); + + const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread * 2); + const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2); + + const Index num_threads = hipBlockDim_x * hipGridDim_x; + const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (hipGridDim_x == 1) { + Index i = 2*thread_id; + for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) { + half* loc = output + i; + *((half2*)loc) = reducer.template initializePacket<half2>(); + } + if (i < num_preserved_coeffs) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) { + const Index row = 2 * (i / input_col_blocks); + + if (row + 1 < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = 2 * (col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x); + + half2 reduced_val1 = reducer.template initializePacket<half2>(); + half2 reduced_val2 = reducer.template initializePacket<half2>(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1) * 2; + if (last_col >= num_coeffs_to_reduce) { + Index col = col_begin + hipBlockDim_x * j; + for (; col + 1 < num_coeffs_to_reduce; col += hipBlockDim_x) { + const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col); + reducer.reducePacket(val1, &reduced_val1); + const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col); + reducer.reducePacket(val2, &reduced_val2); + } + if (col < num_coeffs_to_reduce) { + // Peel; + const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + const half2 val1 = __halves2half2(last1, reducer.initialize()); + reducer.reducePacket(val1, &reduced_val1); + const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col); + const half2 val2 = __halves2half2(last2, reducer.initialize()); + reducer.reducePacket(val2, &reduced_val2); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + hipBlockDim_x * (j + k) * 2; + reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1); + reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2); + } + } + } + +#pragma unroll + for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { int i; half2 h; } wka_in, wka_out; + + wka_in.h = reduced_val1; + wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE); + reducer.reducePacket(wka_out.h, &reduced_val1); + + wka_in.h = reduced_val2; + wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE); + reducer.reducePacket(wka_out.h, &reduced_val2); + } + + half val1 = __low2half(reduced_val1); + reducer.reduce(__high2half(reduced_val1), &val1); + half val2 = __low2half(reduced_val2); + reducer.reduce(__high2half(reduced_val2), &val2); + half2 val = __halves2half2(val1, val2); + + if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) { + half* loc = output + row; + atomicReduce((half2*)loc, val, reducer); + } + } + } +} + +#endif + +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> +struct InnerReductionLauncher { + static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); + return true; + } +}; + +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumHipMultiProcessors() * + device.maxHipThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + const int dyn_blocks = divup<int>(num_preserved_vals, 1024); + const int max_blocks = device.getNumHipMultiProcessors() * + device.maxHipThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<OutputType, Index>), + dim3(num_blocks), dim3(1024), 0, device.stream(), + reducer.initialize(), num_preserved_vals, output); + } + + hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernel<num_per_thread, Self, Op, Index>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, + num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#if defined(EIGEN_HAS_HIP_FP16) +template <typename Self, typename Op> +struct InnerReductionLauncher<Self, Op, Eigen::half, false> { + static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { + assert(false && "Should not be called since there is no packet accessor"); + return true; + } +}; + +template <typename Self, typename Op> +struct InnerReductionLauncher<Self, Op, Eigen::half, true> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + if (num_preserved_vals % 2 != 0) { + // Not supported yet, revert to the slower code path + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = /*256*/128; + const int num_per_thread = /*128*/64; + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumHipMultiProcessors() * + device.maxHipThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + const int dyn_blocks = divup<int>(num_preserved_vals, 1024); + const int max_blocks = device.getNumHipMultiProcessors() * + device.maxHipThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernelHalfFloat<Self, Op, Index>), + dim3(1), dim3(1), 0, device.stream(), reducer, self, num_preserved_vals, output); + } + + hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; +#endif + + +template <typename Self, typename Op> +struct InnerReducer<Self, Op, GpuDevice> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats and half floats. +#if defined(EIGEN_HAS_HIP_FP16) + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); +#else + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); +#endif + + template <typename OutputType> + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return true; + } + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 128) { + return true; + } + + return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + } +}; + +template <int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + const Index num_threads = hipBlockDim_x * hipGridDim_x; + const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (hipGridDim_x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + // Do the reduction. + const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread); + for (Index i = thread_id; i < max_iter; i += num_threads) { + const Index input_col = i % num_preserved_coeffs; + const Index input_row = (i / num_preserved_coeffs) * NumPerThread; + typename Self::CoeffReturnType reduced_val = reducer.initialize(); + const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); + for (Index j = input_row; j < max_row; j++) { + typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col); + reducer.reduce(val, &reduced_val); + } + atomicReduce(&(output[input_col]), reduced_val, reducer); + } +} + + +template <typename Self, typename Op> +struct OuterReducer<Self, Op, GpuDevice> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); + template <typename Device, typename OutputType> + static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce doubles or floats on a gpu device"); + return true; + } + + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 32) { + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 16; + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumHipMultiProcessors() * + device.maxHipThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs in the reduction kernel itself when we don't have to worry + // about race conditions between multiple thread blocks. + const int dyn_blocks = divup<int>(num_preserved_vals, 1024); + const int max_blocks = device.getNumHipMultiProcessors() * + device.maxHipThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<float, Index>), + dim3(num_blocks), dim3(1024), 0, device.stream(), + reducer.initialize(), num_preserved_vals, output); + } + + hipLaunchKernelGGL(HIP_KERNEL_NAME(OuterReductionKernel<num_per_thread, Self, Op, Index>), + dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#endif + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 1f545ef1a..174a6a064 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -242,7 +242,7 @@ struct ScanLauncher { } }; -#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) // GPU implementation of scan // TODO(ibab) This placeholder implementation performs multiple scans in @@ -278,10 +278,15 @@ struct ScanLauncher<Self, Reducer, GpuDevice> { Index total_size = internal::array_prod(self.dimensions()); Index num_blocks = (total_size / self.size() + 63) / 64; Index block_size = 64; +#if defined(EIGEN_HIPCC) + hipLaunchKernelGGL(HIP_KERNEL_NAME(ScanKernel<Self, Reducer>), dim3(num_blocks), + dim3(block_size), 0, self.device().stream(), self, total_size, data); +#else LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data); +#endif } }; -#endif // EIGEN_USE_GPU && EIGEN_CUDACC +#endif // EIGEN_USE_GPU && (EIGEN_CUDACC || EIGEN_HIPCC) } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h index 49d315a66..bb584e3f9 100644 --- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -268,6 +268,9 @@ template< typename Reducer > struct reduce<Reducer> { + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif constexpr static inline int run() { return Reducer::Identity; } }; @@ -276,6 +279,9 @@ template< typename A > struct reduce<Reducer, A> { + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif constexpr static inline A run(A a) { return a; } }; @@ -285,6 +291,9 @@ template< typename... Ts > struct reduce<Reducer, A, Ts...> { + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) { return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...)); } @@ -324,6 +333,9 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 // does... template<typename... Ts> +#if defined(EIGEN_HIPCC) +EIGEN_DEVICE_FUNC +#endif constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts) { return reduce<product_op, Ts...>::run(ts...); diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 96b3a8261..18b76350b 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -15,7 +15,7 @@ // The array class is only available starting with cxx11. Emulate our own here // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! // Moreover, CUDA doesn't support the STL containers, so we use our own instead. -#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY) +#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) || defined(EIGEN_AVOID_STL_ARRAY) namespace Eigen { template <typename T, size_t n> class array { diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 6c7ac3f3b..5ab67dc1f 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -190,7 +190,7 @@ template <> struct lgamma_impl<float> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) { -#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) +#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE) int dummy; return ::lgammaf_r(x, &dummy); #else @@ -203,7 +203,7 @@ template <> struct lgamma_impl<double> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) { -#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) +#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE) int dummy; return ::lgamma_r(x, &dummy); #else diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 42b790d55..c3dba5f2a 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -297,3 +297,55 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() + +# Add HIP specific tests +if (EIGEN_TEST_HIP) + + set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") + + if (EXISTS ${HIP_PATH}) + + list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) + + find_package(HIP REQUIRED) + if (HIP_FOUND) + + execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) + + if (${HIP_PLATFORM} STREQUAL "hcc") + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${HIP_PATH}/include) + + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + + # ei_add_test(cxx11_tensor_complex_hip) + # ei_add_test(cxx11_tensor_complex_cwise_ops_hip) + ei_add_test(cxx11_tensor_reduction_hip) + ei_add_test(cxx11_tensor_argmax_hip) + ei_add_test(cxx11_tensor_cast_float16_hip) + ei_add_test(cxx11_tensor_scan_hip) + ei_add_test(cxx11_tensor_device_hip) + ei_add_test(cxx11_tensor_hip) + ei_add_test(cxx11_tensor_contract_hip) + ei_add_test(cxx11_tensor_of_float16_hip) + ei_add_test(cxx11_tensor_random_hip) + + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) + + elseif (${HIP_PLATFORM} STREQUAL "nvcc") + message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") + else () + message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") + endif() + + endif(HIP_FOUND) + + else () + + message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") + + endif() + +endif(EIGEN_TEST_HIP) + diff --git a/unsupported/test/cxx11_tensor_argmax_hip.cu b/unsupported/test/cxx11_tensor_argmax_hip.cu new file mode 100644 index 000000000..57d6ca000 --- /dev/null +++ b/unsupported/test/cxx11_tensor_argmax_hip.cu @@ -0,0 +1,251 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_FUNC cxx11_tensor_hip +#define EIGEN_USE_GPU + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <int Layout> +void test_hip_simple_argmax() +{ + Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97)); + Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1)); + Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1)); + in.setRandom(); + in *= in.constant(100.0); + in(0, 0, 0) = -1000.0; + in(71, 52, 96) = 1000.0; + + std::size_t in_bytes = in.size() * sizeof(double); + std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); + + double* d_in; + DenseIndex* d_out_max; + DenseIndex* d_out_min; + hipMalloc((void**)(&d_in), in_bytes); + hipMalloc((void**)(&d_out_max), out_bytes); + hipMalloc((void**)(&d_out_min), out_bytes); + + hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1)); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1)); + + gpu_out_max.device(gpu_device) = gpu_in.argmax(); + gpu_out_min.device(gpu_device) = gpu_in.argmin(); + + assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); + VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); + + hipFree(d_in); + hipFree(d_out_max); + hipFree(d_out_min); +} + +template <int DataLayout> +void test_hip_argmax_dim() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + std::vector<int> dims; + dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7); + + for (int dim = 0; dim < 4; ++dim) { + tensor.setRandom(); + tensor = (tensor + tensor.constant(0.5)).log(); + + array<DenseIndex, 3> out_shape; + for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; + + Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape); + + array<DenseIndex, 4> ix; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + if (ix[dim] != 0) continue; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix) = 10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(float); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + float* d_in; + DenseIndex* d_out; + hipMalloc((void**)(&d_in), in_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape); + + gpu_out.device(gpu_device) = gpu_in.argmax(dim); + + assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + VERIFY_IS_EQUAL(tensor_arg.size(), + size_t(2*3*5*7 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + if (ix[dim] != tensor.dimension(dim) - 1) continue; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix) = 20.0; + } + } + } + } + + hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice); + + gpu_out.device(gpu_device) = gpu_in.argmax(dim); + + assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + + hipFree(d_in); + hipFree(d_out); + } +} + +template <int DataLayout> +void test_hip_argmin_dim() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + std::vector<int> dims; + dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7); + + for (int dim = 0; dim < 4; ++dim) { + tensor.setRandom(); + tensor = (tensor + tensor.constant(0.5)).log(); + + array<DenseIndex, 3> out_shape; + for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; + + Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape); + + array<DenseIndex, 4> ix; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + if (ix[dim] != 0) continue; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix) = -10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(float); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + float* d_in; + DenseIndex* d_out; + hipMalloc((void**)(&d_in), in_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape); + + gpu_out.device(gpu_device) = gpu_in.argmin(dim); + + assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + VERIFY_IS_EQUAL(tensor_arg.size(), + 2*3*5*7 / tensor.dimension(dim)); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect min to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + if (ix[dim] != tensor.dimension(dim) - 1) continue; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix) = -20.0; + } + } + } + } + + hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice); + + gpu_out.device(gpu_device) = gpu_in.argmin(dim); + + assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + + hipFree(d_in); + hipFree(d_out); + } +} + +void test_cxx11_tensor_hip() +{ + CALL_SUBTEST(test_hip_simple_argmax<RowMajor>()); + CALL_SUBTEST(test_hip_simple_argmax<ColMajor>()); + CALL_SUBTEST(test_hip_argmax_dim<RowMajor>()); + CALL_SUBTEST(test_hip_argmax_dim<ColMajor>()); + CALL_SUBTEST(test_hip_argmin_dim<RowMajor>()); + CALL_SUBTEST(test_hip_argmin_dim<ColMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_cast_float16_hip.cu b/unsupported/test/cxx11_tensor_cast_float16_hip.cu new file mode 100644 index 000000000..bf6a49df4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_cast_float16_hip.cu @@ -0,0 +1,79 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +void test_hip_conversion() { + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + Tensor<float, 1> floats(num_elem); + floats.setRandom(); + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float)); + + gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>(); + gpu_conv.device(gpu_device) = gpu_half.cast<float>(); + + Tensor<float, 1> initial(num_elem); + Tensor<float, 1> final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + + +void test_fallback_conversion() { + int num_elem = 101; + Tensor<float, 1> floats(num_elem); + floats.setRandom(); + + Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>(); + Eigen::Tensor<float, 1> conv = halfs.cast<float>(); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(floats(i), conv(i)); + } +} + + +void test_cxx11_tensor_cast_float16_hip() +{ + CALL_SUBTEST(test_hip_conversion()); + CALL_SUBTEST(test_fallback_conversion()); +} diff --git a/unsupported/test/cxx11_tensor_contract_hip.cu b/unsupported/test/cxx11_tensor_contract_hip.cu new file mode 100644 index 000000000..652af0ab0 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contract_hip.cu @@ -0,0 +1,215 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_hip +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +void test_hip_contraction(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor<float, 2, DataLayout> t_left(m_size, k_size); + Tensor<float, 2, DataLayout> t_right(k_size, n_size); + Tensor<float, 2, DataLayout> t_result(m_size, n_size); + Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size); + Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + hipMalloc((void**)(&d_t_left), t_left_bytes); + hipMalloc((void**)(&d_t_right), t_right_bytes); + hipMalloc((void**)(&d_t_result), t_result_bytes); + + hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size)); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size)); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size)); + + + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + t_result = t_left.contract(t_right, dims); + + hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost); + for (DenseIndex i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); + } + + hipFree((void*)d_t_left); + hipFree((void*)d_t_right); + hipFree((void*)d_t_result); +} + + +template<int DataLayout> +void test_scalar(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor<float, 2, DataLayout> t_left(m_size, k_size); + Tensor<float, 2, DataLayout> t_right(k_size, n_size); + Tensor<float, 0, DataLayout> t_result; + Tensor<float, 0, DataLayout> t_result_gpu; + Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + hipMalloc((void**)(&d_t_left), t_left_bytes); + hipMalloc((void**)(&d_t_right), t_right_bytes); + hipMalloc((void**)(&d_t_result), t_result_bytes); + + hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_left(d_t_left, m_size, k_size); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_right(d_t_right, k_size, n_size); + Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > + gpu_t_result(d_t_result); + + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + t_result = t_left.contract(t_right, dims); + + hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost); + if (fabs(t_result() - t_result_gpu()) > 1e-4f && + !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { + std::cout << "mismatch detected: " << t_result() + << " vs " << t_result_gpu() << std::endl; + assert(false); + } + + hipFree((void*)d_t_left); + hipFree((void*)d_t_right); + hipFree((void*)d_t_result); +} + + +template<int DataLayout> +void test_hip_contraction_m() { + for (int k = 32; k < 256; k++) { + test_hip_contraction<ColMajor>(k, 128, 128); + test_hip_contraction<RowMajor>(k, 128, 128); + } +} + +template<int DataLayout> +void test_hip_contraction_k() { + for (int k = 32; k < 256; k++) { + test_hip_contraction<ColMajor>(128, k, 128); + test_hip_contraction<RowMajor>(128, k, 128); + } +} + +template<int DataLayout> +void test_hip_contraction_n() { + for (int k = 32; k < 256; k++) { + test_hip_contraction<ColMajor>(128, 128, k); + test_hip_contraction<RowMajor>(128, 128, k); + } +} + + +template<int DataLayout> +void test_hip_contraction_sizes() { + int m_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257 , 511, + 512, 513, 1023, 1024, 1025}; + + int n_sizes[] = { 31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025}; + + int k_sizes[] = { 31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; + + for (int i = 0; i < 15; i++) { + for (int j = 0; j < 15; j++) { + for (int k = 0; k < 17; k++) { + test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); + } + } + } +} + +void test_cxx11_tensor_hip() +{ + CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128)); + + CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128)); + CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128)); + + CALL_SUBTEST(test_hip_contraction_m<ColMajor>()); + CALL_SUBTEST(test_hip_contraction_m<RowMajor>()); + + CALL_SUBTEST(test_hip_contraction_k<ColMajor>()); + CALL_SUBTEST(test_hip_contraction_k<RowMajor>()); + + CALL_SUBTEST(test_hip_contraction_n<ColMajor>()); + CALL_SUBTEST(test_hip_contraction_n<RowMajor>()); + + // Commenting out these tests due to long runtimes + // CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>()); + // CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_device_hip.cu b/unsupported/test/cxx11_tensor_device_hip.cu new file mode 100644 index 000000000..b98c481ff --- /dev/null +++ b/unsupported/test/cxx11_tensor_device_hip.cu @@ -0,0 +1,389 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_device +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +// Context for evaluation on cpu +struct CPUContext { + CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { + kernel_1d_(0) = 3.14f; + kernel_1d_(1) = 2.7f; + + kernel_2d_(0,0) = 3.14f; + kernel_2d_(1,0) = 2.7f; + kernel_2d_(0,1) = 0.2f; + kernel_2d_(1,1) = 7.0f; + + kernel_3d_(0,0,0) = 3.14f; + kernel_3d_(0,1,0) = 2.7f; + kernel_3d_(0,0,1) = 0.2f; + kernel_3d_(0,1,1) = 7.0f; + kernel_3d_(1,0,0) = -1.0f; + kernel_3d_(1,1,0) = -0.3f; + kernel_3d_(1,0,1) = -0.7f; + kernel_3d_(1,1,1) = -0.5f; + } + + const Eigen::DefaultDevice& device() const { return cpu_device_; } + + const Eigen::Tensor<float, 3>& in1() const { return in1_; } + const Eigen::Tensor<float, 3>& in2() const { return in2_; } + Eigen::Tensor<float, 3>& out() { return out_; } + const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; } + const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; } + const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; } + + private: + const Eigen::Tensor<float, 3>& in1_; + const Eigen::Tensor<float, 3>& in2_; + Eigen::Tensor<float, 3>& out_; + + Eigen::Tensor<float, 1> kernel_1d_; + Eigen::Tensor<float, 2> kernel_2d_; + Eigen::Tensor<float, 3> kernel_3d_; + + Eigen::DefaultDevice cpu_device_; +}; + + +// Context for evaluation on GPU +struct GPUContext { + GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { + assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess); + float kernel_1d_val[] = {3.14f, 2.7f}; + assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess); + + assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess); + float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; + assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess); + + assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess); + float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; + assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess); + } + ~GPUContext() { + assert(hipFree(kernel_1d_) == hipSuccess); + assert(hipFree(kernel_2d_) == hipSuccess); + assert(hipFree(kernel_3d_) == hipSuccess); + } + + const Eigen::GpuDevice& device() const { return gpu_device_; } + + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; } + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; } + Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; } + Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); } + Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); } + Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); } + + private: + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_; + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_; + Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_; + + float* kernel_1d_; + float* kernel_2d_; + float* kernel_3d_; + + Eigen::HipStreamDevice stream_; + Eigen::GpuDevice gpu_device_; +}; + + +// The actual expression to evaluate +template <typename Context> +void test_contextual_eval(Context* context) +{ + context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); +} + +template <typename Context> +void test_forced_contextual_eval(Context* context) +{ + context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); +} + +template <typename Context> +void test_compound_assignment(Context* context) +{ + context->out().device(context->device()) = context->in1().constant(2.718f); + context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; +} + + +template <typename Context> +void test_contraction(Context* context) +{ + Eigen::array<std::pair<int, int>, 2> dims; + dims[0] = std::make_pair(1, 1); + dims[1] = std::make_pair(2, 2); + + Eigen::array<int, 2> shape(40, 50*70); + + Eigen::DSizes<int, 2> indices(0,0); + Eigen::DSizes<int, 2> sizes(40,40); + + context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); +} + + +template <typename Context> +void test_1d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(40,49,70); + + Eigen::array<int, 1> dims(1); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); +} + +template <typename Context> +void test_2d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(40,49,69); + + Eigen::array<int, 2> dims(1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); +} + +template <typename Context> +void test_3d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(39,49,69); + + Eigen::array<int, 3> dims(0,1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); +} + + +void test_cpu() { + Eigen::Tensor<float, 3> in1(40,50,70); + Eigen::Tensor<float, 3> in2(40,50,70); + Eigen::Tensor<float, 3> out(40,50,70); + + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); + + CPUContext context(in1, in2, out); + test_contextual_eval(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_forced_contextual_eval(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); + } + } + } + + test_compound_assignment(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_contraction(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(i,j,0); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(i, k, l) * in2(j, k, l); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + + (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); + if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + + (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); + if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } +} + +void test_gpu() { + Eigen::Tensor<float, 3> in1(40,50,70); + Eigen::Tensor<float, 3> in2(40,50,70); + Eigen::Tensor<float, 3> out(40,50,70); + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + hipMalloc((void**)(&d_in1), in1_bytes); + hipMalloc((void**)(&d_in2), in2_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice); + + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70); + + GPUContext context(gpu_in1, gpu_in2, gpu_out); + test_contextual_eval(&context); + assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_forced_contextual_eval(&context); + assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); + } + } + } + + test_compound_assignment(&context); + assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_contraction(&context); + assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(i,j,0); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(i, k, l) * in2(j, k, l); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess); + assert(hipStreamSynchronize(context.device().stream()) == hipSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess); + assert(hipStreamSynchronize(context.device().stream()) == hipSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); + VERIFY_IS_APPROX(expected, result); + } + } + } + + /* + test_3d_convolution(&context); + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess); + assert(hipStreamSynchronize(context.device().stream()) == hipSuccess); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f + + in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); + VERIFY_IS_APPROX(expected, result); + } + } + } + */ +} + + +void test_cxx11_tensor_device() +{ + CALL_SUBTEST(test_cpu()); + CALL_SUBTEST(test_gpu()); +} diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu new file mode 100644 index 000000000..f1794aa87 --- /dev/null +++ b/unsupported/test/cxx11_tensor_hip.cu @@ -0,0 +1,1296 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_hip +#define EIGEN_USE_GPU + +#ifdef __NVCC__ +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif +#endif +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +void test_hip_nullary() { + Tensor<float, 1, 0, int> in1(2); + Tensor<float, 1, 0, int> in2(2); + in1.setRandom(); + in2.setRandom(); + + std::size_t tensor_bytes = in1.size() * sizeof(float); + + float* d_in1; + float* d_in2; + hipMalloc((void**)(&d_in1), tensor_bytes); + hipMalloc((void**)(&d_in2), tensor_bytes); + hipMemcpy(d_in1, in1.data(), tensor_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in2, in2.data(), tensor_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1( + d_in1, 2); + Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2( + d_in2, 2); + + gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f); + gpu_in2.device(gpu_device) = gpu_in2.random(); + + Tensor<float, 1, 0, int> new1(2); + Tensor<float, 1, 0, int> new2(2); + + assert(hipMemcpyAsync(new1.data(), d_in1, tensor_bytes, hipMemcpyDeviceToHost, + gpu_device.stream()) == hipSuccess); + assert(hipMemcpyAsync(new2.data(), d_in2, tensor_bytes, hipMemcpyDeviceToHost, + gpu_device.stream()) == hipSuccess); + + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX(new1(i), 3.14f); + VERIFY_IS_NOT_EQUAL(new2(i), in2(i)); + } + + hipFree(d_in1); + hipFree(d_in2); +} + +void test_hip_elementwise_small() { + Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>{2}); + Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>{2}); + Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>{2}); + in1.setRandom(); + in2.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + hipMalloc((void**)(&d_in1), in1_bytes); + hipMalloc((void**)(&d_in2), in2_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( + d_in1, Eigen::array<Eigen::DenseIndex, 1>{2}); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2( + d_in2, Eigen::array<Eigen::DenseIndex, 1>{2}); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out( + d_out, Eigen::array<Eigen::DenseIndex, 1>{2}); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, + gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX( + out(Eigen::array<Eigen::DenseIndex, 1>{i}), + in1(Eigen::array<Eigen::DenseIndex, 1>{i}) + in2(Eigen::array<Eigen::DenseIndex, 1>{i})); + } + + hipFree(d_in1); + hipFree(d_in2); + hipFree(d_out); +} + +void test_hip_elementwise() +{ + Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); + Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); + Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); + Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); + in1.setRandom(); + in2.setRandom(); + in3.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t in3_bytes = in3.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_in3; + float* d_out; + hipMalloc((void**)(&d_in1), in1_bytes); + hipMalloc((void**)(&d_in2), in2_bytes); + hipMalloc((void**)(&d_in3), in3_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in3, in3.data(), in3_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>{72,53,97}); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}), in1(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) + in2(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) * in3(Eigen::array<Eigen::DenseIndex, 3>{i,j,k})); + } + } + } + + hipFree(d_in1); + hipFree(d_in2); + hipFree(d_in3); + hipFree(d_out); +} + +void test_hip_props() { + Tensor<float, 1> in1(200); + Tensor<bool, 1> out(200); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(bool); + + float* d_in1; + bool* d_out; + hipMalloc((void**)(&d_in1), in1_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( + d_in1, 200); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out( + d_out, 200); + + gpu_out.device(gpu_device) = (gpu_in1.isnan)(); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, + gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 200; ++i) { + VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); + } + + hipFree(d_in1); + hipFree(d_out); +} + +void test_hip_reduction() +{ + Tensor<float, 4> in1(72,53,97,113); + Tensor<float, 2> out(72,97); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_out; + hipMalloc((void**)(&d_in1), in1_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); + + array<Eigen::DenseIndex, 2> reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + float expected = 0; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 113; ++l) { + expected = + std::max<float>(expected, in1(i, k, j, l)); + } + } + VERIFY_IS_APPROX(out(i,j), expected); + } + } + + hipFree(d_in1); + hipFree(d_out); +} + +template<int DataLayout> +void test_hip_contraction() +{ + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31); + Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>{3, 31, 7, 20, 1}); + Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>{6, 50, 7, 20, 1}); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + hipMalloc((void**)(&d_t_left), t_left_bytes); + hipMalloc((void**)(&d_t_right), t_right_bytes); + hipMalloc((void**)(&d_t_result), t_result_bytes); + + hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); + + typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf; + MapXf m_left(t_left.data(), 300, 93); + MapXf m_right(t_right.data(), 93, 140); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140); + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 2> dims; + dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); + + m_result = m_left * m_right; + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + + hipMemcpy(t_result.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost); + + for (DenseIndex i = 0; i < t_result.size(); i++) { + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + hipFree(d_t_left); + hipFree(d_t_right); + hipFree(d_t_result); +} + + +template<int DataLayout> +void test_hip_convolution_1d() +{ + Tensor<float, 4, DataLayout> input(74,37,11,137); + Tensor<float, 1, DataLayout> kernel(4); + Tensor<float, 4, DataLayout> out(74,34,11,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + hipMalloc((void**)(&d_input), input_bytes); + hipMalloc((void**)(&d_kernel), kernel_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); + Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137); + + Eigen::array<Eigen::DenseIndex, 1> dims{1}; + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 34; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + + input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + hipFree(d_input); + hipFree(d_kernel); + hipFree(d_out); +} + +void test_hip_convolution_inner_dim_col_major_1d() +{ + Tensor<float, 4, ColMajor> input(74,9,11,7); + Tensor<float, 1, ColMajor> kernel(4); + Tensor<float, 4, ColMajor> out(71,9,11,7); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + hipMalloc((void**)(&d_input), input_bytes); + hipMalloc((void**)(&d_kernel), kernel_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); + Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4); + Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7); + + Eigen::array<Eigen::DenseIndex, 1> dims{0}; + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 71; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 7; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + + input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + hipFree(d_input); + hipFree(d_kernel); + hipFree(d_out); +} + +void test_hip_convolution_inner_dim_row_major_1d() +{ + Tensor<float, 4, RowMajor> input(7,9,11,74); + Tensor<float, 1, RowMajor> kernel(4); + Tensor<float, 4, RowMajor> out(7,9,11,71); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + hipMalloc((void**)(&d_input), input_bytes); + hipMalloc((void**)(&d_kernel), kernel_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); + Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4); + Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71); + + Eigen::array<Eigen::DenseIndex, 1> dims{3}; + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 71; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + + input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + hipFree(d_input); + hipFree(d_kernel); + hipFree(d_out); +} + +template<int DataLayout> +void test_hip_convolution_2d() +{ + Tensor<float, 4, DataLayout> input(74,37,11,137); + Tensor<float, 2, DataLayout> kernel(3,4); + Tensor<float, 4, DataLayout> out(74,35,8,137); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + hipMalloc((void**)(&d_input), input_bytes); + hipMalloc((void**)(&d_kernel), kernel_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4); + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137); + + Eigen::array<Eigen::DenseIndex, 2> dims{1,2}; + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+2,k+0,l) * kernel(2,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1) + + input(i,j+2,k+1,l) * kernel(2,1) + + input(i,j+0,k+2,l) * kernel(0,2) + + input(i,j+1,k+2,l) * kernel(1,2) + + input(i,j+2,k+2,l) * kernel(2,2) + + input(i,j+0,k+3,l) * kernel(0,3) + + input(i,j+1,k+3,l) * kernel(1,3) + + input(i,j+2,k+3,l) * kernel(2,3); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + hipFree(d_input); + hipFree(d_kernel); + hipFree(d_out); +} + +template<int DataLayout> +void test_hip_convolution_3d() +{ + Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>{74,37,11,137,17}); + Tensor<float, 3, DataLayout> kernel(3,4,2); + Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>{74,35,8,136,17}); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + hipMalloc((void**)(&d_input), input_bytes); + hipMalloc((void**)(&d_kernel), kernel_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17); + + Eigen::array<Eigen::DenseIndex, 3> dims{1,2,3}; + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 17; ++m) { + const float result = out(i,j,k,l,m); + const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + + input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + + input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + + input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + + input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + + input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + + input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + + input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + + input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + + input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + + input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + + input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + + input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + + input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + + input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + + input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + + input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + + input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + + input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + + input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + + input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + + input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + + input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + + input(i,j+2,k+3,l+1,m) * kernel(2,3,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + } + + hipFree(d_input); + hipFree(d_kernel); + hipFree(d_out); +} + + +template <typename Scalar> +void test_hip_lgamma(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + hipMalloc((void**)(&d_in), bytes); + hipMalloc((void**)(&d_out), bytes); + + hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.lgamma(); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); + } + } + + hipFree(d_in); + hipFree(d_out); +} + +template <typename Scalar> +void test_hip_digamma() +{ + Tensor<Scalar, 1> in(7); + Tensor<Scalar, 1> out(7); + Tensor<Scalar, 1> expected_out(7); + out.setZero(); + + in(0) = Scalar(1); + in(1) = Scalar(1.5); + in(2) = Scalar(4); + in(3) = Scalar(-10.5); + in(4) = Scalar(10000.5); + in(5) = Scalar(0); + in(6) = Scalar(-1); + + expected_out(0) = Scalar(-0.5772156649015329); + expected_out(1) = Scalar(0.03648997397857645); + expected_out(2) = Scalar(1.2561176684318); + expected_out(3) = Scalar(2.398239129535781); + expected_out(4) = Scalar(9.210340372392849); + expected_out(5) = std::numeric_limits<Scalar>::infinity(); + expected_out(6) = std::numeric_limits<Scalar>::infinity(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + hipMalloc((void**)(&d_in), bytes); + hipMalloc((void**)(&d_out), bytes); + + hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in.digamma(); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 5; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + for (int i = 5; i < 7; ++i) { + VERIFY_IS_EQUAL(out(i), expected_out(i)); + } + + hipFree(d_in); + hipFree(d_out); +} + +template <typename Scalar> +void test_hip_zeta() +{ + Tensor<Scalar, 1> in_x(6); + Tensor<Scalar, 1> in_q(6); + Tensor<Scalar, 1> out(6); + Tensor<Scalar, 1> expected_out(6); + out.setZero(); + + in_x(0) = Scalar(1); + in_x(1) = Scalar(1.5); + in_x(2) = Scalar(4); + in_x(3) = Scalar(-10.5); + in_x(4) = Scalar(10000.5); + in_x(5) = Scalar(3); + + in_q(0) = Scalar(1.2345); + in_q(1) = Scalar(2); + in_q(2) = Scalar(1.5); + in_q(3) = Scalar(3); + in_q(4) = Scalar(1.0001); + in_q(5) = Scalar(-2.5); + + expected_out(0) = std::numeric_limits<Scalar>::infinity(); + expected_out(1) = Scalar(1.61237534869); + expected_out(2) = Scalar(0.234848505667); + expected_out(3) = Scalar(1.03086757337e-5); + expected_out(4) = Scalar(0.367879440865); + expected_out(5) = Scalar(0.054102025820864097); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_q; + Scalar* d_out; + hipMalloc((void**)(&d_in_x), bytes); + hipMalloc((void**)(&d_in_q), bytes); + hipMalloc((void**)(&d_out), bytes); + + hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in_q, in_q.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6); + + gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + VERIFY_IS_EQUAL(out(0), expected_out(0)); + VERIFY((std::isnan)(out(3))); + + for (int i = 1; i < 6; ++i) { + if (i != 3) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } + + hipFree(d_in_x); + hipFree(d_in_q); + hipFree(d_out); +} + +template <typename Scalar> +void test_hip_polygamma() +{ + Tensor<Scalar, 1> in_x(7); + Tensor<Scalar, 1> in_n(7); + Tensor<Scalar, 1> out(7); + Tensor<Scalar, 1> expected_out(7); + out.setZero(); + + in_n(0) = Scalar(1); + in_n(1) = Scalar(1); + in_n(2) = Scalar(1); + in_n(3) = Scalar(17); + in_n(4) = Scalar(31); + in_n(5) = Scalar(28); + in_n(6) = Scalar(8); + + in_x(0) = Scalar(2); + in_x(1) = Scalar(3); + in_x(2) = Scalar(25.5); + in_x(3) = Scalar(4.7); + in_x(4) = Scalar(11.8); + in_x(5) = Scalar(17.7); + in_x(6) = Scalar(30.2); + + expected_out(0) = Scalar(0.644934066848); + expected_out(1) = Scalar(0.394934066848); + expected_out(2) = Scalar(0.0399946696496); + expected_out(3) = Scalar(293.334565435); + expected_out(4) = Scalar(0.445487887616); + expected_out(5) = Scalar(-2.47810300902e-07); + expected_out(6) = Scalar(-8.29668781082e-09); + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_n; + Scalar* d_out; + hipMalloc((void**)(&d_in_x), bytes); + hipMalloc((void**)(&d_in_n), bytes); + hipMalloc((void**)(&d_out), bytes); + + hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in_n, in_n.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); + + gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 7; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + hipFree(d_in_x); + hipFree(d_in_n); + hipFree(d_out); +} + +template <typename Scalar> +void test_hip_igamma() +{ + Tensor<Scalar, 2> a(6, 6); + Tensor<Scalar, 2> x(6, 6); + Tensor<Scalar, 2> out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + + + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + assert(hipMalloc((void**)(&d_a), bytes) == hipSuccess); + assert(hipMalloc((void**)(&d_x), bytes) == hipSuccess); + assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess); + + hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice); + hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); + } + } + } + + hipFree(d_a); + hipFree(d_x); + hipFree(d_out); +} + +template <typename Scalar> +void test_hip_igammac() +{ + Tensor<Scalar, 2> a(6, 6); + Tensor<Scalar, 2> x(6, 6); + Tensor<Scalar, 2> out(6, 6); + out.setZero(); + + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + a(i, j) = a_s[i]; + x(i, j) = x_s[j]; + } + } + + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + + std::size_t bytes = a.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + hipMalloc((void**)(&d_a), bytes); + hipMalloc((void**)(&d_x), bytes); + hipMalloc((void**)(&d_out), bytes); + + hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice); + hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); + + gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(out(i, j))); + } else { + VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); + } + } + } + + hipFree(d_a); + hipFree(d_x); + hipFree(d_out); +} + +template <typename Scalar> +void test_hip_erf(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + assert(hipMalloc((void**)(&d_in), bytes) == hipSuccess); + assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess); + + hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erf(); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); + } + } + + hipFree(d_in); + hipFree(d_out); +} + +template <typename Scalar> +void test_hip_erfc(const Scalar stddev) +{ + Tensor<Scalar, 2> in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor<Scalar, 2> out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + hipMalloc((void**)(&d_in), bytes); + hipMalloc((void**)(&d_out), bytes); + + hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); + Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erfc(); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); + } + } + + hipFree(d_in); + hipFree(d_out); +} + +template <typename Scalar> +void test_hip_betainc() +{ + Tensor<Scalar, 1> in_x(125); + Tensor<Scalar, 1> in_a(125); + Tensor<Scalar, 1> in_b(125); + Tensor<Scalar, 1> out(125); + Tensor<Scalar, 1> expected_out(125); + out.setZero(); + + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + + Array<Scalar, 1, Dynamic> x(125); + Array<Scalar, 1, Dynamic> a(125); + Array<Scalar, 1, Dynamic> b(125); + Array<Scalar, 1, Dynamic> v(125); + + a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999; + + b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999; + + x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, + -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1; + + v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, + 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, + 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, + 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, + nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, + 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, + 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, + 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, + 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, + 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, + nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444, + nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229, + nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, + nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306, + 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302, + 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, + 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196, + 0.5000000000004947, 0.9999999999999999, nan; + + for (int i = 0; i < 125; ++i) { + in_x(i) = x(i); + in_a(i) = a(i); + in_b(i) = b(i); + expected_out(i) = v(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_a; + Scalar* d_in_b; + Scalar* d_out; + hipMalloc((void**)(&d_in_x), bytes); + hipMalloc((void**)(&d_in_a), bytes); + hipMalloc((void**)(&d_in_b), bytes); + hipMalloc((void**)(&d_out), bytes); + + hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in_a, in_a.data(), bytes, hipMemcpyHostToDevice); + hipMemcpy(d_in_b, in_b.data(), bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125); + + gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); + + assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + for (int i = 1; i < 125; ++i) { + if ((std::isnan)(expected_out(i))) { + VERIFY((std::isnan)(out(i))); + } else { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } + + hipFree(d_in_x); + hipFree(d_in_a); + hipFree(d_in_b); + hipFree(d_out); +} + + +void test_cxx11_tensor_hip() +{ + CALL_SUBTEST(test_hip_nullary()); + CALL_SUBTEST(test_hip_elementwise_small()); + CALL_SUBTEST(test_hip_elementwise()); + CALL_SUBTEST(test_hip_props()); + CALL_SUBTEST(test_hip_reduction()); + // FIXME : uncommenting following tests results in compile failure + // CALL_SUBTEST(test_hip_contraction<ColMajor>()); + // CALL_SUBTEST(test_hip_contraction<RowMajor>()); + CALL_SUBTEST(test_hip_convolution_1d<ColMajor>()); + CALL_SUBTEST(test_hip_convolution_1d<RowMajor>()); + CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST(test_hip_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST(test_hip_convolution_2d<ColMajor>()); + CALL_SUBTEST(test_hip_convolution_2d<RowMajor>()); + // The following two tests commented out due to long runtime + // CALL_SUBTEST(test_hip_convolution_3d<ColMajor>()); + // CALL_SUBTEST(test_hip_convolution_3d<RowMajor>()); + +#if __cplusplus > 199711L + // std::erf, std::erfc, and so on where only added in c++11. We use them + // as a golden reference to validate the results produced by Eigen. Therefore + // we can only run these tests if we use a c++11 compiler. + + CALL_SUBTEST(test_hip_lgamma<float>(1.0f)); + CALL_SUBTEST(test_hip_lgamma<float>(100.0f)); + CALL_SUBTEST(test_hip_lgamma<float>(0.01f)); + CALL_SUBTEST(test_hip_lgamma<float>(0.001f)); + + CALL_SUBTEST(test_hip_lgamma<double>(1.0)); + CALL_SUBTEST(test_hip_lgamma<double>(100.0)); + CALL_SUBTEST(test_hip_lgamma<double>(0.01)); + CALL_SUBTEST(test_hip_lgamma<double>(0.001)); + + CALL_SUBTEST(test_hip_erf<float>(1.0f)); + CALL_SUBTEST(test_hip_erf<float>(100.0f)); + CALL_SUBTEST(test_hip_erf<float>(0.01f)); + CALL_SUBTEST(test_hip_erf<float>(0.001f)); + + CALL_SUBTEST(test_hip_erfc<float>(1.0f)); + // CALL_SUBTEST(test_hip_erfc<float>(100.0f)); // HIP erfc lacks precision for large inputs + CALL_SUBTEST(test_hip_erfc<float>(5.0f)); + CALL_SUBTEST(test_hip_erfc<float>(0.01f)); + CALL_SUBTEST(test_hip_erfc<float>(0.001f)); + + CALL_SUBTEST(test_hip_erf<double>(1.0)); + CALL_SUBTEST(test_hip_erf<double>(100.0)); + CALL_SUBTEST(test_hip_erf<double>(0.01)); + CALL_SUBTEST(test_hip_erf<double>(0.001)); + + CALL_SUBTEST(test_hip_erfc<double>(1.0)); + // CALL_SUBTEST(test_hip_erfc<double>(100.0)); // HIP erfc lacks precision for large inputs + CALL_SUBTEST(test_hip_erfc<double>(5.0)); + CALL_SUBTEST(test_hip_erfc<double>(0.01)); + CALL_SUBTEST(test_hip_erfc<double>(0.001)); + + // Following tests have functional failures on some seeds + // CALL_SUBTEST(test_hip_digamma<float>()); + // CALL_SUBTEST(test_hip_digamma<double>()); + + // Following tests have functional failures on some seeds + // CALL_SUBTEST(test_hip_polygamma<float>()); + // CALL_SUBTEST(test_hip_polygamma<double>()); + + // Following tests have functional failures on some seeds + // CALL_SUBTEST(test_hip_zeta<float>()); + // CALL_SUBTEST(test_hip_zeta<double>()); + + CALL_SUBTEST(test_hip_igamma<float>()); + CALL_SUBTEST(test_hip_igammac<float>()); + + CALL_SUBTEST(test_hip_igamma<double>()); + CALL_SUBTEST(test_hip_igammac<double>()); + + CALL_SUBTEST(test_hip_betainc<float>()); + CALL_SUBTEST(test_hip_betainc<double>()); +#endif +} diff --git a/unsupported/test/cxx11_tensor_of_float16_hip.cu b/unsupported/test/cxx11_tensor_of_float16_hip.cu new file mode 100644 index 000000000..6c1a401bf --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_float16_hip.cu @@ -0,0 +1,498 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + + +using Eigen::Tensor; + +template<typename> +void test_hip_numext() { + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); + bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>()); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>()); + + Tensor<bool, 1> half_prec(num_elem); + Tensor<bool, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking numext " << i << std::endl; + VERIFY_IS_EQUAL(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +#ifdef EIGEN_HAS_HIP_FP16 + +template<typename> +void test_hip_conversion() { + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random(); + gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>(); + gpu_conv.device(gpu_device) = gpu_half.cast<float>(); + + Tensor<float, 1> initial(num_elem); + Tensor<float, 1> final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + +template<typename> +void test_hip_unary() { + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>(); + + Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking unary " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + +template<typename> +void test_hip_elementwise() { + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1( + d_float1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2( + d_float2, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; + gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>(); + + Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl; + VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i))); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + +template<typename> +void test_hip_trancendental() { + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem); + + gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); + gpu_float3.device(gpu_device) = gpu_float3.random(); + gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>(); + gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>(); + gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>(); + gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>(); + + gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>(); + gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); + + gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>(); + gpu_res2_half.device(gpu_device) = gpu_res2_half.log(); + + gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); + gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); + + gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); + gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1(); + + Tensor<float, 1> input1(num_elem); + Tensor<Eigen::half, 1> half_prec1(num_elem); + Tensor<Eigen::half, 1> full_prec1(num_elem); + Tensor<float, 1> input2(num_elem); + Tensor<Eigen::half, 1> half_prec2(num_elem); + Tensor<Eigen::half, 1> full_prec2(num_elem); + Tensor<float, 1> input3(num_elem); + Tensor<Eigen::half, 1> half_prec3(num_elem); + Tensor<Eigen::half, 1> full_prec3(num_elem); + gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl; + VERIFY_IS_APPROX(full_prec1(i), half_prec1(i)); + } + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl; + if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1 + VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f)); + else + VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); + } + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl; + VERIFY_IS_APPROX(full_prec3(i), half_prec3(i)); + } + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float3); + gpu_device.deallocate(d_res1_half); + gpu_device.deallocate(d_res1_float); + gpu_device.deallocate(d_res2_half); + gpu_device.deallocate(d_res2_float); + gpu_device.deallocate(d_res3_float); + gpu_device.deallocate(d_res3_half); +} + +template<typename> +void test_hip_contractions() { + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int rows = 23; + int cols = 23; + int num_elem = rows*cols; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( + d_float1, rows, cols); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( + d_float2, rows, cols); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half( + d_res_half, rows, cols); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float( + d_res_float, rows, cols); + + gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f); + + typedef Tensor<float, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims); + + Tensor<Eigen::half, 2> half_prec(rows, cols); + Tensor<Eigen::half, 2> full_prec(rows, cols); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half)); + gpu_device.synchronize(); + + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl; + if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) { + VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + } + } + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + +template<typename> +void test_hip_reductions(int size1, int size2, int redux) { + + std::cout << "Reducing " << size1 << " by " << size2 + << " tensor along dim " << redux << std::endl; + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = size1*size2; + int result_size = (redux == 1 ? size1 : size2); + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); + + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( + d_float1, size1, size2); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( + d_float2, size1, size2); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, result_size); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, result_size); + + gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f; + gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f; + + Eigen::array<int, 1> redux_dim(redux); + gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim); + + Tensor<Eigen::half, 1> half_prec(result_size); + Tensor<Eigen::half, 1> full_prec(result_size); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half)); + gpu_device.synchronize(); + + for (int i = 0; i < result_size; ++i) { + std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + +template<typename> +void test_hip_reductions() { + test_hip_reductions<void>(13, 13, 0); + test_hip_reductions<void>(13, 13, 1); + + test_hip_reductions<void>(35, 36, 0); + test_hip_reductions<void>(35, 36, 1); + + test_hip_reductions<void>(36, 35, 0); + test_hip_reductions<void>(36, 35, 1); +} + +template<typename> +void test_hip_full_reductions() { + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int size = 13; + int num_elem = size*size; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); + + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( + d_float1, size, size); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( + d_float2, size, size); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half( + d_res_half); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float( + d_res_float); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + + gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(); + + Tensor<Eigen::half, 0> half_prec; + Tensor<Eigen::half, 0> full_prec; + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); + gpu_device.synchronize(); + + VERIFY_IS_APPROX(full_prec(), half_prec()); + + gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum(); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); + gpu_device.synchronize(); + + VERIFY_IS_APPROX(full_prec(), half_prec()); + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + +template<typename> +void test_hip_forced_evals() { + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1( + d_res_half1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2( + d_res_half2, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + Eigen::array<int, 1> no_bcast; + no_bcast[0] = 1; + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>(); + gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>(); + + Tensor<float, 1> half_prec1(num_elem); + Tensor<float, 1> half_prec2(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec1(i)); + VERIFY_IS_APPROX(full_prec(i), half_prec2(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half1); + gpu_device.deallocate(d_res_half2); + gpu_device.deallocate(d_res_float); +} +#endif + + +void test_cxx11_tensor_of_float16_hip() +{ + CALL_SUBTEST(test_hip_numext<void>()); + +#ifdef EIGEN_HAS_HIP_FP16 + CALL_SUBTEST(test_hip_conversion<void>()); + CALL_SUBTEST(test_hip_unary<void>()); + CALL_SUBTEST(test_hip_elementwise<void>()); + CALL_SUBTEST(test_hip_trancendental<void>()); + CALL_SUBTEST(test_hip_contractions<void>()); + CALL_SUBTEST(test_hip_reductions<void>()); + CALL_SUBTEST(test_hip_full_reductions<void>()); + CALL_SUBTEST(test_hip_forced_evals<void>()); +#else + std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl; +#endif +} diff --git a/unsupported/test/cxx11_tensor_random_hip.cu b/unsupported/test/cxx11_tensor_random_hip.cu new file mode 100644 index 000000000..7d7e72ca2 --- /dev/null +++ b/unsupported/test/cxx11_tensor_random_hip.cu @@ -0,0 +1,85 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_random_hip +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + +#include "main.h" +#include <Eigen/CXX11/Tensor> + + +void test_hip_random_uniform() +{ + Tensor<float, 2> out(72,97); + out.setZero(); + + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_out; + hipMalloc((void**)(&d_out), out_bytes); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); + + gpu_out.device(gpu_device) = gpu_out.random(); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); + + // For now we just check thes code doesn't crash. + // TODO: come up with a valid test of randomness +} + + +void test_hip_random_normal() +{ + Tensor<float, 2> out(72,97); + out.setZero(); + + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_out; + hipMalloc((void**)(&d_out), out_bytes); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); + + Eigen::internal::NormalRandomGenerator<float> gen(true); + gpu_out.device(gpu_device) = gpu_out.random(gen); + + assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess); + assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess); +} + +static void test_complex() +{ + Tensor<std::complex<float>, 1> vec(6); + vec.setRandom(); + + // Fixme: we should check that the generated numbers follow a uniform + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + + +void test_cxx11_tensor_random_hip() +{ + CALL_SUBTEST(test_hip_random_uniform()); + CALL_SUBTEST(test_hip_random_normal()); + CALL_SUBTEST(test_complex()); +} diff --git a/unsupported/test/cxx11_tensor_reduction_hip.cu b/unsupported/test/cxx11_tensor_reduction_hip.cu new file mode 100644 index 000000000..c5aad05be --- /dev/null +++ b/unsupported/test/cxx11_tensor_reduction_hip.cu @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip +#define EIGEN_USE_GPU + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + + +template<typename Type, int DataLayout> +static void test_full_reductions() { + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + const int num_rows = internal::random<int>(1024, 5*1024); + const int num_cols = internal::random<int>(1024, 5*1024); + + Tensor<Type, 2, DataLayout> in(num_rows, num_cols); + in.setRandom(); + + Tensor<Type, 0, DataLayout> full_redux; + full_redux = in.sum(); + + std::size_t in_bytes = in.size() * sizeof(Type); + std::size_t out_bytes = full_redux.size() * sizeof(Type); + Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes)); + Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes)); + gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); + + TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr); + + out_gpu.device(gpu_device) = in_gpu.sum(); + + Tensor<Type, 0, DataLayout> full_redux_gpu; + gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); + gpu_device.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); +} + +template<typename Type, int DataLayout> +static void test_first_dim_reductions() { + int dim_x = 33; + int dim_y = 1; + int dim_z = 128; + + Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z); + in.setRandom(); + + Eigen::array<int, 1> red_axis; + red_axis[0] = 0; + Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); + + // Create device + Eigen::HipStreamDevice stream; + Eigen::GpuDevice dev(&stream); + + // Create data(T) + Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type)); + Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type)); + Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z); + Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z); + + // Perform operation + dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type)); + gpu_out.device(dev) = gpu_in.sum(red_axis); + gpu_out.device(dev) += gpu_in.sum(red_axis); + Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z); + dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type)); + dev.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + for (int i = 0; i < gpu_out.size(); ++i) { + VERIFY_IS_APPROX(2*redux(i), redux_gpu(i)); + } + + dev.deallocate(in_data); + dev.deallocate(out_data); +} + +template<typename Type, int DataLayout> +static void test_last_dim_reductions() { + int dim_x = 128; + int dim_y = 1; + int dim_z = 33; + + Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z); + in.setRandom(); + + Eigen::array<int, 1> red_axis; + red_axis[0] = 2; + Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); + + // Create device + Eigen::HipStreamDevice stream; + Eigen::GpuDevice dev(&stream); + + // Create data + Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type)); + Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type)); + Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z); + Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y); + + // Perform operation + dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type)); + gpu_out.device(dev) = gpu_in.sum(red_axis); + gpu_out.device(dev) += gpu_in.sum(red_axis); + Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y); + dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type)); + dev.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + for (int i = 0; i < gpu_out.size(); ++i) { + VERIFY_IS_APPROX(2*redux(i), redux_gpu(i)); + } + + dev.deallocate(in_data); + dev.deallocate(out_data); +} + + +void test_cxx11_tensor_reduction_hip() { + CALL_SUBTEST((test_full_reductions<float, ColMajor>())); + CALL_SUBTEST((test_full_reductions<double, ColMajor>())); + CALL_SUBTEST((test_full_reductions<float, RowMajor>())); + CALL_SUBTEST((test_full_reductions<double, RowMajor>())); + + CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>())); + CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>())); + CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>())); +// Outer reductions of doubles aren't supported just yet. +// CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>())) + + CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>())); +// Outer reductions of doubles aren't supported just yet. +// CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>())); + CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>())); + CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>())); +} diff --git a/unsupported/test/cxx11_tensor_scan_hip.cu b/unsupported/test/cxx11_tensor_scan_hip.cu new file mode 100644 index 000000000..f4c4f59b9 --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan_hip.cu @@ -0,0 +1,76 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +void test_hip_cumsum(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); + Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size); + Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size); + + t_input.setRandom(); + + std::size_t t_input_bytes = t_input.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_input; + float* d_t_result; + + hipMalloc((void**)(&d_t_input), t_input_bytes); + hipMalloc((void**)(&d_t_result), t_result_bytes); + + hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice); + + Eigen::HipStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > + gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size)); + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size)); + + gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); + t_result = t_input.cumsum(1); + + hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost); + for (DenseIndex i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); + } + + hipFree((void*)d_t_input); + hipFree((void*)d_t_result); +} + + +void test_cxx11_tensor_scan_hip() +{ + CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128)); + CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128)); +} |