aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/GPU
diff options
context:
space:
mode:
authorGravatar Antonio Sanchez <cantonios@google.com>2020-12-07 19:11:07 -0800
committerGravatar Rasmus Munk Larsen <rmlarsen@google.com>2020-12-08 20:31:02 +0000
commit8cfe0db108f54e4ceae2e94c47c5d2eb5116197b (patch)
treeb568be6551dca471996fa5864ee6615b620a7052 /Eigen/src/Core/arch/GPU
parentbaf9d762b70b030f797ab4c8e5e6ecebf5095122 (diff)
Fix host/device calls for __half.
The previous code had `__host__ __device__` functions calling `__device__` functions (e.g. `__low2half`) which caused build failures in tensorflow. Also tried to simplify the `#ifdef` guards to make them more clear.
Diffstat (limited to 'Eigen/src/Core/arch/GPU')
-rw-r--r--Eigen/src/Core/arch/GPU/PacketMath.h302
1 files changed, 167 insertions, 135 deletions
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index fb32c98ac..83bd551a0 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -14,10 +14,21 @@ namespace Eigen {
namespace internal {
+// Read-only data cached load available.
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
+#define EIGEN_GPU_HAS_LDG 1
+#endif
+
+// FP16 math available.
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
+#endif
+
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
@@ -237,7 +248,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
pcmp_lt<double2>(const double2& a, const double2& b) {
return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
}
-#endif // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE)
+#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
return make_float4(a, a+1, a+2, a+3);
@@ -342,7 +353,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const float4*)from);
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -350,7 +361,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const double2*)from);
#else
return make_double2(from[0], from[1]);
@@ -359,7 +370,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -367,7 +378,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return make_double2(__ldg(from+0), __ldg(from+1));
#else
return make_double2(from[0], from[1]);
@@ -511,12 +522,43 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
};
};
+namespace {
+// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ return __halves2half2(a, b);
+#else
+ // Round-about way since __halves2half2 is a __device__ function.
+ return __floats2half2_rn(__half2float(a), __half2float(b));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ return __low2half(a);
+#else
+ return __float2half(__low2float(a));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ return __high2half(a);
+#else
+ return __float2half(__high2float(a));
+#endif
+}
+} // namespace
+
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
-#if defined(EIGEN_HIPCC)
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
return half2half2(from);
-#else
+#elif defined(EIGEN_CUDA_ARCH)
return __half2half2(from);
+#else
+ const float f = __half2float(from);
+ return __floats2half2_rn(f, f);
#endif
}
@@ -532,7 +574,8 @@ pset1<Packet4h2>(const Eigen::half& from) {
return r;
}
-#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+// We now need this visible on both host and device.
+// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
namespace {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
@@ -540,11 +583,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
- return __halves2half2(from[0], from[1]);
+ return combine_half(from[0], from[1]);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
- return __halves2half2(from[0], from[0]);
+ return combine_half(from[0], from[0]);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
@@ -554,170 +597,164 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
const half2& from) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
- to[0] = __low2half(from);
- to[1] = __high2half(from);
-#else
- // Unfortunately __low2half and __high2half are only __device__ functions.
- to[0] = __float2half(__low2float(from));
- to[1] = __float2half(__high2float(from));
-#endif
+ to[0] = get_half2_low(from);
+ to[1] = get_half2_high(from);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
const Eigen::half* from) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const half2*)from);
#else
- return __halves2half2(*(from+0), *(from+1));
+ return combine_half(*(from+0), *(from+1));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
const Eigen::half* from) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
- return __halves2half2(*(from+0), *(from+1));
+ return combine_half(*(from+0), *(from+1));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
Index stride) {
- return __halves2half2(from[0*stride], from[1*stride]);
+ return combine_half(from[0*stride], from[1*stride]);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
Eigen::half* to, const half2& from, Index stride) {
- to[stride*0] = __low2half(from);
- to[stride*1] = __high2half(from);
+ to[stride*0] = get_half2_low(from);
+ to[stride*1] = get_half2_high(from);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
- return __low2half(a);
+ return get_half2_low(a);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
return pset1<half2>(true_half);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
half false_half = half_impl::raw_uint16_to_half(0x0000u);
return pset1<half2>(false_half);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
- __half a1 = __low2half(kernel.packet[0]);
- __half a2 = __high2half(kernel.packet[0]);
- __half b1 = __low2half(kernel.packet[1]);
- __half b2 = __high2half(kernel.packet[1]);
- kernel.packet[0] = __halves2half2(a1, b1);
- kernel.packet[1] = __halves2half2(a2, b2);
+ __half a1 = get_half2_low(kernel.packet[0]);
+ __half a2 = get_half2_high(kernel.packet[0]);
+ __half b1 = get_half2_low(kernel.packet[1]);
+ __half b2 = get_half2_high(kernel.packet[1]);
+ kernel.packet[0] = combine_half(a1, b1);
+ kernel.packet[1] = combine_half(a2, b2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
- return __halves2half2(a, __float2half(f));
+ return combine_half(a, __float2half(f));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
const half2& a,
const half2& b) {
- half mask_low = __low2half(mask);
- half mask_high = __high2half(mask);
- half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
- half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
- return __halves2half2(result_low, result_high);
+ half mask_low = get_half2_low(mask);
+ half mask_high = get_half2_high(mask);
+ half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
+ half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
+ return combine_half(result_low, result_high);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
const half2& b) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
half false_half = half_impl::raw_uint16_to_half(0x0000u);
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
- return __halves2half2(eq1, eq2);
+ return combine_half(eq1, eq2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
const half2& b) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
half false_half = half_impl::raw_uint16_to_half(0x0000u);
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
- return __halves2half2(eq1, eq2);
+ return combine_half(eq1, eq2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
const half2& b) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
const half2& b) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
const half2& b) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
const half2& b) {
- half a1 = __low2half(a);
- half a2 = __high2half(a);
- half b1 = __low2half(b);
- half b2 = __high2half(b);
+ half a1 = get_half2_low(a);
+ half a2 = get_half2_high(a);
+ half b1 = get_half2_low(b);
+ half b2 = get_half2_high(b);
half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
- return __halves2half2(result1, result2);
+ return combine_half(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@@ -732,7 +769,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hsub2(a, b);
#else
float a1 = __low2float(a);
@@ -746,7 +783,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hneg2(a);
#else
float a1 = __low2float(a);
@@ -759,7 +796,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@@ -775,7 +812,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
const half2& b,
const half2& c) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
@@ -792,9 +829,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
-#else // EIGEN_CUDA_ARCH
+#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -811,9 +848,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
- __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
- __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
- return __halves2half2(r1, r2);
+ __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
+ __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
+ return combine_half(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
@@ -822,13 +859,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
- __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
- __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
- return __halves2half2(r1, r2);
+ __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
+ __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
+ return combine_half(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
@@ -838,31 +875,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
- return a1 > a2 ? __low2half(a) : __high2half(a);
+ return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
- return a1 < a2 ? __low2half(a) : __high2half(a);
+ return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
@@ -996,7 +1033,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_GPU_HAS_LDG)
Packet4h2 r;
r = __ldg((const Packet4h2*)from);
return r;
@@ -1028,10 +1065,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
- p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
- p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
- p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
- p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
+ p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
+ p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
+ p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
+ p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
return r;
}
@@ -1066,13 +1103,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
- const Packet4h2& a) {
+ const Packet4h2& /*a*/) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
return pset1<Packet4h2>(true_half);
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
half false_half = half_impl::raw_uint16_to_half(0x0000u);
return pset1<Packet4h2>(false_half);
}
@@ -1112,12 +1149,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose_half(half2& f0, half2& f1) {
- __half a1 = __low2half(f0);
- __half a2 = __high2half(f0);
- __half b1 = __low2half(f1);
- __half b2 = __high2half(f1);
- f0 = __halves2half2(a1, b1);
- f1 = __halves2half2(a2, b2);
+ __half a1 = get_half2_low(f0);
+ __half a2 = get_half2_high(f0);
+ __half b1 = get_half2_low(f1);
+ __half b2 = get_half2_high(f1);
+ f0 = combine_half(a1, b1);
+ f1 = combine_half(a2, b2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
@@ -1191,9 +1228,7 @@ plset<Packet4h2>(const Eigen::half& a) {
p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
__hadd(a, __float2half(7.0f)));
return r;
-#else // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
+#elif EIGEN_CUDA_ARCH >= 530
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
@@ -1216,14 +1251,12 @@ plset<Packet4h2>(const Eigen::half& a) {
float f = __half2float(a);
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
- p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
- p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
- p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
- p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
+ p_alias[0] = combine_half(a, __float2half(f + 1.0f));
+ p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
+ p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
+ p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
return r;
#endif
-
-#endif
}
template <>
@@ -1441,9 +1474,9 @@ template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
const Packet4h2& a) {
const half2* a_alias = reinterpret_cast<const half2*>(&a);
- half2 m0 = __halves2half2(predux_max(a_alias[0]),
+ half2 m0 = combine_half(predux_max(a_alias[0]),
predux_max(a_alias[1]));
- half2 m1 = __halves2half2(predux_max(a_alias[2]),
+ half2 m1 = combine_half(predux_max(a_alias[2]),
predux_max(a_alias[3]));
__half first = predux_max(m0);
__half second = predux_max(m1);
@@ -1460,9 +1493,9 @@ template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
const Packet4h2& a) {
const half2* a_alias = reinterpret_cast<const half2*>(&a);
- half2 m0 = __halves2half2(predux_min(a_alias[0]),
+ half2 m0 = combine_half(predux_min(a_alias[0]),
predux_min(a_alias[1]));
- half2 m1 = __halves2half2(predux_min(a_alias[2]),
+ half2 m1 = combine_half(predux_min(a_alias[2]),
predux_min(a_alias[3]));
__half first = predux_min(m0);
__half second = predux_min(m1);
@@ -1564,7 +1597,7 @@ prsqrt<Packet4h2>(const Packet4h2& a) {
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@@ -1580,7 +1613,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@@ -1596,12 +1629,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
+#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -1609,7 +1639,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
-
#endif
}
@@ -1620,9 +1649,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
- __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
- __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
- return __halves2half2(r1, r2);
+ __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
+ __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
+ return combine_half(r1, r2);
}
template<>
@@ -1632,14 +1661,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
- __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
- __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
- return __halves2half2(r1, r2);
+ __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
+ __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
+ return combine_half(r1, r2);
}
-#endif // defined(EIGEN_CUDA_ARCH)
+// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+
+#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
-#endif // defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)
+#undef EIGEN_GPU_HAS_LDG
+#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
} // end namespace internal