aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-05-06 19:16:43 -0700
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-05-06 19:16:43 -0700
commit8adf5cc70f232f919302332e2a2fa0971c34f264 (patch)
treeb9df1f4f15b1a85d6d5c1ef29b1247f1c10c7c31
parentc54ae65c836873c57cac81eaa600348fa28ae98d (diff)
Added support for packet processing of fp16 on kepler and maxwell gpus
-rw-r--r--Eigen/src/Core/arch/CUDA/PacketMathHalf.h99
-rw-r--r--Eigen/src/Core/arch/CUDA/TypeCasting.h2
2 files changed, 93 insertions, 8 deletions
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 61d532e4d..0cebc1017 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -17,8 +17,8 @@
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
-// Most of the following operations require arch >= 5.3
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+// Most of the following operations require arch >= 3.0
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
namespace Eigen {
namespace internal {
@@ -67,13 +67,21 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, co
}
template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
- return __ldg((const half2*)from);
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+#if __CUDA_ARCH__ >= 320
+ return __ldg((const half2*)from);
+#else
+ return __halves2half2(*(from+0), *(from+1));
+#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
- return __halves2half2(__ldg(from+0), __ldg(from+1));
+#if __CUDA_ARCH__ >= 320
+ return __halves2half2(__ldg(from+0), __ldg(from+1));
+#else
+ return __halves2half2(*(from+0), *(from+1));
+#endif
}
template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
@@ -107,30 +115,83 @@ ptranspose(PacketBlock<half2,2>& kernel) {
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
+#if __CUDA_ARCH__ >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+ float f = __half2float(a) + 1.0f;
+ return __halves2half2(a, __float2half(f));
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
return __hadd2(a, b);
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ float r1 = a1 + b1;
+ float r2 = a2 + b2;
+ return __floats2half2_rn(r1, r2);
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
return __hsub2(a, b);
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ float r1 = a1 - b1;
+ float r2 = a2 - b2;
+ return __floats2half2_rn(r1, r2);
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if __CUDA_ARCH__ >= 530
return __hneg2(a);
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return __floats2half2_rn(-a1, -a2);
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
return __hmul2(a, b);
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ float r1 = a1 * b1;
+ float r2 = a2 * b2;
+ return __floats2half2_rn(r1, r2);
+#endif
}
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if __CUDA_ARCH__ >= 530
return __hfma2(a, b, c);
- }
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ float c1 = __low2float(c);
+ float c2 = __high2float(c);
+ float r1 = a1 * b1 + c2;
+ float r2 = a2 * b2 + c2;
+ return __floats2half2_rn(r1, r2);
+#endif
+}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
@@ -163,23 +224,47 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
}
template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
return __hadd(__low2half(a), __high2half(a));
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return half(__float2half_rn(a1 + a2));
+#endif
}
template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
half first = __low2half(a);
half second = __high2half(a);
return __hgt(first, second) ? first : second;
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return half(__float2half_rn(numext::maxi(a1, a2)));
+#endif
}
template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
half first = __low2half(a);
half second = __high2half(a);
return __hlt(first, second) ? first : second;
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return half(__float2half_rn(numext::mini(a1, a2)));
+#endif
}
template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
return __hmul(__low2half(a), __high2half(a));
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return half(__float2half_rn(a1 * a2));
+#endif
}
} // end namespace internal
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index 396b38eaf..3ea218133 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -71,7 +71,7 @@ struct functor_traits<scalar_cast_op<half, float> >
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
template <>
struct type_casting_traits<half, float> {