aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-03-03 10:34:20 -0800
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-03-03 10:34:20 -0800
commit1032441c6fea0a0d98b394abe8ffdb228256f47b (patch)
tree9d3801e7f392aebb6f7443a860b794778e9d0a7b /Eigen
parent1da10a73580b3f3b672397ad65cded9300535ac7 (diff)
Enable partial support for half floats on Kepler GPUs.
Diffstat (limited to 'Eigen')
-rw-r--r--Eigen/src/Core/arch/CUDA/PacketMathHalf.h142
1 files changed, 78 insertions, 64 deletions
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 1a1b4ec3d..720155ce1 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -17,8 +17,10 @@
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+// The following operations require arch >= 5.3
+#if __CUDA_ARCH__ >= 530
__device__ half operator + (const half& a, const half& b) {
return __hadd(a, b);
}
@@ -60,6 +62,7 @@ __device__ half abs(const half& a) {
return result;
}
}
+#endif
namespace Eigen {
namespace internal {
@@ -98,8 +101,79 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half&
return __half2half2(from);
}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
+ return *reinterpret_cast<const half2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
+ return __halves2half2(from[0], from[1]);
+}
+
+template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) {
+ return __halves2half2(from[0], from[0]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
+ *reinterpret_cast<half2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
+ to[0] = __low2half(from);
+ to[1] = __high2half(from);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+#if __CUDA_ARCH__ >= 320
+ return __ldg((const half2*)from);
+#else
+ return __halves2half2(*(from+0), *(from+1));
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
+#if __CUDA_ARCH__ >= 320
+ return __halves2half2(__ldg(from+0), __ldg(from+1));
+#else
+ return __halves2half2(*(from+0), *(from+1));
+#endif
+}
+
+template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
+ return __halves2half2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
+ to[stride*0] = __low2half(from);
+ to[stride*1] = __high2half(from);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
+ return __low2half(a);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
+ half2 result;
+ result.x = a.x & 0x7FFF7FFF;
+ return result;
+}
+
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<half2,2>& kernel) {
+ half a1 = __low2half(kernel.packet[0]);
+ half a2 = __high2half(kernel.packet[0]);
+ half b1 = __low2half(kernel.packet[1]);
+ half b2 = __high2half(kernel.packet[1]);
+ kernel.packet[0] = __halves2half2(a1, b1);
+ kernel.packet[1] = __halves2half2(a2, b2);
+}
+
+// The following operations require arch >= 5.3
+#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
- return __halves2half2(a, __hadd(a, __float2half(1)));
+ return __halves2half2(a, __hadd(a, __float2half(1.0f)));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
@@ -140,7 +214,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2&
float b1 = __low2float(b);
float b2 = __high2float(b);
half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
- half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+ half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
return __halves2half2(r1, r2);
}
@@ -154,50 +228,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
return __halves2half2(r1, r2);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
- return *reinterpret_cast<const half2*>(from);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
- return __halves2half2(from[0], from[1]);
-}
-
-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) {
- return __halves2half2(from[0], from[0]);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
- *reinterpret_cast<half2*>(to) = from;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
- to[0] = __low2half(from);
- to[1] = __high2half(from);
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
- return __ldg((const half2*)from);
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
- return __halves2half2(__ldg(from+0), __ldg(from+1));
-}
-
-template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
- return __halves2half2(from[0*stride], from[1*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
- to[stride*0] = __low2half(from);
- to[stride*1] = __high2half(from);
-}
-
-template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
- return __low2half(a);
-}
-
template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
return __hadd(__low2half(a), __high2half(a));
}
@@ -217,23 +247,7 @@ template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
return __hmul(__low2half(a), __high2half(a));
}
-
-template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
- half2 result;
- result.x = a.x & 0x7FFF7FFF;
- return result;
-}
-
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<half2,2>& kernel) {
- half a1 = __low2half(kernel.packet[0]);
- half a2 = __high2half(kernel.packet[0]);
- half b1 = __low2half(kernel.packet[1]);
- half b2 = __high2half(kernel.packet[1]);
- kernel.packet[0] = __halves2half2(a1, b1);
- kernel.packet[1] = __halves2half2(a2, b2);
-}
+#endif
} // end namespace internal