remove duplicate pset1 for half and add some comments about why we need to expose pmul/add/div/min/max on host

author: Sami Kama <sami.kama.git@gmail.com> 2020-03-10 20:28:43 +0000
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2020-03-10 20:28:43 +0000
commit: b733b8b680885c0fcdfddea5423171468609b5a6 (patch)
tree: 1174a4651bbdbe979a8bd33e97edf4011c8cc7e4 /Eigen/src/Core/arch/GPU/TypeCasting.h
parent: a45d28256d020a4e871267c9bf00206fe9d2265e (diff)
1 files changed, 29 insertions, 4 deletions
diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
index c278f3fe8..754546225 100644
--- a/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -17,12 +17,13 @@ namespace internal {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
   (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 
+
 template <>
 struct type_casting_traits<Eigen::half, float> {
   enum {
     VectorizedCast = 1,
-    SrcCoeffRatio = 2,
-    TgtCoeffRatio = 1
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
   };
 };
 
@@ -32,15 +33,39 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(con
   return make_float4(r1.x, r1.y, r2.x, r2.y);
 }
 
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
+  Packet4h2 r;
+  half2* r_alias=reinterpret_cast<half2*>(&r);
+  r_alias[0]=__floats2half2_rn(a.x,a.y);
+  r_alias[1]=__floats2half2_rn(a.z,a.w);
+  r_alias[2]=__floats2half2_rn(b.x,b.y);
+  r_alias[3]=__floats2half2_rn(b.z,b.w);
+  return r;
+}
+
 template <>
 struct type_casting_traits<float, Eigen::half> {
   enum {
     VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 2
+    SrcCoeffRatio = 2,
+    TgtCoeffRatio = 1
   };
 };
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
+  // Simply discard the second half of the input
+  float4 r;
+  const half2* a_alias=reinterpret_cast<const half2*>(&a);
+  float2 r1 = __half22float2(a_alias[0]);
+  float2 r2 = __half22float2(a_alias[1]);
+  r.x=static_cast<float>(r1.x);
+  r.y=static_cast<float>(r1.y);
+  r.z=static_cast<float>(r2.x);
+  r.w=static_cast<float>(r2.y);
+  return r;
+}
+
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
   // Simply discard the second half of the input
   return __floats2half2_rn(a.x, a.y);
author	Sami Kama <sami.kama.git@gmail.com>	2020-03-10 20:28:43 +0000
committer	Rasmus Munk Larsen <rmlarsen@google.com>	2020-03-10 20:28:43 +0000
commit	b733b8b680885c0fcdfddea5423171468609b5a6 (patch)
tree	1174a4651bbdbe979a8bd33e97edf4011c8cc7e4 /Eigen/src/Core/arch/GPU/TypeCasting.h
parent	a45d28256d020a4e871267c9bf00206fe9d2265e (diff)