Merged eigen/eigen into default

author: Deven Desai <deven.desai.amd@gmail.com> 2019-03-19 16:52:38 -0400
committer: Deven Desai <deven.desai.amd@gmail.com> 2019-03-19 16:52:38 -0400
commit: 2dbea5510fe5cb64dbfdef9042c04a3a92b87f76 (patch)
tree: c187e7ec5e90a191e19466ff6084dd8f053dba7e /Eigen/src/Core/arch/GPU
parent: e7e6809e6b38a5928efc0b5ca9520258e4d1fb3a (diff)
parent: 5c93b38c5fca514a08084e32feb8a8fb27bf3665 (diff)
2 files changed, 325 insertions, 16 deletions
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index ddf37b9c1..cd4615a45 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -53,6 +53,7 @@ template<> struct packet_traits<float> : default_packet_traits
     HasBetaInc = 1,
 
     HasBlend = 0,
+    HasFloor = 1,
   };
 };
 
@@ -86,12 +87,13 @@ template<> struct packet_traits<double> : default_packet_traits
     HasBetaInc = 1,
 
     HasBlend = 0,
+    HasFloor = 1,
   };
 };
 
 
-template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
+template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef double2 half; };
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
   return make_float4(from, from, from, from);
@@ -100,6 +102,117 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
   return make_double2(from, from);
 }
 
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+                                                        const float& b) {
+  return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
+                                                         const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) &
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
+                                                       const float& b) {
+  return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
+                                                        const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) |
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
+                                                        const float& b) {
+  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
+                                                         const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) ^
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
+                                                           const float& b) {
+  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
+                                                            const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) &
+                              ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
+                                                    const float& b) {
+  return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
+                                                     const double& b) {
+  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+
+}  // namespace
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
+                                                          const float4& b) {
+  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
+                     bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
+                                                            const double2& b) {
+  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
+                                                         const float4& b) {
+  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
+                     bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
+                                                           const double2& b) {
+  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
+                                                          const float4& b) {
+  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
+                     bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
+                                                            const double2& b) {
+  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
+                     bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pandnot<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
+                     eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_eq<double2>(const double2& a, const double2& b) {
+  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
   return make_float4(a, a+1, a+2, a+3);
@@ -297,6 +410,13 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
   return make_double2(fabs(a.x), fabs(a.y));
 }
 
+template<> EIGEN_DEVICE_FUNC inline float4  pfloor<float4>(const float4& a) {
+  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+  return make_double2(floor(a.x), floor(a.y));
+}
+
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   float tmp = kernel.packet[0].y;
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 8787adcde..869fa7ec6 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -30,6 +30,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
     size=2,
     HasHalfPacket = 0,
     HasAdd    = 1,
+    HasSub    = 1,
     HasMul    = 1,
     HasDiv    = 1,
     HasSqrt   = 1,
@@ -41,7 +42,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; };
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
 
@@ -137,12 +138,22 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
-  half2 result;
-  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
-  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
-  return result;
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
+  return __halves2half2(result1, result2);
 }
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue<half2>(const half2& a) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<half2>(true_half);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero<half2>(const half2& a) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<half2>(false_half);
+}
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
@@ -171,6 +182,68 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
 #endif
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq<half2>(const half2& a,
+                                                           const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand<half2>(const half2& a,
+                                                        const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
+  return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por<half2>(const half2& a,
+                                                       const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
+  return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor<half2>(const half2& a,
+                                                        const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
+  return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot<half2>(const half2& a,
+                                                           const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
+  return __halves2half2(result1, result2);
+}
+
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if defined(EIGEN_HIP_DEVICE_COMPILE)
 
@@ -500,6 +573,7 @@ struct packet_traits<half> : default_packet_traits {
     HasAdd    = 1,
     HasSub    = 1,
     HasMul    = 1,
+    HasDiv    = 1,
     HasNegate = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
@@ -507,7 +581,6 @@ struct packet_traits<half> : default_packet_traits {
     HasMax    = 0,
     HasConj   = 0,
     HasSetLinear = 0,
-    HasDiv = 0,
     HasSqrt = 0,
     HasRsqrt = 0,
     HasExp = 0,
@@ -517,7 +590,7 @@ struct packet_traits<half> : default_packet_traits {
 };
 
 
-template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
+template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; };
 
 template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
   Packet16h result;
@@ -640,6 +713,36 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
+  Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+  Packet16h r; r.x = Packet8i(ptrue(a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+  // in some cases Packet8i is a wrapper around __m256i, so we need to 
+  // cast to Packet8i to call the correct overload.
+  Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
+  Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
+  Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
+  Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pcmp_eq(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
   // FIXME we could do that with bit manipulation
   Packet16f af = half2float(a);
@@ -668,6 +771,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, con
   return float2half(rf);
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
   Packet16f from_float = half2float(from);
   return half(predux(from_float));
@@ -952,6 +1062,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasAdd    = 1,
     HasSub    = 1,
     HasMul    = 1,
+    HasDiv    = 1,
     HasNegate = 1,
     HasAbs    = 0,
     HasAbs2   = 0,
@@ -959,7 +1070,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasMax    = 0,
     HasConj   = 0,
     HasSetLinear = 0,
-    HasDiv = 0,
     HasSqrt = 0,
     HasRsqrt = 0,
     HasExp = 0,
@@ -969,7 +1079,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
 };
 
 
-template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; };
 
 template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
   Packet8h result;
@@ -1063,6 +1173,32 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+  Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+  // in some cases Packet4i is a wrapper around __m128i, so we either need to 
+  // cast to Packet4i to directly call the intrinsics as below:
+  Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
+  Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
+  Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
+  Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pcmp_eq(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
 
 template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
@@ -1093,6 +1229,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const
   return float2half(rf);
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
 {
   Packet8h result;
@@ -1279,9 +1422,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     AlignedOnScalar = 1,
     size = 4,
     HasHalfPacket = 0,
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
     HasNegate = 0,
     HasAbs    = 0,
     HasAbs2   = 0,
@@ -1289,7 +1433,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasMax    = 0,
     HasConj   = 0,
     HasSetLinear = 0,
-    HasDiv = 0,
     HasSqrt = 0,
     HasRsqrt = 0,
     HasExp = 0,
@@ -1299,7 +1442,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
 };
 
 
-template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; };
 
 template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
   Packet4h result;
@@ -1336,6 +1479,29 @@ template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const
   return result;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha - hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
 template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
   __int64_t a64 = _mm_cvtm64_si64(a.x);
   __int64_t b64 = _mm_cvtm64_si64(b.x);
@@ -1359,6 +1525,29 @@ template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const
   return result;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha / hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
 template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
   Packet4h result;
   result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
author	Deven Desai <deven.desai.amd@gmail.com>	2019-03-19 16:52:38 -0400
committer	Deven Desai <deven.desai.amd@gmail.com>	2019-03-19 16:52:38 -0400
commit	2dbea5510fe5cb64dbfdef9042c04a3a92b87f76 (patch)
tree	c187e7ec5e90a191e19466ff6084dd8f053dba7e /Eigen/src/Core/arch/GPU
parent	e7e6809e6b38a5928efc0b5ca9520258e4d1fb3a (diff)
parent	5c93b38c5fca514a08084e32feb8a8fb27bf3665 (diff)