aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/GPU/TypeCasting.h
diff options
context:
space:
mode:
author Sami Kama <sami.kama.git@gmail.com> 2020-03-10 20:28:43 +0000
committer Rasmus Munk Larsen <rmlarsen@google.com> 2020-03-10 20:28:43 +0000
commit b733b8b680885c0fcdfddea5423171468609b5a6 (patch)
tree 1174a4651bbdbe979a8bd33e97edf4011c8cc7e4 /Eigen/src/Core/arch/GPU/TypeCasting.h
parent a45d28256d020a4e871267c9bf00206fe9d2265e (diff)
Remove duplicate pset1 for half and add some comments about why we need to expose pmul/add/div/min/max on host
Diffstat (limited to 'Eigen/src/Core/arch/GPU/TypeCasting.h')
-rw-r--r-- Eigen/src/Core/arch/GPU/TypeCasting.h 33
1 files changed, 29 insertions, 4 deletions
diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
index c278f3fe8..754546225 100644
--- a/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -17,12 +17,13 @@ namespace internal {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+
// Cast traits for Eigen::half -> float, declared under the CUDA/HIP FP16
// guard above. VectorizedCast = 1 marks the cast as vectorized.
// This hunk swaps the two ratios: the removed (-) lines had
// SrcCoeffRatio = 2 / TgtCoeffRatio = 1; the added (+) lines set
// SrcCoeffRatio = 1 / TgtCoeffRatio = 2.
// NOTE(review): presumably the ratios are the number of source packets
// consumed and target packets produced per pcast call -- confirm against
// the primary type_casting_traits template.
template <>
struct type_casting_traits<Eigen::half, float> {
enum {
VectorizedCast = 1,
- SrcCoeffRatio = 2,
- TgtCoeffRatio = 1
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 2
};
};
@@ -32,15 +33,39 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(con
return make_float4(r1.x, r1.y, r2.x, r2.y);
}
+
// Converts two float4 inputs (eight floats in total) into one Packet4h2.
// The result is filled through a half2* alias with four half2 values, in
// this order: (a.x, a.y), (a.z, a.w), (b.x, b.y), (b.z, b.w). Each pair is
// produced by __floats2half2_rn.
// NOTE(review): presumably Packet4h2 stores exactly four half2 lanes and the
// _rn suffix denotes round-to-nearest -- confirm against the Packet4h2
// definition and the CUDA/HIP half-precision intrinsics documentation.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
+ Packet4h2 r;
+ half2* r_alias=reinterpret_cast<half2*>(&r); // view r as an array of half2 so each converted pair can be stored by index
+ r_alias[0]=__floats2half2_rn(a.x,a.y);
+ r_alias[1]=__floats2half2_rn(a.z,a.w);
+ r_alias[2]=__floats2half2_rn(b.x,b.y);
+ r_alias[3]=__floats2half2_rn(b.z,b.w);
+ return r;
+}
+
// Cast traits for float -> Eigen::half. VectorizedCast = 1 marks the cast
// as vectorized.
// This hunk swaps the two ratios: the removed (-) lines had
// SrcCoeffRatio = 1 / TgtCoeffRatio = 2; the added (+) lines set
// SrcCoeffRatio = 2 / TgtCoeffRatio = 1, which lines up with the
// two-argument pcast<float4, Packet4h2>(a, b) overload in this file.
// NOTE(review): presumably the ratios are the number of source packets
// consumed and target packets produced per pcast call -- confirm against
// the primary type_casting_traits template.
template <>
struct type_casting_traits<float, Eigen::half> {
enum {
VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 2
+ SrcCoeffRatio = 2,
+ TgtCoeffRatio = 1
};
};
// Converts the leading part of a Packet4h2 into a single float4.
// The input is read through a const half2* alias; the first two half2
// values are widened with __half22float2 and stored as r.x, r.y, r.z, r.w.
// NOTE(review): type_casting_traits<Eigen::half, float> in this file now
// sets TgtCoeffRatio = 2, yet this overload returns one float4 per
// Packet4h2 and drops the remaining lanes -- verify this is intended.
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
+ // Only a_alias[0] and a_alias[1] are converted; the rest of the input is discarded
+ float4 r;
+ const half2* a_alias=reinterpret_cast<const half2*>(&a); // view the packet as an array of half2
+ float2 r1 = __half22float2(a_alias[0]);
+ float2 r2 = __half22float2(a_alias[1]);
+ r.x=static_cast<float>(r1.x);
+ r.y=static_cast<float>(r1.y);
+ r.z=static_cast<float>(r2.x);
+ r.w=static_cast<float>(r2.y);
+ return r;
+}
+
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
// Simply discard the second half of the input
return __floats2half2_rn(a.x, a.y);