aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/NEON/PacketMath.h
diff options
context:
space:
mode:
authorGravatar Rasmus Munk Larsen <rmlarsen@google.com>2020-05-11 13:23:31 -0700
committerGravatar Rasmus Munk Larsen <rmlarsen@google.com>2020-05-14 22:39:13 +0000
commit9b411757abd8458f9689b1384c6bf75da9b82357 (patch)
treeeb23c0c65eaa4df204880c94b287fcf1b43e14cd /Eigen/src/Core/arch/NEON/PacketMath.h
parentd640276d31a7dea9207a68a061a6fa7c9fdf50e5 (diff)
Add missing packet ops for bool, and make it pass the same packet op unit tests as other arithmetic types.
This change also contains a few minor cleanups: 1. Remove packet op pnot, which is not needed for anything other than pcmp_le_or_nan, which can be done in other ways. 2. Remove the "HasInsert" enum, which is no longer needed since we removed the corresponding packet ops. 3. Add faster pselect op for Packet4i when SSE4.1 is supported. Among other things, this makes the fast transposeInPlace() method available for Matrix<bool>. Run on ************** (72 X 2994 MHz CPUs); 2020-05-09T10:51:02.372347913-07:00 CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB Benchmark Time(ns) CPU(ns) Iterations ----------------------------------------------------------------------- BM_TransposeInPlace<float>/4 9.77 9.77 71670320 BM_TransposeInPlace<float>/8 21.9 21.9 31929525 BM_TransposeInPlace<float>/16 66.6 66.6 10000000 BM_TransposeInPlace<float>/32 243 243 2879561 BM_TransposeInPlace<float>/59 844 844 829767 BM_TransposeInPlace<float>/64 933 933 750567 BM_TransposeInPlace<float>/128 3944 3945 177405 BM_TransposeInPlace<float>/256 16853 16853 41457 BM_TransposeInPlace<float>/512 204952 204968 3448 BM_TransposeInPlace<float>/1k 1053889 1053861 664 BM_TransposeInPlace<bool>/4 14.4 14.4 48637301 BM_TransposeInPlace<bool>/8 36.0 36.0 19370222 BM_TransposeInPlace<bool>/16 31.5 31.5 22178902 BM_TransposeInPlace<bool>/32 111 111 6272048 BM_TransposeInPlace<bool>/59 626 626 1000000 BM_TransposeInPlace<bool>/64 428 428 1632689 BM_TransposeInPlace<bool>/128 1677 1677 417377 BM_TransposeInPlace<bool>/256 7126 7126 96264 BM_TransposeInPlace<bool>/512 29021 29024 24165 BM_TransposeInPlace<bool>/1k 116321 116330 6068
Diffstat (limited to 'Eigen/src/Core/arch/NEON/PacketMath.h')
-rw-r--r--Eigen/src/Core/arch/NEON/PacketMath.h57
1 files changed, 5 insertions, 52 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index ee5a938b9..e11af1dca 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -136,7 +136,6 @@ struct packet_traits<float> : default_packet_traits
HasConj = 1,
HasSetLinear = 0,
HasBlend = 0,
- HasInsert = 1,
HasDiv = 1,
HasFloor = 1,
@@ -177,8 +176,7 @@ struct packet_traits<int8_t> : default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 0,
- HasBlend = 0,
- HasInsert = 1,
+ HasBlend = 0
};
};
@@ -209,7 +207,6 @@ struct packet_traits<uint8_t> : default_packet_traits
HasConj = 1,
HasSetLinear = 0,
HasBlend = 0,
- HasInsert = 1,
HasSqrt = 1
};
@@ -241,8 +238,7 @@ struct packet_traits<int16_t> : default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 0,
- HasBlend = 0,
- HasInsert = 1,
+ HasBlend = 0
};
};
@@ -273,8 +269,6 @@ struct packet_traits<uint16_t> : default_packet_traits
HasConj = 1,
HasSetLinear = 0,
HasBlend = 0,
- HasInsert = 1,
-
HasSqrt = 1
};
};
@@ -305,8 +299,7 @@ struct packet_traits<int32_t> : default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 0,
- HasBlend = 0,
- HasInsert = 1,
+ HasBlend = 0
};
};
@@ -337,7 +330,6 @@ struct packet_traits<uint32_t> : default_packet_traits
HasConj = 1,
HasSetLinear = 0,
HasBlend = 0,
- HasInsert = 1,
HasSqrt = 1
};
@@ -370,8 +362,7 @@ struct packet_traits<int64_t> : default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 0,
- HasBlend = 0,
- HasInsert = 1,
+ HasBlend = 0
};
};
@@ -402,8 +393,7 @@ struct packet_traits<uint64_t> : default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 0,
- HasBlend = 0,
- HasInsert = 1,
+ HasBlend = 0
};
};
@@ -1498,42 +1488,6 @@ template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, con
template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
{ return vbicq_u64(a,b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pnot<Packet2f>(const Packet2f& a)
-{ return vreinterpret_f32_u32(vmvn_u32(vreinterpret_u32_f32(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pnot<Packet4f>(const Packet4f& a)
-{ return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pnot<Packet4c>(const Packet4c& a)
-{ return ~a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pnot<Packet8c>(const Packet8c& a)
-{ return vmvn_s8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pnot<Packet16c>(const Packet16c& a)
-{ return vmvnq_s8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pnot<Packet4uc>(const Packet4uc& a)
-{ return ~a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pnot<Packet8uc>(const Packet8uc& a)
-{ return vmvn_u8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pnot<Packet16uc>(const Packet16uc& a)
-{ return vmvnq_u8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pnot<Packet4s>(const Packet4s& a)
-{ return vmvn_s16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pnot<Packet8s>(const Packet8s& a)
-{ return vmvnq_s16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pnot<Packet4us>(const Packet4us& a)
-{ return vmvn_u16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pnot<Packet8us>(const Packet8us& a)
-{ return vmvnq_u16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pnot<Packet2i>(const Packet2i& a)
-{ return vmvn_s32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pnot<Packet4i>(const Packet4i& a)
-{ return vmvnq_s32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pnot<Packet2ui>(const Packet2ui& a)
-{ return vmvn_u32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pnot<Packet4ui>(const Packet4ui& a)
-{ return vmvnq_u32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pnot<Packet2l>(const Packet2l& a)
-{ return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pnot<Packet2ul>(const Packet2ul& a)
-{ return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a))); }
template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
@@ -3218,7 +3172,6 @@ template<> struct packet_traits<double> : default_packet_traits
HasConj = 1,
HasSetLinear = 0,
HasBlend = 0,
- HasInsert = 1,
HasDiv = 1,
HasFloor = 1,