From 52d54278beefee8b2f19dcca4fd900916154e174 Mon Sep 17 00:00:00 2001 From: Joel Holdsworth Date: Thu, 26 Mar 2020 20:18:19 +0000 Subject: Additional NEON packet-math operations --- Eigen/src/Core/GenericPacketMath.h | 1 + Eigen/src/Core/arch/AVX/Complex.h | 3 +- Eigen/src/Core/arch/AVX/PacketMath.h | 5 +- Eigen/src/Core/arch/AVX512/Complex.h | 3 +- Eigen/src/Core/arch/AVX512/PacketMath.h | 5 +- Eigen/src/Core/arch/NEON/Complex.h | 5 + Eigen/src/Core/arch/NEON/PacketMath.h | 149 ++++++++++++++- Eigen/src/Core/arch/NEON/TypeCasting.h | 322 ++++++++++++++++++++++++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 3 +- Eigen/src/Core/arch/SSE/PacketMath.h | 4 +- test/packetmath.cpp | 69 ++++++- 11 files changed, 560 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index ddfdc39e6..19078fc80 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -58,6 +58,7 @@ struct default_packet_traits HasConj = 1, HasSetLinear = 1, HasBlend = 0, + HasInsert = 0, HasReduxp = 1, HasDiv = 0, diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index f97efd471..4a80bae05 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -42,7 +42,8 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasInsert = 1 }; }; #endif diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index b7d37f92b..5ab989ca9 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -64,6 +64,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, + HasInsert = 1, HasDiv = 1, HasSin = EIGEN_FAST_MATH, @@ -94,6 +95,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size=4, HasHalfPacket = 1, + HasInsert = 1, HasDiv = 1, HasExp = 1, @@ -131,7 +133,8 @@ struct packet_traits : default_packet_traits { HasRsqrt = 0, HasExp = 0, HasLog = 0, - HasBlend = 0 + HasBlend = 0, + HasInsert = 1 }; }; #endif diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 9ee7a284e..4d0318611 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -42,7 +42,8 @@ template<> struct packet_traits > : default_packet_traits HasMin = 0, HasMax = 0, HasSetLinear = 0, - HasReduxp = 0 + HasReduxp = 0, + HasInsert = 1 }; }; diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 98a8a5b0f..99ccc038c 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -76,7 +76,8 @@ struct packet_traits : default_packet_traits { HasRsqrt = 0, HasExp = 0, HasLog = 0, - HasBlend = 0 + HasBlend = 0, + HasInsert = 1 }; }; @@ -90,6 +91,7 @@ template<> struct packet_traits : default_packet_traits size = 16, HasHalfPacket = 1, HasBlend = 0, + HasInsert = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) @@ -118,6 +120,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, + HasInsert = 1, #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index aeb593468..aca3c9e81 
100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -99,6 +99,11 @@ template<> struct unpacket_traits }; }; +template<> EIGEN_STRONG_INLINE Packet1cf pcast(const float& a) +{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); } +template<> EIGEN_STRONG_INLINE Packet2cf pcast(const Packet2f& a) +{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); } + template<> EIGEN_STRONG_INLINE Packet1cf pset1(const std::complex& from) { return Packet1cf(vld1_f32(reinterpret_cast(&from))); } template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 76c61b42f..326873f8a 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -137,6 +137,7 @@ struct packet_traits : default_packet_traits size = 4, HasHalfPacket = 1, + HasCast = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -151,6 +152,7 @@ struct packet_traits : default_packet_traits HasConj = 1, HasSetLinear = 0, HasBlend = 0, + HasInsert = 1, HasReduxp = 1, HasDiv = 1, @@ -178,6 +180,7 @@ struct packet_traits : default_packet_traits size = 16, HasHalfPacket = 1, + HasCast = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -192,6 +195,7 @@ struct packet_traits : default_packet_traits HasConj = 1, HasSetLinear = 0, HasBlend = 0, + HasInsert = 1, HasReduxp = 1 }; }; @@ -208,6 +212,7 @@ struct packet_traits : default_packet_traits size = 16, HasHalfPacket = 1, + HasCast = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -222,6 +227,7 @@ struct packet_traits : default_packet_traits HasConj = 1, HasSetLinear = 0, HasBlend = 0, + HasInsert = 1, HasReduxp = 1, HasSqrt = 1 @@ -240,6 +246,7 @@ struct packet_traits : default_packet_traits size = 8, HasHalfPacket = 1, + HasCast = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -254,6 +261,7 @@ struct packet_traits : default_packet_traits HasConj = 1, HasSetLinear = 0, HasBlend = 0, + HasInsert = 1, HasReduxp = 1 }; }; @@ -270,6 +278,7 @@ struct packet_traits : default_packet_traits size = 8, HasHalfPacket = 1, + HasCast = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -284,6 +293,7 @@ struct packet_traits : default_packet_traits HasConj = 1, HasSetLinear = 0, HasBlend = 0, + HasInsert = 1, HasReduxp = 1, HasSqrt = 1 @@ -302,6 +312,7 @@ struct packet_traits : default_packet_traits size = 4, HasHalfPacket = 1, + HasCast = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -316,6 +327,7 @@ struct packet_traits : default_packet_traits HasConj = 1, HasSetLinear = 0, HasBlend = 0, + HasInsert = 1, HasReduxp = 1 }; }; @@ -332,6 +344,7 @@ struct packet_traits : default_packet_traits size = 4, HasHalfPacket = 1, + HasCast = 1, HasAdd = 1, HasSub = 1, HasShift = 1, @@ -346,6 +359,7 @@ struct packet_traits : default_packet_traits HasConj = 1, HasSetLinear = 0, HasBlend = 0, + HasInsert = 1, HasReduxp = 1, HasSqrt = 1 @@ -1509,6 +1523,43 @@ template<> EIGEN_STRONG_INLINE Packet2l pandnot(const Packet2l& a, con template<> EIGEN_STRONG_INLINE Packet2ul pandnot(const Packet2ul& a, const Packet2ul& b) { return vbicq_u64(a,b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pnot(const Packet2f& a) +{ return vreinterpret_f32_u32(vmvn_u32(vreinterpret_u32_f32(a))); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pnot(const Packet4f& a) +{ return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a))); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pnot(const Packet4c& a) +{ return ~a; } 
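// The float pnot specializations rely on reinterpreting the lanes as u32, applying
// vmvn, and reinterpreting back, because NEON has no bitwise NOT for float registers.
// A minimal standalone sketch of the same trick (the helper name is illustrative only
// and is not introduced by this patch):
#include <arm_neon.h>

static inline float32x4_t bitwise_not_f32x4(float32x4_t a) {
  // Flip every bit of the four float lanes without changing the register width.
  return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a)));
}
// Example: bitwise_not_f32x4(vdupq_n_f32(0.0f)) yields all-ones lanes, the "true"
// mask value consumed by the pselect specializations added further below.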
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pnot(const Packet8c& a) +{ return vmvn_s8(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pnot(const Packet16c& a) +{ return vmvnq_s8(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pnot(const Packet4uc& a) +{ return ~a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pnot(const Packet8uc& a) +{ return vmvn_u8(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pnot(const Packet16uc& a) +{ return vmvnq_u8(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pnot(const Packet4s& a) +{ return vmvn_s16(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pnot(const Packet8s& a) +{ return vmvnq_s16(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pnot(const Packet4us& a) +{ return vmvn_u16(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pnot(const Packet8us& a) +{ return vmvnq_u16(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pnot(const Packet2i& a) +{ return vmvn_s32(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pnot(const Packet4i& a) +{ return vmvnq_s32(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pnot(const Packet2ui& a) +{ return vmvn_u32(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pnot(const Packet4ui& a) +{ return vmvnq_u32(a); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pnot(const Packet2l& a) +{ return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a))); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pnot(const Packet2ul& a) +{ return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a))); } + template EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) { return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } template EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); } @@ -3431,6 +3482,82 @@ ptranspose(PacketBlock& kernel) #endif } +template<> EIGEN_DEVICE_FUNC inline Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b) +{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) +{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) +{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) +{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) +{ return vbsl_u8(mask, a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b) +{ return vbslq_u8(mask, a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) +{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) +{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) +{ return vbsl_u16(mask, a, b); } 
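// pselect maps directly onto NEON's bit-select: vbsl picks bits from the second
// operand where the mask bit is 1 and from the third operand where it is 0. A
// minimal standalone sketch (helper name illustrative only), assuming each mask lane
// holds 0xFFFFFFFF for "take a" or 0x00000000 for "take b", exactly what NEON
// comparisons such as vcgtq_f32 produce:
#include <arm_neon.h>

static inline float32x4_t select_f32x4(uint32x4_t lane_mask, float32x4_t a, float32x4_t b) {
  // Lane-wise "mask ? a : b" via bitwise selection.
  return vbslq_f32(lane_mask, a, b);
}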
+template<> EIGEN_DEVICE_FUNC inline Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) +{ return vbslq_u16(mask, a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) +{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) +{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) +{ return vbsl_u32(mask, a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) +{ return vbslq_u32(mask, a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) +{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); } +template<> EIGEN_DEVICE_FUNC inline Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) +{ return vbslq_u64(mask, a, b); } + +EIGEN_DEVICE_FUNC inline Packet2f pinsertfirst(const Packet2f& a, float b) { return vset_lane_f32(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet4f pinsertfirst(const Packet4f& a, float b) { return vsetq_lane_f32(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet4c pinsertfirst(const Packet4c& a, int8_t b) +{ + return static_cast((static_cast(a) & 0xffffff00u) | + (static_cast(b) & 0xffu)); +} +EIGEN_DEVICE_FUNC inline Packet8c pinsertfirst(const Packet8c& a, int8_t b) { return vset_lane_s8(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet16c pinsertfirst(const Packet16c& a, int8_t b) { return vsetq_lane_s8(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet4uc pinsertfirst(const Packet4uc& a, uint8_t b) { return (a & ~0xffu) | b; } +EIGEN_DEVICE_FUNC inline Packet8uc pinsertfirst(const Packet8uc& a, uint8_t b) { return vset_lane_u8(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet16uc pinsertfirst(const Packet16uc& a, uint8_t b) { return vsetq_lane_u8(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet4s pinsertfirst(const Packet4s& a, int16_t b) { return vset_lane_s16(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet8s pinsertfirst(const Packet8s& a, int16_t b) { return vsetq_lane_s16(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet4us pinsertfirst(const Packet4us& a, uint16_t b) { return vset_lane_u16(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet8us pinsertfirst(const Packet8us& a, uint16_t b) { return vsetq_lane_u16(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet2i pinsertfirst(const Packet2i& a, int32_t b) { return vset_lane_s32(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet4i pinsertfirst(const Packet4i& a, int32_t b) { return vsetq_lane_s32(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet2ui pinsertfirst(const Packet2ui& a, uint32_t b) { return vset_lane_u32(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet4ui pinsertfirst(const Packet4ui& a, uint32_t b) { return vsetq_lane_u32(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet2l pinsertfirst(const Packet2l& a, int64_t b) { return vsetq_lane_s64(b, a, 0); } +EIGEN_DEVICE_FUNC inline Packet2ul pinsertfirst(const Packet2ul& a, uint64_t b) { return vsetq_lane_u64(b, a, 0); } + +EIGEN_DEVICE_FUNC inline Packet2f pinsertlast(const Packet2f& a, float b) { return vset_lane_f32(b, a, 1); } +EIGEN_DEVICE_FUNC inline Packet4f pinsertlast(const Packet4f& a, float b) { return vsetq_lane_f32(b, a, 3); } +EIGEN_DEVICE_FUNC inline Packet4c pinsertlast(const Packet4c& a, int8_t b) +{ return 
(static_cast(a) & 0x00ffffffu) | (static_cast(b) << 24); } +EIGEN_DEVICE_FUNC inline Packet8c pinsertlast(const Packet8c& a, int8_t b) { return vset_lane_s8(b, a, 7); } +EIGEN_DEVICE_FUNC inline Packet16c pinsertlast(const Packet16c& a, int8_t b) { return vsetq_lane_s8(b, a, 15); } +EIGEN_DEVICE_FUNC inline Packet4uc pinsertlast(const Packet4uc& a, uint8_t b) { return (a & ~0xff000000u) | (b << 24); } +EIGEN_DEVICE_FUNC inline Packet8uc pinsertlast(const Packet8uc& a, uint8_t b) { return vset_lane_u8(b, a, 7); } +EIGEN_DEVICE_FUNC inline Packet16uc pinsertlast(const Packet16uc& a, uint8_t b) { return vsetq_lane_u8(b, a, 15); } +EIGEN_DEVICE_FUNC inline Packet4s pinsertlast(const Packet4s& a, int16_t b) { return vset_lane_s16(b, a, 3); } +EIGEN_DEVICE_FUNC inline Packet8s pinsertlast(const Packet8s& a, int16_t b) { return vsetq_lane_s16(b, a, 7); } +EIGEN_DEVICE_FUNC inline Packet4us pinsertlast(const Packet4us& a, uint16_t b) { return vset_lane_u16(b, a, 3); } +EIGEN_DEVICE_FUNC inline Packet8us pinsertlast(const Packet8us& a, uint16_t b) { return vsetq_lane_u16(b, a, 7); } +EIGEN_DEVICE_FUNC inline Packet2i pinsertlast(const Packet2i& a, int32_t b) { return vset_lane_s32(b, a, 1); } +EIGEN_DEVICE_FUNC inline Packet4i pinsertlast(const Packet4i& a, int32_t b) { return vsetq_lane_s32(b, a, 3); } +EIGEN_DEVICE_FUNC inline Packet2ui pinsertlast(const Packet2ui& a, uint32_t b) { return vset_lane_u32(b, a, 1); } +EIGEN_DEVICE_FUNC inline Packet4ui pinsertlast(const Packet4ui& a, uint32_t b) { return vsetq_lane_u32(b, a, 3); } +EIGEN_DEVICE_FUNC inline Packet2l pinsertlast(const Packet2l& a, int64_t b) { return vsetq_lane_s64(b, a, 1); } +EIGEN_DEVICE_FUNC inline Packet2ul pinsertlast(const Packet2ul& a, uint64_t b) { return vsetq_lane_u64(b, a, 1); } + /** * Computes the integer square root * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result @@ -3579,7 +3706,7 @@ template<> struct packet_traits : default_packet_traits HasReduxp = 1, HasDiv = 1, - HasFloor = 0, + HasFloor = 1, HasSin = 0, HasCos = 0, @@ -3639,6 +3766,18 @@ template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + /* perform a floorf */ + const Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a)); + + /* if greater, substract 1 */ + uint64x2_t mask = vcgtq_f64(tmp, a); + mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1)); + return vsubq_f64(tmp, vreinterpretq_f64_u64(mask)); +} + // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } @@ -3755,6 +3894,14 @@ ptranspose(PacketBlock& kernel) kernel.packet[0] = tmp1; kernel.packet[1] = tmp2; } + +template<> EIGEN_DEVICE_FUNC inline Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b) +{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); } + +EIGEN_DEVICE_FUNC inline Packet2d pinsertfirst(const Packet2d& a, double b) { return vsetq_lane_f64(b, a, 0); } + +EIGEN_DEVICE_FUNC inline Packet2d pinsertlast(const Packet2d& a, double b) { return vsetq_lane_f64(b, a, 1); } + #endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git 
a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 298088c2d..4aa98e370 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -14,23 +14,289 @@ namespace Eigen { namespace internal { +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; template<> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; template<> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ 
enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; template<> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; template<> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> 
struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2i& a) { return vcvt_f32_s32(a); } template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2ui& a) { return vcvt_f32_u32(a); } +template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2l& a) +{ return vcvt_f32_s32(vmovn_s64(a)); } +template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2ul& a) +{ return vcvt_f32_u32(vmovn_u64(a)); } +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4c& a) +{ return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))))); } +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4uc& a) +{ return vcvtq_f32_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))))); } +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4s& a) +{ return vcvtq_f32_s32(vmovl_s16(a)); } +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4us& a) +{ return vcvtq_f32_s32(vreinterpretq_s32_u32(vmovl_u16(a))); } template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { return vcvtq_f32_s32(a); } template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { return vcvtq_f32_u32(a); } +template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4f& a) +{ + const int16x4_t b = vmovn_s32(vcvtq_s32_f32(a)); + return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0); +} +template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) +{ return static_cast(a); } +template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4s& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(a, a))), 0); } +template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4us& a) +{ + const int16x4_t b = vreinterpret_s16_u16(a); + return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0); +} +template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4i& a) +{ + const int16x4_t b = vmovn_s32(a); + return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0); +} +template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4ui& a) +{ + const int16x4_t b = vmovn_s32(vreinterpretq_s32_u32(a)); + return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { return vreinterpret_s8_u8(a); } +template<> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8s& a) { return vmovn_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8us& a) +{ return vreinterpret_s8_u8(vmovn_u16(a)); } +template<> EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) +{ return vreinterpretq_s8_u8(a); } +template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4f& a) +{ + const uint16x4_t b = vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(a))); + return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0); +} +template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4i& a) +{ + const uint16x4_t b = vmovn_u32(vreinterpretq_u32_s32(a)); + return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0); +} +template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4ui& a) +{ + const uint16x4_t b = vmovn_u32(a); + return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0); +} +template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) +{ return static_cast(a); } +template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4s& a) +{ + const uint16x4_t b = 
vreinterpret_u16_s16(a); + return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0); +} +template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4us& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(a, a))), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { return vreinterpret_u8_s8(a); } +template<> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8s& a) +{ return vreinterpret_u8_s8(vmovn_s16(a)); } +template<> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8us& a) { return vmovn_u16(a); } +template<> EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) +{ return vreinterpretq_u8_s8(a); } +template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4f& a) +{ return vmovn_s32(vcvtq_s32_f32(a)); } +template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4c& a) +{ return vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))); } +template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4uc& a) +{ return vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))); } +template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) +{ return vreinterpret_s16_u16(a); } +template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4i& a) { return vmovn_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4ui& a) +{ return vmovn_s32(vreinterpretq_s32_u32(a)); } +template<> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8uc& a) +{ return vreinterpretq_s16_u16(vmovl_u8(a)); } +template<> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8c& a) { return vmovl_s8(a); } +template<> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) +{ return vreinterpretq_s16_u16(a); } +template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4f& a) +{ return vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(a))); } +template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4c& a) +{ return vget_low_u16(vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a))))); } +template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4uc& a) +{ return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a)))); } +template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) +{ return vreinterpret_u16_s16(a); } +template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4i& a) +{ return vmovn_u32(vreinterpretq_u32_s32(a)); } +template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4ui& a) { return vmovn_u32(a); } +template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8c& a) +{ return vreinterpretq_u16_s16(vmovl_s8(a)); } +template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8uc& a) { return vmovl_u8(a); } +template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) +{ return vreinterpretq_u16_s16(a); } template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2f& a) { return vcvt_s32_f32(a); } +template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) +{ return vreinterpret_s32_u32(a); } +template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2l& a) +{ return vmovn_s64(a); } +template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ul& a) +{ return vmovn_s64(vreinterpretq_s64_u64(a)); } template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { return vcvtq_s32_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4c& a) +{ return vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a))))); } +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4uc& a) +{ return 
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a)))))); } +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4s& a) { return vmovl_s16(a); } +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4us& a) +{ return vreinterpretq_s32_u32(vmovl_u16(a)); } +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) +{ return vreinterpretq_s32_u32(a); } template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2f& a) { return vcvt_u32_f32(a); } +template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) +{ return vreinterpret_u32_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2l& a) +{ return vmovn_u64(vreinterpretq_u64_s64(a)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2ul& a) +{ return vmovn_u64(a); } template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { return vcvtq_u32_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4c& a) +{ return vreinterpretq_u32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))))); } +template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4uc& a) +{ return vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))); } +template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4s& a) +{ return vreinterpretq_u32_s32(vmovl_s16(a)); } +template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4us& a) { return vmovl_u16(a); } +template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) +{ return vreinterpretq_u32_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2f& a) +{ return vmovl_s32(vcvt_s32_f32(a)); } +template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2i& a) +{ return vmovl_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ui& a) +{ return vreinterpretq_s64_u64(vmovl_u32(a)); } +template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) +{ return vreinterpretq_s64_u64(a); } +template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2f& a) +{ return vmovl_u32(vcvt_u32_f32(a)); } +template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2i& a) +{ return vreinterpretq_u64_s64(vmovl_s32(a)); } +template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2ui& a) +{ return vmovl_u32(a); } +template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) +{ return vreinterpretq_u64_s64(a); } template<> EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { return vreinterpret_f32_s32(a); } @@ -40,14 +306,70 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Pa { return vreinterpretq_f32_s32(a); } template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { return vreinterpretq_f32_u32(a); } +template<> EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) +{ return static_cast(a); } +template<> EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) +{ return vreinterpret_s8_u8(a); } +template<> EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) +{ return vreinterpretq_s8_u8(a); } +template<> EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) +{ return static_cast(a); } +template<> EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) +{ return vreinterpret_u8_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) +{ return vreinterpretq_u8_s8(a); } +template<> EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) +{ return vreinterpret_s16_u16(a); } +template<> EIGEN_STRONG_INLINE Packet8s preinterpret(const 
Packet8us& a) +{ return vreinterpretq_s16_u16(a); } +template<> EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) +{ return vreinterpret_u16_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) +{ return vreinterpretq_u16_s16(a); } template<> EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { return vreinterpret_s32_f32(a); } +template<> EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) +{ return vreinterpret_s32_u32(a); } template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { return vreinterpretq_s32_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) +{ return vreinterpretq_s32_u32(a); } template<> EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { return vreinterpret_u32_f32(a); } +template<> EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) +{ return vreinterpret_u32_s32(a); } template<> EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { return vreinterpretq_u32_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) +{ return vreinterpretq_u32_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) +{ return vreinterpretq_s64_u64(a); } +template<> EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) +{ return vreinterpretq_u64_s64(a); } + +#if EIGEN_ARCH_ARM64 + +template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2d& a) { return vcvt_f32_f64(a); } +template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2f& a) { return vcvt_f64_f32(a); } +template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2i& a) { return vcvtq_f64_s64(vmovl_s32(a)); } +template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ui& a) { return vcvtq_f64_u64(vmovl_u32(a)); } +template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2l& a) { return vcvtq_f64_s64(a); } +template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { return vcvtq_f64_u64(a); } +template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2d& a) { return vcvt_s32_f32(vcvt_f32_f64(a)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2d& a) { return vcvt_u32_f32(vcvt_f32_f64(a)); } +template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2d& a) { return vcvtq_s64_f64(a); } +template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2d& a) { return vcvtq_u64_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) +{ return vreinterpretq_f64_s64(a); } +template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) +{ return vreinterpretq_f64_u64(a); } +template<> EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) +{ return vreinterpretq_s64_f64(a); } +template<> EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) +{ return vreinterpretq_u64_f64(a); } + +#endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index b3b1b4854..a80395b68 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -45,7 +45,8 @@ template<> struct packet_traits > : default_packet_traits HasMin = 0, HasMax = 0, HasSetLinear = 0, - HasBlend = 1 + HasBlend = 1, + HasInsert = 1 }; }; #endif diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f624f2c9f..d04013e7e 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -120,6 +120,7 @@ struct packet_traits : 
default_packet_traits { HasTanh = EIGEN_FAST_MATH, HasErf = EIGEN_FAST_MATH, HasBlend = 1, + HasInsert = 1, HasFloor = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -144,7 +145,8 @@ struct packet_traits : default_packet_traits { HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1 + HasBlend = 1, + HasInsert = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 , diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 1ba15496b..00b1420b5 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -16,6 +16,58 @@ #define REF_DIV(a,b) ((a)/(b)) #define REF_ABS_DIFF(a,b) ((a)>(b)?(a)-(b):(b)-(a)) +template +struct test_cast_helper; + +template +struct test_cast_helper { + static void run() {} +}; + +template +struct test_cast_helper { + static void run() { + static const int PacketSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX FromScalar data1[PacketSize]; + EIGEN_ALIGN_MAX ToScalar data2[PacketSize]; + EIGEN_ALIGN_MAX ToScalar ref[PacketSize]; + + // Construct a packet of scalars that will not overflow when casting + for (int i=0; i::Random().value(); + const ToScalar to_scalar = Array::Random().value(); + const FromScalar c = sizeof(ToScalar) > sizeof(FromScalar) ? static_cast(to_scalar) : from_scalar; + data1[i] = (NumTraits::IsSigned && !NumTraits::IsSigned) ? numext::abs(c) : c; + } + + for (int i=0; i(data1[i]); + internal::pstore(data2, internal::pcast(internal::pload(data1))); + + VERIFY(areApprox(ref, data2, PacketSize) && "internal::pcast<>"); + } +}; + +template +void test_cast() { + typedef typename internal::packet_traits::type Full; + typedef typename internal::unpacket_traits::half Half; + typedef typename internal::unpacket_traits::half>::half Quarter; + + static const int PacketSize = internal::unpacket_traits::size; + static const bool CanCast = + PacketSize == internal::unpacket_traits::size || + PacketSize == internal::unpacket_traits::size || + PacketSize == internal::unpacket_traits::size; + + typedef typename internal::unpacket_traits::type FromScalar; + typedef typename internal::conditional::size == PacketSize, Quarter, + typename internal::conditional::size == PacketSize, Half, Full>::type>::type + ToPacket; + + test_cast_helper::run(); +} + template void packetmath() { typedef internal::packet_traits PacketTraits; @@ -263,7 +315,7 @@ template void packetmath() } } - if (PacketTraits::HasBlend || g_vectorize_sse) { + if (PacketTraits::HasInsert || g_vectorize_sse) { // pinsertfirst for (int i=0; i void packetmath() VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::pinsertfirst"); } - if (PacketTraits::HasBlend || g_vectorize_sse) { + if (PacketTraits::HasInsert || g_vectorize_sse) { // pinsertlast for (int i=0; i void packetmath_notcomplex() Array::Map(data1, PacketSize*4).setRandom(); + if (PacketTraits::HasCast) { + test_cast(); + test_cast(); + test_cast(); + test_cast(); + test_cast(); + test_cast(); + test_cast(); + test_cast(); + test_cast(); + test_cast(); + } + ref[0] = data1[0]; for (int i=0; i
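The Packet2d pfloor specialization added in the NEON PacketMath.h hunk uses the usual truncate-then-correct idiom: convert to integer and back (which rounds toward zero), then subtract one wherever the truncated value overshoots the input, i.e. for negative non-integers. A minimal scalar sketch of the same logic, assuming the input is finite and within int64_t range (the function name is illustrative only):

#include <cstdint>

double floor_by_truncation(double a) {
  // Rounds toward zero, mirroring vcvtq_s64_f64 followed by vcvtq_f64_s64.
  const double truncated = static_cast<double>(static_cast<int64_t>(a));
  // Truncation overshoots for negative non-integers (e.g. -1.5 becomes -1.0),
  // so subtract 1 whenever the truncated value ended up above the input.
  return (truncated > a) ? truncated - 1.0 : truncated;
}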