aboutsummaryrefslogtreecommitdiffhomepage
path: root/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-03-23 14:46:54 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-03-23 16:11:32 -0700
commite3c3fbf1a2339c99e6a35ee1d958e0f5a79de4f7 (patch)
tree4792acf6cec73f813747121dc639aeaa45b0d97c /third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
parent5b466775f9f06d6b2ca603c50ffa0cecae90ce8f (diff)
Internal change.
Change: 151064926
Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h')
-rw-r--r--third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h545
1 files changed, 545 insertions, 0 deletions
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
new file mode 100644
index 0000000000..7a222fddc1
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
@@ -0,0 +1,545 @@
+#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+
+#include "PacketMathAVX2.h"
+
+namespace Eigen {
+namespace internal {
+
+typedef struct Packet64q8i {
+ __m512i val;
+ operator __m512i() const { return val; }
+ Packet64q8i();
+ Packet64q8i(__m512i val) : val(val) {}
+} Packet64q8i;
+
+typedef struct Packet32q16i {
+ __m512i val;
+ operator __m512i() const { return val; }
+ Packet32q16i();
+ Packet32q16i(__m512i val) : val(val) {}
+} Packet32q16i;
+
+typedef struct Packet64q8u {
+ __m512i val;
+ operator __m512i() const { return val; }
+ Packet64q8u();
+ Packet64q8u(__m512i val) : val(val) {}
+} Packet64q8u;
+
+typedef struct Packet16q32i {
+ __m512i val;
+ operator __m512i() const { return val; }
+ Packet16q32i();
+ Packet16q32i(__m512i val) : val(val) {}
+} Packet16q32i;
+
+template <>
+struct packet_traits<QInt8> : default_packet_traits {
+ typedef Packet64q8i type;
+ typedef Packet32q8i half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 64,
+ };
+ enum {
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0
+ };
+};
+template <>
+struct packet_traits<QUInt8> : default_packet_traits {
+ typedef Packet64q8u type;
+ typedef Packet32q8u half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 64,
+ };
+ enum {
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0
+ };
+};
+template <>
+struct packet_traits<QInt16> : default_packet_traits {
+ typedef Packet32q16i type;
+ typedef Packet16q16i half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 32,
+ };
+ enum {
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0
+ };
+};
+template <>
+struct packet_traits<QInt32> : default_packet_traits {
+ typedef Packet16q32i type;
+ typedef Packet8q32i half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 16,
+ };
+ enum {
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0
+ };
+};
+
+template <>
+struct unpacket_traits<Packet64q8i> {
+ typedef QInt8 type;
+ typedef Packet32q8i half;
+ enum { size = 64, alignment=Aligned64 };
+};
+template <>
+struct unpacket_traits<Packet32q16i> {
+ typedef QInt16 type;
+ typedef Packet16q16i half;
+ enum { size = 32, alignment=Aligned64 };
+};
+template <>
+struct unpacket_traits<Packet64q8u> {
+ typedef QUInt8 type;
+ typedef Packet32q8u half;
+ enum { size = 64, alignment=Aligned64 };
+};
+template <>
+struct unpacket_traits<Packet16q32i> {
+ typedef QInt32 type;
+ typedef Packet8q32i half;
+ enum { size = 16, alignment=Aligned64 };
+};
+
+// Unaligned load
+template <>
+EIGEN_STRONG_INLINE Packet64q8i ploadu<Packet64q8i>(const QInt8* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
+ reinterpret_cast<const __m512i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32q16i ploadu<Packet32q16i>(const QInt16* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
+ reinterpret_cast<const __m512i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet64q8u ploadu<Packet64q8u>(const QUInt8* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
+ reinterpret_cast<const __m512i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q32i ploadu<Packet16q32i>(const QInt32* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
+ reinterpret_cast<const __m512i*>(from));
+}
+
+// Aligned load
+template <>
+EIGEN_STRONG_INLINE Packet64q8i pload<Packet64q8i>(const QInt8* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
+ reinterpret_cast<const __m512i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32q16i pload<Packet32q16i>(const QInt16* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
+ reinterpret_cast<const __m512i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet64q8u pload<Packet64q8u>(const QUInt8* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
+ reinterpret_cast<const __m512i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
+ reinterpret_cast<const __m512i*>(from));
+}
+
+// Unaligned store
+template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
+ reinterpret_cast<__m512i*>(to), from.val);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
+ reinterpret_cast<__m512i*>(to), from.val);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
+ reinterpret_cast<__m512i*>(to), from.val);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
+ reinterpret_cast<__m512i*>(to), from.val);
+}
+
+// Aligned store
+template <>
+EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
+ from.val);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
+ from.val);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
+ from.val);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
+ from.val);
+}
+
+// Extract first element.
+template <>
+EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
+ return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0));
+}
+template <>
+EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
+ return static_cast<uint8_t>(
+ _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0));
+}
+template <>
+EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
+ return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0);
+}
+template <>
+EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) {
+ return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0);
+}
+
+// Initialize to constant value.
+template <>
+EIGEN_STRONG_INLINE Packet64q8i pset1<Packet64q8i>(const QInt8& from) {
+ return _mm512_set1_epi8(from.value);
+}
+template <>
+EIGEN_STRONG_INLINE Packet32q16i pset1<Packet32q16i>(const QInt16& from) {
+ return _mm512_set1_epi16(from.value);
+}
+template <>
+EIGEN_STRONG_INLINE Packet64q8u pset1<Packet64q8u>(const QUInt8& from) {
+ return _mm512_set1_epi8(static_cast<uint8_t>(from.value));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) {
+ return _mm512_set1_epi32(from.value);
+}
+
+// Basic arithmetic packet ops for QInt32.
+template <>
+EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a,
+ const Packet16q32i& b) {
+ return _mm512_add_epi32(a.val, b.val);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a,
+ const Packet16q32i& b) {
+ return _mm512_sub_epi32(a.val, b.val);
+}
+// Note: mullo truncates the result to 32 bits.
+template <>
+EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a,
+ const Packet16q32i& b) {
+ return _mm512_mullo_epi32(a.val, b.val);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) {
+ return _mm512_sub_epi32(_mm512_setzero_si512(), a.val);
+}
+
+// Min and max.
+template <>
+EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a,
+ const Packet16q32i& b) {
+ return _mm512_min_epi32(a.val, b.val);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a,
+ const Packet16q32i& b) {
+ return _mm512_max_epi32(a.val, b.val);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a,
+ const Packet64q8u& b) {
+#ifdef EIGEN_VECTORIZE_AVX512BW
+ return _mm512_min_epu8(a.val, b.val);
+#else
+ __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
+ __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
+ __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
+ __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
+ __m256i r0 = _mm256_min_epu8(ap0, bp0);
+ __m256i r1 = _mm256_min_epu8(ap1, bp1);
+ return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a,
+ const Packet64q8u& b) {
+#ifdef EIGEN_VECTORIZE_AVX512BW
+ return _mm512_max_epu8(a.val, b.val);
+#else
+ __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
+ __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
+ __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
+ __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
+ __m256i r0 = _mm256_max_epu8(ap0, bp0);
+ __m256i r1 = _mm256_max_epu8(ap1, bp1);
+ return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a,
+ const Packet64q8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX512BW
+ return _mm512_min_epi8(a.val, b.val);
+#else
+ __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
+ __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
+ __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
+ __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
+ __m256i r0 = _mm256_min_epi8(ap0, bp0);
+ __m256i r1 = _mm256_min_epi8(ap1, bp1);
+ return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a,
+ const Packet32q16i& b) {
+#ifdef EIGEN_VECTORIZE_AVX512BW
+ return _mm512_min_epi16(a.val, b.val);
+#else
+ __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
+ __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
+ __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
+ __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
+ __m256i r0 = _mm256_min_epi16(ap0, bp0);
+ __m256i r1 = _mm256_min_epi16(ap1, bp1);
+ return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a,
+ const Packet64q8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX512BW
+ return _mm512_max_epi8(a.val, b.val);
+#else
+ __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
+ __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
+ __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
+ __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
+ __m256i r0 = _mm256_max_epi8(ap0, bp0);
+ __m256i r1 = _mm256_max_epi8(ap1, bp1);
+ return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
+ const Packet32q16i& b) {
+#ifdef EIGEN_VECTORIZE_AVX512BW
+ return _mm512_max_epi16(a.val, b.val);
+#else
+ __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0);
+ __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1);
+ __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0);
+ __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1);
+ __m256i r0 = _mm256_max_epi16(ap0, bp0);
+ __m256i r1 = _mm256_max_epi16(ap1, bp1);
+ return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
+#endif
+}
+
+// Reductions.
+template <>
+EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
+ Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
+ Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
+ Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
+ Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
+ Packet4i res =
+ _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
+ res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
+ return pfirst(
+ _mm_min_epi32(
+ res,
+ _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
+ Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
+ Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
+ Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
+ Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
+ Packet4i res =
+ _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
+ res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
+ return pfirst(
+ _mm_max_epi32(
+ res,
+ _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
+ Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
+ Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
+ Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
+ Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
+ Packet4i res =
+ _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
+ res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
+ std::uint32_t w =
+ pfirst(
+ _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+ return std::min({
+ static_cast<std::int16_t>(w >> 16),
+ static_cast<std::int16_t>(w)
+ });
+}
+template <>
+EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
+ Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
+ Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
+ Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
+ Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
+ Packet4i res =
+ _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
+ res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
+ std::uint32_t w =
+ pfirst(
+ _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+ return std::max({
+ static_cast<std::int16_t>(w >> 16),
+ static_cast<std::int16_t>(w)
+ });
+}
+template <>
+EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
+ Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
+ Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
+ Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
+ Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
+ Packet4i res =
+ _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
+ res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
+ std::uint32_t w =
+ pfirst(
+ _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+ return std::min({
+ static_cast<std::uint8_t>(w >> 24),
+ static_cast<std::uint8_t>(w >> 16),
+ static_cast<std::uint8_t>(w >> 8),
+ static_cast<std::uint8_t>(w)
+ });
+}
+template <>
+EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
+ Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
+ Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
+ Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
+ Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
+ Packet4i res =
+ _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
+ res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
+ std::uint32_t w =
+ pfirst(
+ _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+ return std::max({
+ static_cast<std::uint8_t>(w >> 24),
+ static_cast<std::uint8_t>(w >> 16),
+ static_cast<std::uint8_t>(w >> 8),
+ static_cast<std::uint8_t>(w)
+ });
+}
+template <>
+EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
+ Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
+ Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
+ Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
+ Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
+ Packet4i res =
+ _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
+ res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
+ std::uint32_t w =
+ pfirst(
+ _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+ return std::min({
+ static_cast<std::int8_t>(w >> 24),
+ static_cast<std::int8_t>(w >> 16),
+ static_cast<std::int8_t>(w >> 8),
+ static_cast<std::int8_t>(w)
+ });
+}
+template <>
+EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
+ Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0);
+ Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1);
+ Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2);
+ Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3);
+ Packet4i res =
+ _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
+ res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
+ std::uint32_t w =
+ pfirst(
+ _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+ return std::min({
+ static_cast<std::int8_t>(w >> 24),
+ static_cast<std::int8_t>(w >> 16),
+ static_cast<std::int8_t>(w >> 8),
+ static_cast<std::int8_t>(w)
+ });
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_