path: root/Eigen/src/Core/arch/SSE
author    Rasmus Munk Larsen <rmlarsen@google.com>  2020-05-11 13:23:31 -0700
committer Rasmus Munk Larsen <rmlarsen@google.com>  2020-05-14 22:39:13 +0000
commit    9b411757abd8458f9689b1384c6bf75da9b82357 (patch)
tree      eb23c0c65eaa4df204880c94b287fcf1b43e14cd /Eigen/src/Core/arch/SSE
parent    d640276d31a7dea9207a68a061a6fa7c9fdf50e5 (diff)
Add missing packet ops for bool, and make it pass the same packet op unit tests as other arithmetic types.
This change also contains a few minor cleanups:

1. Remove the packet op pnot, which was only needed for pcmp_le_or_nan and can be implemented in other ways.
2. Remove the "HasInsert" enum, which is no longer needed since we removed the corresponding packet ops.
3. Add a faster pselect op for Packet4i when SSE4.1 is supported.

Among other things, this makes the fast transposeInPlace() method available for Matrix<bool>.

Run on ************** (72 X 2994 MHz CPUs); 2020-05-09T10:51:02.372347913-07:00
CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB

Benchmark                             Time(ns)    CPU(ns)   Iterations
-----------------------------------------------------------------------
BM_TransposeInPlace<float>/4              9.77       9.77     71670320
BM_TransposeInPlace<float>/8              21.9       21.9     31929525
BM_TransposeInPlace<float>/16             66.6       66.6     10000000
BM_TransposeInPlace<float>/32              243        243      2879561
BM_TransposeInPlace<float>/59              844        844       829767
BM_TransposeInPlace<float>/64              933        933       750567
BM_TransposeInPlace<float>/128            3944       3945       177405
BM_TransposeInPlace<float>/256           16853      16853        41457
BM_TransposeInPlace<float>/512          204952     204968         3448
BM_TransposeInPlace<float>/1k          1053889    1053861          664
BM_TransposeInPlace<bool>/4               14.4       14.4     48637301
BM_TransposeInPlace<bool>/8               36.0       36.0     19370222
BM_TransposeInPlace<bool>/16              31.5       31.5     22178902
BM_TransposeInPlace<bool>/32               111        111      6272048
BM_TransposeInPlace<bool>/59               626        626      1000000
BM_TransposeInPlace<bool>/64               428        428      1632689
BM_TransposeInPlace<bool>/128             1677       1677       417377
BM_TransposeInPlace<bool>/256             7126       7126        96264
BM_TransposeInPlace<bool>/512            29021      29024        24165
BM_TransposeInPlace<bool>/1k            116321     116330         6068
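For illustration (not part of the patch), a minimal usage sketch of the newly vectorized path; the matrix size and names are arbitrary:

  #include <Eigen/Dense>
  #include <iostream>

  int main() {
    using MatrixXb = Eigen::Matrix<bool, Eigen::Dynamic, Eigen::Dynamic>;
    // Build a square boolean matrix from a float comparison.
    MatrixXb m = (Eigen::MatrixXf::Random(64, 64).array() > 0.0f).matrix();
    MatrixXb expected = m.transpose();
    m.transposeInPlace();  // for Matrix<bool> this now takes the fast Packet16b path
    std::cout << ((m.array() == expected.array()).all() ? "transpose OK" : "mismatch") << "\n";
    return 0;
  }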
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h      5
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h  175
2 files changed, 163 insertions, 17 deletions
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 8bf8bfe85..0d322a2a1 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -45,8 +45,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasMin = 0,
HasMax = 0,
HasSetLinear = 0,
- HasBlend = 1,
- HasInsert = 1
+ HasBlend = 1
};
};
#endif
@@ -84,7 +83,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
}
template<> EIGEN_STRONG_INLINE Packet2cf ptrue <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnot <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); }
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
@@ -292,7 +290,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
}
template<> EIGEN_STRONG_INLINE Packet1cd ptrue <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnot <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
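The pnot removals above are possible because a bitwise NOT is just XOR with an all-ones mask, so pnot(a) can always be expressed as pxor(a, ptrue(a)). A scalar sketch of that identity (illustrative only, not SSE code):

  #include <cassert>
  #include <cstdint>

  int main() {
    // For any bit pattern a, ~a == a ^ all_ones; this is why a dedicated
    // pnot packet op is redundant once pxor and ptrue are available.
    std::uint32_t a = 0xDEADBEEFu;
    std::uint32_t all_ones = ~0u;      // what ptrue yields in every lane
    assert((a ^ all_ones) == ~a);      // pxor(a, ptrue(a)) behaves like pnot(a)
    return 0;
  }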
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 645aee0cd..9f2e922a2 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -108,7 +108,6 @@ struct packet_traits<float> : default_packet_traits {
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
HasBlend = 1,
- HasInsert = 1,
HasFloor = 1
#ifdef EIGEN_VECTORIZE_SSE4_1
@@ -133,8 +132,7 @@ struct packet_traits<double> : default_packet_traits {
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
- HasBlend = 1,
- HasInsert = 1
+ HasBlend = 1
#ifdef EIGEN_VECTORIZE_SSE4_1
,
@@ -171,10 +169,10 @@ template<> struct packet_traits<bool> : default_packet_traits
size=16,
HasAdd = 1,
- HasSub = 0,
+ HasSub = 1,
HasShift = 0,
HasMul = 1,
- HasNegate = 0,
+ HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -254,6 +252,7 @@ template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, con
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
@@ -270,6 +269,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
}
+template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
+{
+ return psub(pset1<Packet16b>(false), a);
+}
+
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
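Per element, the two hunks above implement mod-2 arithmetic: subtraction on bool is XOR (hence psub<Packet16b> is _mm_xor_si128), and negation, defined as psub(pset1<Packet16b>(false), a), is therefore the identity. A scalar sketch of those semantics (illustrative only, not SSE code):

  #include <cassert>

  int main() {
    for (int a = 0; a <= 1; ++a) {
      for (int b = 0; b <= 1; ++b) {
        bool diff = static_cast<bool>(a ^ b);  // what psub<Packet16b> computes per element
        assert(diff == (a != b));
      }
      bool neg = static_cast<bool>(0 ^ a);     // pnegate(a) == psub(false, a)
      assert(neg == static_cast<bool>(a));
    }
    return 0;
  }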
@@ -305,11 +309,29 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d&
#endif
#ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { return _mm_blendv_ps(b,a,mask); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+ return _mm_blendv_ps(b,a,mask);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+}
template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b,a,mask); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+ return _mm_blendv_epi8(b,a,mask);
+}
+#else
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+ Packet16b a_part = _mm_and_si128(mask, a);
+ Packet16b b_part = _mm_andnot_si128(mask, b);
+ return _mm_or_si128(a_part, b_part);
+}
#endif
+
+
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
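The pre-SSE4.1 pselect fallback above is the classic and/andnot/or bit-select: when every lane of the mask is all-ones or all-zeros, (mask & a) | (~mask & b) yields a where the mask is set and b elsewhere. A scalar sketch of that identity (illustrative only, not SSE code):

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  int main() {
    std::uint32_t a = 0x12345678u, b = 0x9ABCDEF0u;
    for (std::uint32_t mask : {0x00000000u, 0xFFFFFFFFu}) {
      std::uint32_t sel = (mask & a) | (~mask & b);  // andnot-based select
      assert(sel == (mask ? a : b));
    }
    return 0;
  }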
@@ -567,6 +589,23 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
+// Loads 8 bools from memory and returns the packet
+// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
+template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from)
+{
+ __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
+ return _mm_unpacklo_epi8(tmp, tmp);
+}
+
+// Loads 4 bools from memory and returns the packet
+// {b0, b0, b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
+template<> EIGEN_STRONG_INLINE Packet16b
+ploadquad<Packet16b>(const bool* from) {
+ __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
+ tmp = _mm_unpacklo_epi8(tmp, tmp);
+ return _mm_unpacklo_epi16(tmp, tmp);
+}
+
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
@@ -588,6 +627,14 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const dou
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
+{
+ return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
+ from[11*stride], from[10*stride], from[9*stride], from[8*stride],
+ from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+ from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
@@ -609,6 +656,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
+{
+ to[4*stride*0] = _mm_cvtsi128_si32(from);
+ to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+ to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+ to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
@@ -653,12 +708,19 @@ template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { retu
template<> EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
#endif
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{ return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{ return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
+#ifdef EIGEN_VECTORIZE_SSSE3
+ __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ return _mm_shuffle_epi8(a, mask);
+#else
+ Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+ tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
+ return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
@@ -777,7 +839,7 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
#endif
template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
-Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+ Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
}
@@ -804,6 +866,12 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
+template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
+ Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
+ return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
+ (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
+}
+
// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
@@ -904,6 +972,87 @@ ptranspose(PacketBlock<Packet16b,4>& kernel) {
kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
}
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,16>& kernel) {
+ // If we number the elements in the input thus:
+ // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
+ // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
+ // ...
+ // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
+ //
+ // the desired output is:
+ // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
+ // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
+ // ...
+ // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
+ __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+ __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37
+ __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f
+ __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57
+ __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
+ __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
+ __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
+ __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
+ __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
+ __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
+ __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
+ __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
+ __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
+ __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
+ __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
+
+ __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
+ __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
+ __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
+ __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
+ __m128i s6 = _mm_unpacklo_epi16(t5, t7);
+ __m128i s7 = _mm_unpackhi_epi16(t5, t7);
+ __m128i s8 = _mm_unpacklo_epi16(t8, ta);
+ __m128i s9 = _mm_unpackhi_epi16(t8, ta);
+ __m128i sa = _mm_unpacklo_epi16(t9, tb);
+ __m128i sb = _mm_unpackhi_epi16(t9, tb);
+ __m128i sc = _mm_unpacklo_epi16(tc, te);
+ __m128i sd = _mm_unpackhi_epi16(tc, te);
+ __m128i se = _mm_unpacklo_epi16(td, tf);
+ __m128i sf = _mm_unpackhi_epi16(td, tf);
+
+ __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ __m128i u2 = _mm_unpacklo_epi32(s1, s5);
+ __m128i u3 = _mm_unpackhi_epi32(s1, s5);
+ __m128i u4 = _mm_unpacklo_epi32(s2, s6);
+ __m128i u5 = _mm_unpackhi_epi32(s2, s6);
+ __m128i u6 = _mm_unpacklo_epi32(s3, s7);
+ __m128i u7 = _mm_unpackhi_epi32(s3, s7);
+ __m128i u8 = _mm_unpacklo_epi32(s8, sc);
+ __m128i u9 = _mm_unpackhi_epi32(s8, sc);
+ __m128i ua = _mm_unpacklo_epi32(s9, sd);
+ __m128i ub = _mm_unpackhi_epi32(s9, sd);
+ __m128i uc = _mm_unpacklo_epi32(sa, se);
+ __m128i ud = _mm_unpackhi_epi32(sa, se);
+ __m128i ue = _mm_unpacklo_epi32(sb, sf);
+ __m128i uf = _mm_unpackhi_epi32(sb, sf);
+
+ kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
+ kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
+ kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
+ kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
+ kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
+ kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
+ kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
+ kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
+ kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
+ kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
+ kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
+ kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
+ kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
+ kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
+ kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
+ kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
+}
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
const __m128i zero = _mm_setzero_si128();