Diffstat (limited to 'Eigen/src')
 Eigen/src/Core/Block.h                |  1
 Eigen/src/Core/CacheFriendlyProduct.h | 89
 Eigen/src/Core/DummyPacketMath.h      | 16
 Eigen/src/Core/Product.h              |  4
 Eigen/src/Core/arch/SSE/PacketMath.h  | 81
5 files changed, 161 insertions, 30 deletions
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 62617172e..c3174073b 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -274,7 +274,6 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixTy
     inline const Scalar coeff(int row, int col) const
     {
-//       std::cerr << "coeff(int row, int col)\n";
       if (IsRowMajor)
         return m_data_ptr[col + row * stride()];
       else
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index a9ef66cdd..b51b50ac3 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -390,7 +390,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   int alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==2 ? EvenAligned
+                       : alignmentStep==(PacketSize/2) ? EvenAligned
                        : FirstAligned;

   // we cannot assume the first element is aligned because of sub-matrices
@@ -432,7 +432,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   if (PacketSize>1)
   {
     /* explicit vectorization */
-    // process initial unaligned coeffs
     for (int j=0; j<alignedStart; j++)
       res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] +
                 ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
@@ -452,26 +451,41 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
       case FirstAligned:
         if(peels>1)
         {
-          // NOTE peeling with two _EIGEN_ACCUMULATE_PACKETS() is much less efficient
-          // than the following code
-          asm("#mybegin");
           Packet A00, A01, A02, A03, A10, A11, A12, A13;
+          if (alignmentStep==1)
+          {
+            // flip columns #1 and #3: their alignment offsets are swapped when alignmentStep==1
+            A00 = ptmp1; ptmp1 = ptmp3; ptmp3 = A00;
+            const Scalar* aux = lhs1;
+            lhs1 = lhs3; lhs3 = aux;
+          }
+          A01 = ei_pload(&lhs1[alignedStart-1]);
+          A02 = ei_pload(&lhs2[alignedStart-2]);
+          A03 = ei_pload(&lhs3[alignedStart-3]);
+
           for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
           {
-            A01 = ei_ploadu(&lhs1[j]); A11 = ei_ploadu(&lhs1[j+PacketSize]);
-            A02 = ei_ploadu(&lhs2[j]); A12 = ei_ploadu(&lhs2[j+PacketSize]);
-            A00 = ei_pload (&lhs0[j]); A10 = ei_pload (&lhs0[j+PacketSize]);
+            A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
+            A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
+            A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
+            A00 = ei_pload (&lhs0[j]);
+            A10 = ei_pload (&lhs0[j+PacketSize]);
             A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j]));
             A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize]));
-            A00 = ei_pmadd(ptmp1, A01, A00); A10 = ei_pmadd(ptmp1, A11, A10);
-            A03 = ei_ploadu(&lhs3[j]); A13 = ei_ploadu(&lhs3[j+PacketSize]);
-            A00 = ei_pmadd(ptmp2, A02, A00); A10 = ei_pmadd(ptmp2, A12, A10);
-            A00 = ei_pmadd(ptmp3, A03, A00); A10 = ei_pmadd(ptmp3, A13, A10);
-            ei_pstore(&res[j],A00); ei_pstore(&res[j+PacketSize],A10);
+            A00 = ei_pmadd(ptmp1, A01, A00);
+            A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
+            A00 = ei_pmadd(ptmp2, A02, A00);
+            A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
+            A00 = ei_pmadd(ptmp3, A03, A00);
+            ei_pstore(&res[j],A00);
+            A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
+            A10 = ei_pmadd(ptmp1, A11, A10);
+            A10 = ei_pmadd(ptmp2, A12, A10);
+            A10 = ei_pmadd(ptmp3, A13, A10);
+            ei_pstore(&res[j+PacketSize],A10);
           }
-          asm("#myend");
         }
         for (int j = peeledSize; j<alignedSize; j+=PacketSize)
           _EIGEN_ACCUMULATE_PACKETS(,u,u,);
@@ -532,7 +546,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   #undef _EIGEN_ACCUMULATE_PACKETS
 }

-// TODO add peeling to mask unaligned load/stores
 template<typename Scalar, typename ResType>
 EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
@@ -570,7 +583,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
   const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   int alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==2 ? EvenAligned
+                       : alignmentStep==(PacketSize/2) ? EvenAligned
                        : FirstAligned;

   // we cannot assume the first element is aligned because of sub-matrices
@@ -636,27 +649,51 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
       case FirstAligned:
         if (peels>1)
         {
-          Packet A01, A02, A03, b;
+          /* Here we process 4 rows with two peeled iterations to hide
+           * the overhead of unaligned loads. Moreover, unaligned loads are
+           * handled using special shift/move operations between the two
+           * aligned packets overlapping the desired unaligned packet. This
+           * is *much* more efficient than basic unaligned loads.
+           */
+          Packet A01, A02, A03, b, A11, A12, A13;
+          if (alignmentStep==1)
+          {
+            // flip rows #1 and #3
+            b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
+            const Scalar* aux = lhs1;
+            lhs1 = lhs3; lhs3 = aux;
+          }
+          A01 = ei_pload(&lhs1[alignedStart-1]);
+          A02 = ei_pload(&lhs2[alignedStart-2]);
+          A03 = ei_pload(&lhs3[alignedStart-3]);
+
           for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
           {
             b = ei_pload(&rhs[j]);
-            A01 = ei_ploadu(&lhs1[j]);
-            A02 = ei_ploadu(&lhs2[j]);
-            A03 = ei_ploadu(&lhs3[j]);
+            A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
+            A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
+            A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
             ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j]), ptmp0);
             ptmp1 = ei_pmadd(b, A01, ptmp1);
-            A01 = ei_ploadu(&lhs1[j+PacketSize]);
+            A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
             ptmp2 = ei_pmadd(b, A02, ptmp2);
-            A02 = ei_ploadu(&lhs2[j+PacketSize]);
+            A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
             ptmp3 = ei_pmadd(b, A03, ptmp3);
-            A03 = ei_ploadu(&lhs3[j+PacketSize]);
+            A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
             b = ei_pload(&rhs[j+PacketSize]);
             ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j+PacketSize]), ptmp0);
-            ptmp1 = ei_pmadd(b, A01, ptmp1);
-            ptmp2 = ei_pmadd(b, A02, ptmp2);
-            ptmp3 = ei_pmadd(b, A03, ptmp3);
+            ptmp1 = ei_pmadd(b, A11, ptmp1);
+            ptmp2 = ei_pmadd(b, A12, ptmp2);
+            ptmp3 = ei_pmadd(b, A13, ptmp3);
+          }
+          if (alignmentStep==1)
+          {
+            // restore rows #1 and #3
+            b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
+            const Scalar* aux = lhs1;
+            lhs1 = lhs3; lhs3 = aux;
+          }
         }
         for (int j = peeledSize; j<alignedSize; j+=PacketSize)
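The hunks above replace the ei_ploadu() calls with pairs of aligned loads merged by ei_palign<Offset>(). The idea: when lhsStride is not a multiple of PacketSize, columns 1-3 of the lhs start at different offsets within a 16-byte lane, so each "unaligned" packet can instead be rebuilt from the two aligned packets overlapping it. A minimal scalar model of that merge (the names and the fixed PacketSize of 4 are mine, not Eigen's):

#include <cassert>

// Scalar stand-in for a 4-wide packet; palign_model<Offset> mimics
// ei_palign<Offset>(first, second): keep the last 4-Offset elements of
// `first` and append the first Offset elements of `second`.
template<int Offset>
static void palign_model(float first[4], const float second[4])
{
    float out[4];
    for (int i = 0; i < 4; ++i)
        out[i] = (i + Offset < 4) ? first[i + Offset] : second[i + Offset - 4];
    for (int i = 0; i < 4; ++i)
        first[i] = out[i];
}

int main()
{
    // A column whose data starts 1 element past a 4-element boundary:
    // the "unaligned packet" col[1..4] is rebuilt from the two aligned
    // packets col[0..3] and col[4..7].
    float col[8] = {0,1,2,3,4,5,6,7};
    float lo[4]  = {col[0],col[1],col[2],col[3]};
    float hi[4]  = {col[4],col[5],col[6],col[7]};
    palign_model<1>(lo, hi);                         // lo now holds col[1..4]
    for (int i = 0; i < 4; ++i) assert(lo[i] == col[i+1]);
    return 0;
}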
diff --git a/Eigen/src/Core/DummyPacketMath.h b/Eigen/src/Core/DummyPacketMath.h
index 5aff21fe4..4abd6e997 100644
--- a/Eigen/src/Core/DummyPacketMath.h
+++ b/Eigen/src/Core/DummyPacketMath.h
@@ -134,5 +134,21 @@ inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
          : 0;
 }

+/** \internal default implementation of ei_palign(), overridden by architecture specific specializations */
+template<int Offset,typename PacketType>
+struct ei_palign_impl
+{
+  // by default data are aligned, so there is nothing to be done :)
+  inline static void run(PacketType& first, const PacketType& second) {}
+};
+
+/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
+  * of \a first and the \a Offset first elements of \a second */
+template<int Offset,typename PacketType>
+inline void ei_palign(PacketType& first, const PacketType& second)
+{
+  ei_palign_impl<Offset,PacketType>::run(first,second);
+}
+
 #endif // EIGEN_DUMMY_PACKET_MATH_H
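Why both kernels flip lhs1 and lhs3 when alignmentStep==1 follows from simple modular arithmetic: column (or row) c starts c*lhsStride elements after the first one, so its offset within a 4-float lane is (c*lhsStride) % 4. The peeled loops hard-code ei_palign<1>, <2>, <3> for c = 1, 2, 3, which assumes offsets 1, 2, 3; with lhsStride % 4 == 3 (i.e. alignmentStep == 1) the offsets come out 3, 2, 1 instead, and swapping #1 and #3 restores the expected pattern. A quick standalone check of that arithmetic (PacketSize assumed to be 4):

#include <cstdio>

int main()
{
    const int PacketSize = 4, PacketAlignedMask = PacketSize - 1;
    for (int mod = 1; mod <= 3; mod += 2)   // lhsStride % 4 == 1 or 3
    {
        int lhsStride = 100 + mod;          // any stride with this remainder
        int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask;
        std::printf("stride%%4=%d -> alignmentStep=%d, offsets of columns 1..3:", mod, alignmentStep);
        for (int c = 1; c <= 3; ++c)
            std::printf(" %d", (c * lhsStride) % PacketSize);  // prints 1 2 3, or 3 2 1
        std::printf("\n");
    }
    return 0;
}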
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 60a1d0a35..8daedcc7d 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -111,8 +111,8 @@ template<typename Lhs, typename Rhs> struct ei_product_mode
  * This class represents an expression of the product of two matrices.
  * It is the return type of the operator* between matrices. Its template
  * arguments are determined automatically by ProductReturnType. Therefore,
- * Product should be used direclty. To determine the result type of a function
- * which involve a matrix product, use ProductReturnType::Type.
+ * Product should never be used directly. To determine the result type of a
+ * function which involves a matrix product, use ProductReturnType::Type.
  *
  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
  */
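The corrected sentence is easy to illustrate. A function that returns a product expression should spell its type via ProductReturnType rather than Product<> itself; a hypothetical helper, sketched against the API as documented here:

#include <Eigen/Core>

// Sketch only: the expression type of a*b is named through
// ProductReturnType<...>::Type, never by writing Product<...> directly.
template<typename Derived1, typename Derived2>
const typename Eigen::ProductReturnType<Derived1, Derived2>::Type
product(const Eigen::MatrixBase<Derived1>& a, const Eigen::MatrixBase<Derived2>& b)
{
    return a * b;  // same expression type as the declared return type
}

int main()
{
    Eigen::Matrix3f a = Eigen::Matrix3f::Identity(), b = a;
    Eigen::Matrix3f c = product(a, b);  // the expression evaluates on assignment
    return 0;
}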
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 28825ceee..07aa84d69 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -90,6 +90,12 @@ template<> inline __m128d ei_pload(const double* from) { return _mm_load_pd(fro
 template<> inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }

 template<> inline __m128  ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
+// template<> inline __m128  ei_ploadu(const float* from) {
+//   if (size_t(from)&0xF)
+//     return _mm_loadu_ps(from);
+//   else
+//     return _mm_loadu_ps(from);
+// }
 template<> inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); }
 template<> inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }

@@ -198,7 +204,80 @@ inline __m128i ei_preduxp(const __m128i* vecs)
 //   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
 //   return res;
 // }
+// inline __m128i _mm_alignr_epi8(const __m128i& a, const __m128i& b, const int i)
+// {
+//   __m128i res = a;
+//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
+//   return res;
+// }
 #endif

-#endif // EIGEN_PACKET_MATH_SSE_H
+#ifdef __SSSE3__
+// SSSE3 versions
+template<int Offset>
+struct ei_palign_impl<Offset,__m128>
+{
+  inline static void run(__m128& first, const __m128& second)
+  {
+    // palignr concatenates (second:first) and shifts right by Offset elements
+    first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
+  }
+};
+
+template<int Offset>
+struct ei_palign_impl<Offset,__m128i>
+{
+  inline static void run(__m128i& first, const __m128i& second)
+  {
+    first = _mm_alignr_epi8(second, first, Offset*4);
+  }
+};
+#else
+// SSE2 versions
+template<int Offset>
+struct ei_palign_impl<Offset,__m128>
+{
+  inline static void run(__m128& first, const __m128& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
+    }
+    else if (Offset==2)
+    {
+      first = _mm_movehl_ps(first,first);
+      first = _mm_movelh_ps(first,second);
+    }
+    else if (Offset==3)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_shuffle_ps(first,second,0x93);
+    }
+  }
+};
+
+template<int Offset>
+struct ei_palign_impl<Offset,__m128i>
+{
+  inline static void run(__m128i& first, const __m128i& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_shuffle_epi32(first,0x39);
+    }
+    else if (Offset==2)
+    {
+      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
+      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+    }
+    else if (Offset==3)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
+    }
+  }
+};
+#endif
+
+#endif // EIGEN_PACKET_MATH_SSE_H
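The SSE2 fallbacks are easy to misread, so the following small self-test (the harness is mine, not part of the patch; compile with -msse2) confirms that each Offset case produces the intended "last 4-Offset elements of first, first Offset elements of second" result, i.e. element i of the output is first[i+Offset] when i+Offset < 4 and second[i+Offset-4] otherwise. With SSSE3 enabled, a single palignr yields the same result.

#include <xmmintrin.h>
#include <emmintrin.h>
#include <cassert>

// Check r against the reference semantics of ei_palign<offset>(first, second).
static void check(__m128 r, const float* f, const float* s, int offset)
{
    float out[4];
    _mm_storeu_ps(out, r);
    for (int i = 0; i < 4; ++i)
        assert(out[i] == (i + offset < 4 ? f[i + offset] : s[i + offset - 4]));
}

int main()
{
    const float f[4] = {0,1,2,3}, s[4] = {4,5,6,7};
    __m128 a;

    a = _mm_loadu_ps(f);                          // Offset == 1 -> {1,2,3,4}
    a = _mm_move_ss(a, _mm_loadu_ps(s));
    a = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), 0x39));
    check(a, f, s, 1);

    a = _mm_loadu_ps(f);                          // Offset == 2 -> {2,3,4,5}
    a = _mm_movehl_ps(a, a);
    a = _mm_movelh_ps(a, _mm_loadu_ps(s));
    check(a, f, s, 2);

    a = _mm_loadu_ps(f);                          // Offset == 3 -> {3,4,5,6}
    a = _mm_move_ss(a, _mm_loadu_ps(s));
    a = _mm_shuffle_ps(a, _mm_loadu_ps(s), 0x93);
    check(a, f, s, 3);

    return 0;
}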