diff options
author | 2008-07-19 00:09:01 +0000 | |
---|---|---|
committer | 2008-07-19 00:09:01 +0000 | |
commit | 22a816ade884e3eee6a2f6a25f342e6a0d3448a0 (patch) | |
tree | becfc9016d9a7bdffdb30d15f99bf8aeed842a77 /Eigen/src | |
parent | 62ec1dd616377721d8be414911721bdc8967d677 (diff) |
* Fix a couple of issues related to the recent cache friendly products
* Improve the efficiency of matrix*vector in unaligned cases
* Trivial fixes in the destructors of MatrixStorage
* Removed the matrixNorm in test/product.cpp (twice faster and
that assumed the matrix product was ok while checking that !!)
Diffstat (limited to 'Eigen/src')
-rw-r--r-- | Eigen/src/Core/CacheFriendlyProduct.h | 112 | ||||
-rw-r--r-- | Eigen/src/Core/DummyPacketMath.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/Map.h | 36 | ||||
-rw-r--r-- | Eigen/src/Core/Matrix.h | 3 | ||||
-rw-r--r-- | Eigen/src/Core/MatrixStorage.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/Product.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 9 |
7 files changed, 82 insertions, 90 deletions
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index 669649de8..83f5da558 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -381,14 +381,14 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( ei_pstore(&res[j OFFSET], \ ei_padd(ei_pload(&res[j OFFSET]), \ ei_padd( \ - ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs[j OFFSET +iN0])),ei_pmul(ptmp1,ei_pload ## A13(&lhs[j OFFSET +iN1]))), \ - ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs[j OFFSET +iN2])),ei_pmul(ptmp3,ei_pload ## A13(&lhs[j OFFSET +iN3]))) ))) + ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs0[j OFFSET])),ei_pmul(ptmp1,ei_pload ## A13(&lhs1[j OFFSET]))), \ + ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs2[j OFFSET])),ei_pmul(ptmp3,ei_pload ## A13(&lhs3[j OFFSET]))) ))) asm("#begin matrix_vector_product"); typedef typename ei_packet_traits<Scalar>::type Packet; const int PacketSize = sizeof(Packet)/sizeof(Scalar); - enum { AllAligned, EvenAligned, FirstAligned, NoneAligned }; + enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; const int columnsAtOnce = 4; const int peels = 2; const int PacketAlignedMask = PacketSize-1; @@ -401,9 +401,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( ? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size) : 0; const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask); - const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0; + const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const int alignmentStep = lhsStride % PacketSize; + const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask; int alignmentPattern = alignmentStep==0 ? AllAligned : alignmentStep==2 ? EvenAligned : FirstAligned; @@ -423,17 +423,17 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( skipColumns = std::min(skipColumns,rhs.size()); // note that the skiped columns are processed later. } - - int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce; + int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (int i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Scalar tmp0 = rhs[i], tmp1 = rhs[i+1], tmp2 = rhs[i+2], tmp3 = rhs[i+3]; - Packet ptmp0 = ei_pset1(tmp0), ptmp1 = ei_pset1(tmp1), ptmp2 = ei_pset1(tmp2), ptmp3 = ei_pset1(tmp3); - int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride; + Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+1]), + ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+3]); + const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride; // process initial unaligned coeffs for (int j=0; j<alignedStart; j++) - res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3]; + res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j]; if (alignedSize>alignedStart) { @@ -448,14 +448,29 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( _EIGEN_ACCUMULATE_PACKETS(,u,,); break; case FirstAligned: - if (peels>1) + if(peels>1) + { + // NOTE peeling with two _EIGEN_ACCUMULATE_PACKETS() is much less efficient + // than the following code + asm("#mybegin"); + Packet A00, A01, A02, A03, A10, A11, A12, A13; for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize) { - _EIGEN_ACCUMULATE_PACKETS(,u,u,); - _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize); - if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize); - if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize); + A01 = ei_ploadu(&lhs1[j]); A11 = ei_ploadu(&lhs1[j+PacketSize]); + A02 = ei_ploadu(&lhs2[j]); A12 = ei_ploadu(&lhs2[j+PacketSize]); + A00 = ei_pload (&lhs0[j]); A10 = ei_pload (&lhs0[j+PacketSize]); + + A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j])); + A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize])); + + A00 = ei_pmadd(ptmp1, A01, A00); A10 = ei_pmadd(ptmp1, A11, A10); + A03 = ei_ploadu(&lhs3[j]); A13 = ei_ploadu(&lhs3[j+PacketSize]); + A00 = ei_pmadd(ptmp2, A02, A00); A10 = ei_pmadd(ptmp2, A12, A10); + A00 = ei_pmadd(ptmp3, A03, A00); A10 = ei_pmadd(ptmp3, A13, A10); + ei_pstore(&res[j],A00); ei_pstore(&res[j+PacketSize],A10); } + asm("#myend"); + } for (int j = peeledSize; j<alignedSize; j+=PacketSize) _EIGEN_ACCUMULATE_PACKETS(,u,u,); break; @@ -468,7 +483,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( // process remaining coeffs for (int j=alignedSize; j<size; j++) - res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3]; + res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j]; } // process remaining first and last columns (at most columnsAtOnce-1) @@ -476,26 +491,25 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( int start = columnBound; do { - for (int i=columnBound; i<end; i++) + for (int i=start; i<end; i++) { - Scalar tmp0 = rhs[i]; - Packet ptmp0 = ei_pset1(tmp0); - int iN0 = i*lhsStride; + Packet ptmp0 = ei_pset1(rhs[i]); + const Scalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs for (int j=0; j<alignedStart; j++) - res[j] += tmp0 * lhs[j+iN0]; + res[j] += ei_pfirst(ptmp0) * lhs0[j]; // process aligned result's coeffs - if ((iN0 % PacketSize) == 0) + if (((i*lhsStride) % PacketSize+alignedStart) == 0) for (int j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j]))); + ei_pstore(&res[j], ei_pmadd(ptmp0,ei_pload(&lhs0[j]),ei_pload(&res[j]))); else for (int j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j]))); + ei_pstore(&res[j], ei_pmadd(ptmp0,ei_ploadu(&lhs0[j]),ei_pload(&res[j]))); // process remaining scalars for (int j=alignedSize; j<size; j++) - res[j] += tmp0 * lhs[j+iN0]; + res[j] += ei_pfirst(ptmp0) * lhs0[j]; } if (skipColumns) { @@ -507,10 +521,11 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( break; } while(true); asm("#end matrix_vector_product"); - #undef _EIGEN_ACCUMULATE_PACKETS } + +// TODO add peeling to mask unaligned load/stores template<typename Scalar, typename ResType> EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( const Scalar* lhs, int lhsStride, @@ -523,10 +538,10 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) {\ Packet b = ei_pload(&rhs[j]); \ - ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload##A0 (&lhs[j+iN0]))); \ - ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_pload##A13(&lhs[j+iN1]))); \ - ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_pload##A2 (&lhs[j+iN2]))); \ - ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_pload##A13(&lhs[j+iN3]))); } + ptmp0 = ei_pmadd(b, ei_pload##A0 (&lhs0[j]), ptmp0); \ + ptmp1 = ei_pmadd(b, ei_pload##A13(&lhs1[j]), ptmp1); \ + ptmp2 = ei_pmadd(b, ei_pload##A2 (&lhs2[j]), ptmp2); \ + ptmp3 = ei_pmadd(b, ei_pload##A13(&lhs3[j]), ptmp3); } asm("#begin matrix_vector_product"); typedef typename ei_packet_traits<Scalar>::type Packet; @@ -534,9 +549,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( enum { AllAligned, EvenAligned, FirstAligned, NoneAligned }; const int rowsAtOnce = 4; - const int peels = 2; +// const int peels = 2; const int PacketAlignedMask = PacketSize-1; - const int PeelAlignedMask = PacketSize*peels-1; +// const int PeelAlignedMask = PacketSize*peels-1; const bool Vectorized = sizeof(Packet) != sizeof(Scalar); const int size = rhsSize; @@ -546,9 +561,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( ? std::min<int>( (PacketSize - ((size_t(rhs)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size) : 0; const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask); - const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0; + //const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0; - const int alignmentStep = lhsStride % PacketSize; + const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask; int alignmentPattern = alignmentStep==0 ? AllAligned : alignmentStep==2 ? EvenAligned : FirstAligned; @@ -568,19 +583,20 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( skipRows = std::min(skipRows,res.size()); // note that the skiped columns are processed later. } - - int rowBound = (res.size()/rowsAtOnce)*rowsAtOnce; + + int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (int i=skipRows; i<rowBound; i+=rowsAtOnce) { Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0); Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0)); - int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride; + const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride; // process initial unaligned coeffs for (int j=0; j<alignedStart; j++) { Scalar b = rhs[j]; - tmp0 += b * lhs[j+iN0]; tmp1 += b * lhs[j+iN1]; tmp2 += b * lhs[j+iN2]; tmp3 += b * lhs[j+iN3]; + tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j]; } if (alignedSize>alignedStart) @@ -614,9 +630,9 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( for (int j=alignedSize; j<size; j++) { Scalar b = rhs[j]; - tmp0 += b * lhs[j+iN0]; tmp1 += b * lhs[j+iN1]; tmp2 += b * lhs[j+iN2]; tmp3 += b * lhs[j+iN3]; + tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j]; } - res[i] = tmp0; res[i+1] = tmp1; res[i+2] = tmp2; res[i+3] = tmp3; + res[i] += tmp0; res[i+1] += tmp1; res[i+2] += tmp2; res[i+3] += tmp3; } // process remaining first and last rows (at most columnsAtOnce-1) @@ -624,30 +640,30 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( int start = rowBound; do { - for (int i=rowBound; i<end; i++) + for (int i=start; i<end; i++) { Scalar tmp0 = Scalar(0); Packet ptmp0 = ei_pset1(tmp0); - int iN0 = i*lhsStride; + const Scalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs for (int j=0; j<alignedStart; j++) - tmp0 += rhs[j] * lhs[j+iN0]; + tmp0 += rhs[j] * lhs0[j]; if (alignedSize>alignedStart) { // process aligned rhs coeffs - if (iN0 % PacketSize==0) + if ((i*lhsStride+alignedStart) % PacketSize==0) for (int j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&rhs[j]), ei_pload(&lhs[j+iN0]))); + ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_pload(&lhs0[j]), ptmp0); else for (int j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&rhs[j]), ei_ploadu(&lhs[j+iN0]))); + ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_ploadu(&lhs0[j]), ptmp0); tmp0 += ei_predux(ptmp0); } // process remaining scalars for (int j=alignedSize; j<size; j++) - tmp0 += rhs[j] * lhs[j+iN0]; + tmp0 += rhs[j] * lhs0[j]; res[i] += tmp0; } if (skipRows) diff --git a/Eigen/src/Core/DummyPacketMath.h b/Eigen/src/Core/DummyPacketMath.h index 1b3bae34f..1a4ecbfa0 100644 --- a/Eigen/src/Core/DummyPacketMath.h +++ b/Eigen/src/Core/DummyPacketMath.h @@ -117,7 +117,7 @@ template <typename Scalar, typename Packet, int LoadMode> inline void ei_pstoret if(LoadMode == Aligned) ei_pstore(to, from); else - ei_pstoreu(to, from); + ei_pstoreu(to, from); } #endif // EIGEN_DUMMY_PACKET_MATH_H diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 92ac5f20b..13664eebc 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -66,6 +66,8 @@ template<typename MatrixType, int Alignment> class Map inline int rows() const { return m_rows.value(); } inline int cols() const { return m_cols.value(); } + inline int stride() const { return this->innerSize(); } + inline const Scalar& coeff(int row, int col) const { if(Flags & RowMajorBit) @@ -156,40 +158,6 @@ template<typename MatrixType, int Alignment> class Map const ei_int_if_dynamic<ColsAtCompileTime> m_cols; }; -/** Constructor copying an existing array of data. Only useful for dynamic-size matrices: - * for fixed-size matrices, it is redundant to pass the \a rows and \a cols parameters. - * \param data The array of data to copy - * \param rows The number of rows of the matrix to construct - * \param cols The number of columns of the matrix to construct - * - * \sa Matrix(const Scalar *), Matrix::map(const Scalar *, int, int) - */ -template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags> -inline Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> - ::Matrix(const Scalar *data, int rows, int cols) - : m_storage(rows*cols, rows, cols) -{ - *this = Map<Matrix>(data, rows, cols); -} - -/** Constructor copying an existing array of data. Only useful for dynamic-size vectors: - * for fixed-size vectors, it is redundant to pass the \a size parameter. - * - * \only_for_vectors - * - * \param data The array of data to copy - * \param size The size of the vector to construct - * - * \sa Matrix(const Scalar *), Matrix::map(const Scalar *, int) - */ -template<typename _Scalar, int _Rows, int _Cols, int _MaxRows, int _MaxCols, unsigned int _Flags> -inline Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> - ::Matrix(const Scalar *data, int size) - : m_storage(size, RowsAtCompileTime == 1 ? 1 : size, ColsAtCompileTime == 1 ? 1 : size) -{ - *this = Map<Matrix>(data, size); -} - /** Constructor copying an existing array of data. * Only for fixed-size matrices and vectors. * \param data The array of data to copy diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 379b5e1f4..569d7207c 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -319,8 +319,7 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol m_storage.data()[2] = z; m_storage.data()[3] = w; } - Matrix(const Scalar *data, int rows, int cols); - Matrix(const Scalar *data, int size); + explicit Matrix(const Scalar *data); /** Constructor copying the value of the expression \a other */ diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h index 53faf884e..2b13018bf 100644 --- a/Eigen/src/Core/MatrixStorage.h +++ b/Eigen/src/Core/MatrixStorage.h @@ -161,7 +161,7 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic> public: inline ei_matrix_storage(int size, int rows, int cols) : m_data(ei_aligned_malloc<T>(size)), m_rows(rows), m_cols(cols) {} - inline ~ei_matrix_storage() { delete[] m_data; } + inline ~ei_matrix_storage() { ei_aligned_free(m_data); } inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } inline int rows(void) const {return m_rows;} @@ -187,7 +187,7 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam int m_cols; public: inline ei_matrix_storage(int size, int, int cols) : m_data(ei_aligned_malloc<T>(size)), m_cols(cols) {} - inline ~ei_matrix_storage() { delete[] m_data; } + inline ~ei_matrix_storage() { ei_aligned_free(m_data); } inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } inline static int rows(void) {return _Rows;} inline int cols(void) const {return m_cols;} @@ -211,7 +211,7 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co int m_rows; public: inline ei_matrix_storage(int size, int rows, int) : m_data(ei_aligned_malloc<T>(size)), m_rows(rows) {} - inline ~ei_matrix_storage() { delete[] m_data; } + inline ~ei_matrix_storage() { ei_aligned_free(m_data); } inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } inline int rows(void) const {return m_rows;} inline static int cols(void) {return _Cols;} diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 223da4d06..831611adf 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -94,8 +94,8 @@ template<typename Lhs, typename Rhs> struct ei_product_mode : Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD && ( Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ) - && (!(Rhs::IsVectorAtCompileTime && (Lhs::Flags&RowMajorBit) && (!Lhs::Flags&DirectAccessBit))) - && (!(Lhs::IsVectorAtCompileTime && (!Rhs::Flags&RowMajorBit) && (!Rhs::Flags&DirectAccessBit))) + && (!(Rhs::IsVectorAtCompileTime && (Lhs::Flags&RowMajorBit) && (!(Lhs::Flags&DirectAccessBit)))) + && (!(Lhs::IsVectorAtCompileTime && (!(Rhs::Flags&RowMajorBit)) && (!(Rhs::Flags&DirectAccessBit)))) ? CacheFriendlyProduct : NormalProduct }; }; diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index a30011dea..114ed49e2 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -189,5 +189,14 @@ inline __m128i ei_preduxp(const __m128i* vecs) return _mm_add_epi32(tmp0, tmp2); } +#if (defined __GNUC__) +template <> inline __m128 ei_pmadd(const __m128& a, const __m128& b, const __m128& c) +{ + __m128 res = b; + asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); + return res; +} +#endif + #endif // EIGEN_PACKET_MATH_SSE_H |