From 187b1543ce2448e8be87211c833da5d83af6ec7d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 10 Apr 2008 12:34:22 +0000 Subject: added a vectorized version of Product::_cacheOptimalProduct, added the possibility to disable the vectorization using EIGEN_DONT_VECTORIZE (some architectures have SSE support by default) --- Eigen/Core | 2 ++ Eigen/src/Core/Product.h | 77 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 61 insertions(+), 18 deletions(-) (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index 3b5f1fdb1..24dc37145 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -1,12 +1,14 @@ #ifndef EIGEN_CORE_H #define EIGEN_CORE_H +#ifndef EIGEN_DONT_VECTORIZE #ifdef __SSE2__ #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_SSE #include #include #endif +#endif #include #include diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 4dacb9269..bb7c254c2 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -108,7 +108,7 @@ struct ei_packet_product_unroller struct ei_product_eval_mode { - enum{ value = Lhs::MaxRowsAtCompileTime >= 8 && Rhs::MaxColsAtCompileTime >= 8 + enum{ value = Lhs::MaxRowsAtCompileTime >= 16 && Rhs::MaxColsAtCompileTime >= 16 ? CacheOptimalProduct : NormalProduct }; }; @@ -139,7 +139,7 @@ struct ei_traits > | ( ( !(Lhs::Flags & RowMajorBit) && (Lhs::Flags & VectorizableBit) - ) + ) ? VectorizableBit : ( ( @@ -215,7 +215,6 @@ template class Product : ei_no_assignm ? 
Lhs::ColsAtCompileTime : Dynamic, Lhs, Rhs, PacketScalar> ::run(row, col, m_lhs, m_rhs, res); -// std::cout << "vec unrolled product\n"; } else { @@ -280,25 +279,67 @@ template void Product::_cacheOptimalEval(DestDerived& res) const { res.setZero(); - const int cols4 = m_lhs.cols()&0xfffffffC; - for (int k=0; k::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); + const typename ei_packet_traits::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); + const typename ei_packet_traits::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); + const typename ei_packet_traits::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); + for (int i=0; i::size) + { + res.writePacketCoeff(i,k, + ei_padd( + res.packetCoeff(i,k), + ei_padd( + ei_padd( + ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), + ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), + ei_padd( + ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), + ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) + ) + ) + ) + ); + } + } + for(; j::type tmp = ei_pset1(m_rhs.coeff(j,k)); + for (int i=0; i