Diffstat (limited to 'Eigen/src')
 Eigen/src/Core/Block.h                |  1
 Eigen/src/Core/CacheFriendlyProduct.h | 89
 Eigen/src/Core/DummyPacketMath.h      | 16
 Eigen/src/Core/Product.h              |  4
 Eigen/src/Core/arch/SSE/PacketMath.h  | 81
5 files changed, 161 insertions, 30 deletions
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 62617172e..c3174073b 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -274,7 +274,6 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixTy
     inline const Scalar coeff(int row, int col) const
     {
-//       std::cerr << "coeff(int row, int col)\n";
       if (IsRowMajor)
         return m_data_ptr[col + row * stride()];
       else
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index a9ef66cdd..b51b50ac3 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -390,7 +390,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   int alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==2 ? EvenAligned
+                       : alignmentStep==(PacketSize/2) ? EvenAligned
                        : FirstAligned;

   // we cannot assume the first element is aligned because of sub-matrices
@@ -432,7 +432,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   if (PacketSize>1)
   {
     /* explicit vectorization */
-    // process initial unaligned coeffs
     for (int j=0; j<alignedStart; j++)
       res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] +
                 ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
@@ -452,26 +451,41 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
       case FirstAligned:
         if(peels>1)
         {
-          // NOTE peeling with two _EIGEN_ACCUMULATE_PACKETS() is much less efficient
-          // than the following code
-          asm("#mybegin");
           Packet A00, A01, A02, A03, A10, A11, A12, A13;
+          if (alignmentStep==1)
+          {
+            // flip columns #1 and #3: their alignment offsets are swapped when alignmentStep==1
+            A00 = ptmp1; ptmp1 = ptmp3; ptmp3 = A00;
+            const Scalar* aux = lhs1;
+            lhs1 = lhs3; lhs3 = aux;
+          }
+          A01 = ei_pload(&lhs1[alignedStart-1]);
+          A02 = ei_pload(&lhs2[alignedStart-2]);
+          A03 = ei_pload(&lhs3[alignedStart-3]);
+
           for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
           {
-            A01 = ei_ploadu(&lhs1[j]); A11 = ei_ploadu(&lhs1[j+PacketSize]);
-            A02 = ei_ploadu(&lhs2[j]); A12 = ei_ploadu(&lhs2[j+PacketSize]);
-            A00 = ei_pload (&lhs0[j]); A10 = ei_pload (&lhs0[j+PacketSize]);
+            A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
+            A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
+            A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
+            A00 = ei_pload (&lhs0[j]);
+            A10 = ei_pload (&lhs0[j+PacketSize]);
             A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j]));
             A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize]));
-            A00 = ei_pmadd(ptmp1, A01, A00); A10 = ei_pmadd(ptmp1, A11, A10);
-            A03 = ei_ploadu(&lhs3[j]); A13 = ei_ploadu(&lhs3[j+PacketSize]);
-            A00 = ei_pmadd(ptmp2, A02, A00); A10 = ei_pmadd(ptmp2, A12, A10);
-            A00 = ei_pmadd(ptmp3, A03, A00); A10 = ei_pmadd(ptmp3, A13, A10);
-            ei_pstore(&res[j],A00); ei_pstore(&res[j+PacketSize],A10);
+            A00 = ei_pmadd(ptmp1, A01, A00);
+            A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
+            A00 = ei_pmadd(ptmp2, A02, A00);
+            A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
+            A00 = ei_pmadd(ptmp3, A03, A00);
+            ei_pstore(&res[j],A00);
+            A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
+            A10 = ei_pmadd(ptmp1, A11, A10);
+            A10 = ei_pmadd(ptmp2, A12, A10);
+            A10 = ei_pmadd(ptmp3, A13, A10);
+            ei_pstore(&res[j+PacketSize],A10);
           }
-          asm("#myend");
         }
         for (int j = peeledSize; j<alignedSize; j+=PacketSize)
           _EIGEN_ACCUMULATE_PACKETS(,u,u,);
@@ -532,7 +546,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
   #undef _EIGEN_ACCUMULATE_PACKETS
 }

-// TODO add peeling to mask unaligned load/stores
 template<typename Scalar, typename ResType>
 EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
@@ -570,7 +583,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
   const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   int alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==2 ? EvenAligned
+                       : alignmentStep==(PacketSize/2) ? EvenAligned
                        : FirstAligned;

   // we cannot assume the first element is aligned because of sub-matrices
@@ -636,27 +649,51 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
       case FirstAligned:
         if (peels>1)
         {
-          Packet A01, A02, A03, b;
+          /* Here we process 4 rows with two peeled iterations to hide
+           * the overhead of unaligned loads. Moreover, unaligned loads are
+           * handled using special shift/move operations between the two
+           * aligned packets overlapping the desired unaligned packet. This
+           * is *much* more efficient than basic unaligned loads.
+           */
+          Packet A01, A02, A03, b, A11, A12, A13;
+          if (alignmentStep==1)
+          {
+            // flip rows #1 and #3
+            b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
+            const Scalar* aux = lhs1;
+            lhs1 = lhs3; lhs3 = aux;
+          }
+          A01 = ei_pload(&lhs1[alignedStart-1]);
+          A02 = ei_pload(&lhs2[alignedStart-2]);
+          A03 = ei_pload(&lhs3[alignedStart-3]);
+
           for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
           {
             b = ei_pload(&rhs[j]);
-            A01 = ei_ploadu(&lhs1[j]);
-            A02 = ei_ploadu(&lhs2[j]);
-            A03 = ei_ploadu(&lhs3[j]);
+            A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11);
+            A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
+            A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);
             ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j]), ptmp0);
             ptmp1 = ei_pmadd(b, A01, ptmp1);
-            A01 = ei_ploadu(&lhs1[j+PacketSize]);
+            A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
             ptmp2 = ei_pmadd(b, A02, ptmp2);
-            A02 = ei_ploadu(&lhs2[j+PacketSize]);
+            A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
             ptmp3 = ei_pmadd(b, A03, ptmp3);
-            A03 = ei_ploadu(&lhs3[j+PacketSize]);
+            A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);
             b = ei_pload(&rhs[j+PacketSize]);
             ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j+PacketSize]), ptmp0);
-            ptmp1 = ei_pmadd(b, A01, ptmp1);
-            ptmp2 = ei_pmadd(b, A02, ptmp2);
-            ptmp3 = ei_pmadd(b, A03, ptmp3);
+            ptmp1 = ei_pmadd(b, A11, ptmp1);
+            ptmp2 = ei_pmadd(b, A12, ptmp2);
+            ptmp3 = ei_pmadd(b, A13, ptmp3);
+          }
+          if (alignmentStep==1)
+          {
+            // restore rows #1 and #3
+            b = ptmp1; ptmp1 = ptmp3; ptmp3 = b;
+            const Scalar* aux = lhs1;
+            lhs1 = lhs3; lhs3 = aux;
+          }
         }
         for (int j = peeledSize; j<alignedSize; j+=PacketSize)
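The hunks above replace the ei_ploadu() calls with pairs of aligned loads merged by ei_palign<Offset>(). The idea: when lhsStride is not a multiple of PacketSize, columns 1-3 of the lhs start at different offsets within a 16-byte lane, so each "unaligned" packet can instead be rebuilt from the two aligned packets overlapping it. A minimal scalar model of that merge (the names and the fixed PacketSize of 4 are mine, not Eigen's):

#include <cassert>

// Scalar stand-in for a 4-wide packet; palign_model<Offset> mimics
// ei_palign<Offset>(first, second): keep the last 4-Offset elements of
// `first` and append the first Offset elements of `second`.
template<int Offset>
static void palign_model(float first[4], const float second[4])
{
    float out[4];
    for (int i = 0; i < 4; ++i)
        out[i] = (i + Offset < 4) ? first[i + Offset] : second[i + Offset - 4];
    for (int i = 0; i < 4; ++i)
        first[i] = out[i];
}

int main()
{
    // A column whose data starts 1 element past a 4-element boundary:
    // the "unaligned packet" col[1..4] is rebuilt from the two aligned
    // packets col[0..3] and col[4..7].
    float col[8] = {0,1,2,3,4,5,6,7};
    float lo[4]  = {col[0],col[1],col[2],col[3]};
    float hi[4]  = {col[4],col[5],col[6],col[7]};
    palign_model<1>(lo, hi);                         // lo now holds col[1..4]
    for (int i = 0; i < 4; ++i) assert(lo[i] == col[i+1]);
    return 0;
}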
diff --git a/Eigen/src/Core/DummyPacketMath.h b/Eigen/src/Core/DummyPacketMath.h
index 5aff21fe4..4abd6e997 100644
--- a/Eigen/src/Core/DummyPacketMath.h
+++ b/Eigen/src/Core/DummyPacketMath.h
@@ -134,5 +134,21 @@ inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset)
          : 0;
 }

+/** \internal default implementation of ei_palign(), overridden by architecture specific specializations */
+template<int Offset,typename PacketType>
+struct ei_palign_impl
+{
+  // by default data are aligned, so there is nothing to be done :)
+  inline static void run(PacketType& first, const PacketType& second) {}
+};
+
+/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
+  * of \a first and the \a Offset first elements of \a second */
+template<int Offset,typename PacketType>
+inline void ei_palign(PacketType& first, const PacketType& second)
+{
+  ei_palign_impl<Offset,PacketType>::run(first,second);
+}
+
 #endif // EIGEN_DUMMY_PACKET_MATH_H
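Why both kernels flip lhs1 and lhs3 when alignmentStep==1 follows from simple modular arithmetic: column (or row) c starts c*lhsStride elements after the first one, so its offset within a 4-float lane is (c*lhsStride) % 4. The peeled loops hard-code ei_palign<1>, <2>, <3> for c = 1, 2, 3, which assumes offsets 1, 2, 3; with lhsStride % 4 == 3 (i.e. alignmentStep == 1) the offsets come out 3, 2, 1 instead, and swapping #1 and #3 restores the expected pattern. A quick standalone check of that arithmetic (PacketSize assumed to be 4):

#include <cstdio>

int main()
{
    const int PacketSize = 4, PacketAlignedMask = PacketSize - 1;
    for (int mod = 1; mod <= 3; mod += 2)   // lhsStride % 4 == 1 or 3
    {
        int lhsStride = 100 + mod;          // any stride with this remainder
        int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask;
        std::printf("stride%%4=%d -> alignmentStep=%d, offsets of columns 1..3:", mod, alignmentStep);
        for (int c = 1; c <= 3; ++c)
            std::printf(" %d", (c * lhsStride) % PacketSize);  // prints 1 2 3, or 3 2 1
        std::printf("\n");
    }
    return 0;
}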
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 60a1d0a35..8daedcc7d 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -111,8 +111,8 @@ template<typename Lhs, typename Rhs> struct ei_product_mode
  * This class represents an expression of the product of two matrices.
  * It is the return type of the operator* between matrices. Its template
  * arguments are determined automatically by ProductReturnType. Therefore,
- * Product should be used direclty. To determine the result type of a function
- * which involve a matrix product, use ProductReturnType::Type.
+ * Product should never be used directly. To determine the result type of a
+ * function which involves a matrix product, use ProductReturnType::Type.
  *
  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
  */
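The corrected sentence is easy to illustrate. A function that returns a product expression should spell its type via ProductReturnType rather than Product<> itself; a hypothetical helper, sketched against the API as documented here:

#include <Eigen/Core>

// Sketch only: the expression type of a*b is named through
// ProductReturnType<...>::Type, never by writing Product<...> directly.
template<typename Derived1, typename Derived2>
const typename Eigen::ProductReturnType<Derived1, Derived2>::Type
product(const Eigen::MatrixBase<Derived1>& a, const Eigen::MatrixBase<Derived2>& b)
{
    return a * b;  // same expression type as the declared return type
}

int main()
{
    Eigen::Matrix3f a = Eigen::Matrix3f::Identity(), b = a;
    Eigen::Matrix3f c = product(a, b);  // the expression evaluates on assignment
    return 0;
}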
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 28825ceee..07aa84d69 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -90,6 +90,12 @@ template<> inline __m128d ei_pload(const double* from) { return _mm_load_pd(fro
 template<> inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }

 template<> inline __m128  ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
+// template<> inline __m128  ei_ploadu(const float* from) {
+//   if (size_t(from)&0xF)
+//     return _mm_loadu_ps(from);
+//   else
+//     return _mm_loadu_ps(from);
+// }
 template<> inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); }
 template<> inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }

@@ -198,7 +204,80 @@ inline __m128i ei_preduxp(const __m128i* vecs)
 //   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
 //   return res;
 // }
+// inline __m128i _mm_alignr_epi8(const __m128i& a, const __m128i& b, const int i)
+// {
+//   __m128i res = a;
+//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
+//   return res;
+// }
 #endif

-#endif // EIGEN_PACKET_MATH_SSE_H
+#ifdef __SSSE3__
+// SSSE3 versions
+template<int Offset>
+struct ei_palign_impl<Offset,__m128>
+{
+  inline static void run(__m128& first, const __m128& second)
+  {
+    // palignr concatenates (second:first) and shifts right by Offset elements
+    first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
+  }
+};
+
+template<int Offset>
+struct ei_palign_impl<Offset,__m128i>
+{
+  inline static void run(__m128i& first, const __m128i& second)
+  {
+    first = _mm_alignr_epi8(second, first, Offset*4);
+  }
+};
+#else
+// SSE2 versions
+template<int Offset>
+struct ei_palign_impl<Offset,__m128>
+{
+  inline static void run(__m128& first, const __m128& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
+    }
+    else if (Offset==2)
+    {
+      first = _mm_movehl_ps(first,first);
+      first = _mm_movelh_ps(first,second);
+    }
+    else if (Offset==3)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_shuffle_ps(first,second,0x93);
+    }
+  }
+};
+
+template<int Offset>
+struct ei_palign_impl<Offset,__m128i>
+{
+  inline static void run(__m128i& first, const __m128i& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_shuffle_epi32(first,0x39);
+    }
+    else if (Offset==2)
+    {
+      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
+      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+    }
+    else if (Offset==3)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
+    }
+  }
+};
+#endif
+
+#endif // EIGEN_PACKET_MATH_SSE_H
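The SSE2 fallbacks are easy to misread, so the following small self-test (the harness is mine, not part of the patch; compile with -msse2) confirms that each Offset case produces the intended "last 4-Offset elements of first, first Offset elements of second" result, i.e. element i of the output is first[i+Offset] when i+Offset < 4 and second[i+Offset-4] otherwise. With SSSE3 enabled, a single palignr yields the same result.

#include <xmmintrin.h>
#include <emmintrin.h>
#include <cassert>

// Check r against the reference semantics of ei_palign<offset>(first, second).
static void check(__m128 r, const float* f, const float* s, int offset)
{
    float out[4];
    _mm_storeu_ps(out, r);
    for (int i = 0; i < 4; ++i)
        assert(out[i] == (i + offset < 4 ? f[i + offset] : s[i + offset - 4]));
}

int main()
{
    const float f[4] = {0,1,2,3}, s[4] = {4,5,6,7};
    __m128 a;

    a = _mm_loadu_ps(f);                          // Offset == 1 -> {1,2,3,4}
    a = _mm_move_ss(a, _mm_loadu_ps(s));
    a = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), 0x39));
    check(a, f, s, 1);

    a = _mm_loadu_ps(f);                          // Offset == 2 -> {2,3,4,5}
    a = _mm_movehl_ps(a, a);
    a = _mm_movelh_ps(a, _mm_loadu_ps(s));
    check(a, f, s, 2);

    a = _mm_loadu_ps(f);                          // Offset == 3 -> {3,4,5,6}
    a = _mm_move_ss(a, _mm_loadu_ps(s));
    a = _mm_shuffle_ps(a, _mm_loadu_ps(s), 0x93);
    check(a, f, s, 3);

    return 0;
}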