2 files changed, 44 insertions, 8 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index ba1f73957..639af8ed4 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -88,7 +88,7 @@ EIGEN_DONT_INLINE static void run(
   // Here we assume data are at least aligned on the base scalar type.
   Index alignedStart = internal::first_aligned(res,size);
   Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
-  const Index peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
+  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
 
   const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
   Index alignmentPattern = alignmentStep==0 ? AllAligned
@@ -177,6 +177,8 @@ EIGEN_DONT_INLINE static void run(
               _EIGEN_ACCUMULATE_PACKETS(d,du,d);
             break;
           case FirstAligned:
+          {
+            Index j = alignedStart;
             if(peels>1)
             {
               LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
@@ -186,7 +188,7 @@ EIGEN_DONT_INLINE static void run(
               A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
               A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
 
-              for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize)
+              for (; j<peeledSize; j+=peels*ResPacketSize)
               {
                 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                 A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
@@ -210,9 +212,10 @@ EIGEN_DONT_INLINE static void run(
                 pstore(&res[j+ResPacketSize],T1);
               }
             }
-            for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize)
+            for (; j<alignedSize; j+=ResPacketSize)
               _EIGEN_ACCUMULATE_PACKETS(d,du,du);
             break;
+          }
           default:
             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
               _EIGEN_ACCUMULATE_PACKETS(du,du,du);
@@ -340,7 +343,7 @@ EIGEN_DONT_INLINE static void run(
   // if that's not the case then vectorization is discarded, see below.
   Index alignedStart = internal::first_aligned(rhs, depth);
   Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
-  const Index peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
+  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
 
   const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
   Index alignmentPattern = alignmentStep==0 ? AllAligned
@@ -430,10 +433,12 @@ EIGEN_DONT_INLINE static void run(
               _EIGEN_ACCUMULATE_PACKETS(d,du,d);
             break;
           case FirstAligned:
+          {
+            Index j = alignedStart;
             if (peels>1)
             {
               /* Here we proccess 4 rows with with two peeled iterations to hide
-               * tghe overhead of unaligned loads. Moreover unaligned loads are handled
+               * the overhead of unaligned loads. Moreover unaligned loads are handled
                * using special shift/move operations between the two aligned packets
                * overlaping the desired unaligned packet. This is *much* more efficient
                * than basic unaligned loads.
@@ -443,7 +448,7 @@ EIGEN_DONT_INLINE static void run(
               A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
               A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
 
-              for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize)
+              for (; j<peeledSize; j+=peels*RhsPacketSize)
               {
                 RhsPacket b = pload<RhsPacket>(&rhs[j]);
                 A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
@@ -465,9 +470,10 @@ EIGEN_DONT_INLINE static void run(
                 ptmp3 = pcj.pmadd(A13, b, ptmp3);
               }
             }
-            for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize)
+            for (; j<alignedSize; j+=RhsPacketSize)
               _EIGEN_ACCUMULATE_PACKETS(d,du,du);
             break;
+          }
           default:
             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
               _EIGEN_ACCUMULATE_PACKETS(du,du,du);
diff --git a/test/product_extra.cpp b/test/product_extra.cpp
index 9a6bf0792..6f962159e 100644
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -135,6 +135,35 @@ void zero_sized_objects()
   a*b;
 }
 
+void unaligned_objects()
+{
+  // Regression test for the bug reported here:
+  // http://forum.kde.org/viewtopic.php?f=74&t=107541
+  // Recall the matrix*vector kernel avoid unaligned loads by loading two packets and then reassemble then.
+  // There was a mistake in the computation of the valid range for fully unaligned objects: in some rare cases,
+  // memory was read outside the allocated matrix memory. Though the values were not used, this might raise segfault.
+  for(int m=450;m<460;++m)
+  {
+    for(int n=8;n<12;++n)
+    {
+      MatrixXf M(m, n);
+      VectorXf v1(n), r1(500);
+      RowVectorXf v2(m), r2(16);
+
+      M.setRandom();
+      v1.setRandom();
+      v2.setRandom();
+      for(int o=0; o<4; ++o)
+      {
+        r1.segment(o,m).noalias() = M * v1;
+        VERIFY_IS_APPROX(r1.segment(o,m), M * MatrixXf(v1));
+        r2.segment(o,n).noalias() = v2 * M;
+        VERIFY_IS_APPROX(r2.segment(o,n), MatrixXf(v2) * M);
+      }
+    }
+  }
+}
+
 void test_product_extra()
 {
   for(int i = 0; i < g_repeat; i++) {
@@ -143,6 +172,7 @@ void test_product_extra()
     CALL_SUBTEST_2( mat_mat_scalar_scalar_product() );
     CALL_SUBTEST_3( product_extra(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
     CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
-    CALL_SUBTEST_5( zero_sized_objects() );
   }
+  CALL_SUBTEST_5( zero_sized_objects() );
+  CALL_SUBTEST_6( unaligned_objects() );
 }