diff options
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixVector.h | 20 | ||||
-rw-r--r-- | test/product_extra.cpp | 32 |
2 files changed, 44 insertions, 8 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index ba1f73957..639af8ed4 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -88,7 +88,7 @@ EIGEN_DONT_INLINE static void run( // Here we assume data are at least aligned on the base scalar type. Index alignedStart = internal::first_aligned(res,size); Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; - const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; + const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned @@ -177,6 +177,8 @@ EIGEN_DONT_INLINE static void run( _EIGEN_ACCUMULATE_PACKETS(d,du,d); break; case FirstAligned: + { + Index j = alignedStart; if(peels>1) { LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; @@ -186,7 +188,7 @@ EIGEN_DONT_INLINE static void run( A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize) + for (; j<peeledSize; j+=peels*ResPacketSize) { A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); @@ -210,9 +212,10 @@ EIGEN_DONT_INLINE static void run( pstore(&res[j+ResPacketSize],T1); } } - for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize) + for (; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; + } default: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); @@ -340,7 +343,7 @@ EIGEN_DONT_INLINE static void run( // if that's not the case then vectorization is discarded, see below. 
Index alignedStart = internal::first_aligned(rhs, depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; - const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; + const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned @@ -430,10 +433,12 @@ EIGEN_DONT_INLINE static void run( _EIGEN_ACCUMULATE_PACKETS(d,du,d); break; case FirstAligned: + { + Index j = alignedStart; if (peels>1) { /* Here we proccess 4 rows with with two peeled iterations to hide - * tghe overhead of unaligned loads. Moreover unaligned loads are handled + * the overhead of unaligned loads. Moreover unaligned loads are handled * using special shift/move operations between the two aligned packets * overlaping the desired unaligned packet. This is *much* more efficient * than basic unaligned loads. 
@@ -443,7 +448,7 @@ EIGEN_DONT_INLINE static void run( A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize) + for (; j<peeledSize; j+=peels*RhsPacketSize) { RhsPacket b = pload<RhsPacket>(&rhs[j]); A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); @@ -465,9 +470,10 @@ EIGEN_DONT_INLINE static void run( ptmp3 = pcj.pmadd(A13, b, ptmp3); } } - for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize) + for (; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; + } default: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); diff --git a/test/product_extra.cpp b/test/product_extra.cpp index 9a6bf0792..6f962159e 100644 --- a/test/product_extra.cpp +++ b/test/product_extra.cpp @@ -135,6 +135,35 @@ void zero_sized_objects() a*b; } +void unaligned_objects() +{ + // Regression test for the bug reported here: + // http://forum.kde.org/viewtopic.php?f=74&t=107541 + // Recall that the matrix*vector kernel avoids unaligned loads by loading two packets and then reassembling them. + // There was a mistake in the computation of the valid range for fully unaligned objects: in some rare cases, + // memory was read outside the allocated matrix memory. Though the values were not used, this might raise a segfault. 
+ for(int m=450;m<460;++m) + { + for(int n=8;n<12;++n) + { + MatrixXf M(m, n); + VectorXf v1(n), r1(500); + RowVectorXf v2(m), r2(16); + + M.setRandom(); + v1.setRandom(); + v2.setRandom(); + for(int o=0; o<4; ++o) + { + r1.segment(o,m).noalias() = M * v1; + VERIFY_IS_APPROX(r1.segment(o,m), M * MatrixXf(v1)); + r2.segment(o,n).noalias() = v2 * M; + VERIFY_IS_APPROX(r2.segment(o,n), MatrixXf(v2) * M); + } + } + } +} + void test_product_extra() { for(int i = 0; i < g_repeat; i++) { @@ -143,6 +172,7 @@ void test_product_extra() CALL_SUBTEST_2( mat_mat_scalar_scalar_product() ); CALL_SUBTEST_3( product_extra(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) ); - CALL_SUBTEST_5( zero_sized_objects() ); } + CALL_SUBTEST_5( zero_sized_objects() ); + CALL_SUBTEST_6( unaligned_objects() ); } |