Diffstat (limited to 'Eigen')
-rw-r--r--  Eigen/src/Core/CacheFriendlyProduct.h | 157
-rw-r--r--  Eigen/src/Core/Map.h                  |   4
-rw-r--r--  Eigen/src/Core/arch/SSE/PacketMath.h  |  12
-rwxr-xr-x  Eigen/src/QR/Tridiagonalization.h     | 126
4 files changed, 193 insertions, 106 deletions
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index 3ce72f5ba..a9ef66cdd 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -86,13 +86,15 @@ static void ei_cache_friendly_product(
   const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
   const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
   const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
+  const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
+  const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
   Scalar* __restrict__ block = 0;
   const int allocBlockSize = sizeof(Scalar)*l2BlockRows*size;
   if (allocBlockSize>16000000)
     block = (Scalar*)malloc(allocBlockSize);
   else
     block = (Scalar*)alloca(allocBlockSize);
-  Scalar* __restrict__ rhsCopy = (Scalar*)alloca(sizeof(Scalar)*l2BlockSize);
+  Scalar* __restrict__ rhsCopy = (Scalar*)alloca(sizeof(Scalar)*l2BlockSizeAligned*l2BlockSizeAligned);
   // loops on each L2 cache friendly blocks of the result
   for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
@@ -113,8 +115,8 @@ static void ei_cache_friendly_product(
       for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
       {
-        // TODO merge the if l2blockRemainingRows
-//         const int blockRows = std::min(i+MaxBlockRows, rows) - i;
+        // TODO merge the "if l2blockRemainingRows" using something like:
+        //  const int blockRows = std::min(i+MaxBlockRows, rows) - i;
         for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
         {
@@ -164,62 +166,58 @@ static void ei_cache_friendly_product(
       // acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res
       int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
+      // if not aligned, copy the rhs block
+      if (needRhsCopy)
+        for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+        {
+          ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
+          memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
+        }
+
       // for each bw x 1 result's block
       for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
       {
+        int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
+        const Scalar* __restrict__ localB = &block[offsetblock];
+
         for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
         {
-          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
-          const Scalar* __restrict__ localB = &block[offsetblock];
-
-          const Scalar* __restrict__ rhsColumn = &(rhs[l1j*rhsStride]);
-
-          // copy unaligned rhs data
-          // YES it seems to be faster to copy some part of rhs multiple times
-          // to aligned memory rather than using unligned load.
-          // Moreover this avoids a "if" in the most nested loop :)
-          if (PacketSize>1 && size_t(rhsColumn)%16)
-          {
-            int count = 0;
-            // FIXME this loop get vectorized by the compiler (ICC)
-            // I'm not sure thats good or not
-            for (int k = l2k; k<l2blockSizeEnd; ++k)
-            {
-              rhsCopy[count++] = rhsColumn[k];
-            }
-            rhsColumn = &(rhsCopy[-l2k]);
-          }
+          const Scalar* __restrict__ rhsColumn;
+          if (needRhsCopy)
+            rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
+          else
+            rhsColumn = &(rhs[l1j*rhsStride]);
           PacketType dst[MaxBlockRows];
-          dst[0] = ei_pset1(Scalar(0.));
-          dst[1] = dst[0];
-          dst[2] = dst[0];
-          dst[3] = dst[0];
+          dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
           if (MaxBlockRows==8)
-          {
-            dst[4] = dst[0];
-            dst[5] = dst[0];
-            dst[6] = dst[0];
-            dst[7] = dst[0];
-          }
+            dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
           PacketType tmp;
           asm("#eigen begincore");
           for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
           {
-            tmp = ei_pload(&rhsColumn[k]);
-
-            dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows ])), dst[0]);
-            dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+ PacketSize])), dst[1]);
-            dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+2*PacketSize])), dst[2]);
-            dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+3*PacketSize])), dst[3]);
+            tmp = ei_ploadu(&rhsColumn[k]);
+            PacketType A0, A1, A2, A3, A4, A5;
+            A0 = ei_pload(localB + k*MaxBlockRows);
+            A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
+            A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
+            A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
+            if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
+            if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
+            dst[0] = ei_pmadd(tmp, A0, dst[0]);
+            if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
+            dst[1] = ei_pmadd(tmp, A1, dst[1]);
+            if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
+            dst[2] = ei_pmadd(tmp, A2, dst[2]);
+            dst[3] = ei_pmadd(tmp, A3, dst[3]);
             if (MaxBlockRows==8)
             {
-              dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+4*PacketSize])), dst[4]);
-              dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+5*PacketSize])), dst[5]);
-              dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+6*PacketSize])), dst[6]);
-              dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+7*PacketSize])), dst[7]);
+              dst[4] = ei_pmadd(tmp, A4, dst[4]);
+              dst[5] = ei_pmadd(tmp, A5, dst[5]);
+              dst[6] = ei_pmadd(tmp, A0, dst[6]);
+              dst[7] = ei_pmadd(tmp, A1, dst[7]);
             }
           }
@@ -227,7 +225,8 @@ static void ei_cache_friendly_product(
           if (PacketSize>1 && resIsAligned)
           {
-            ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(dst)));
+            // the result is aligned: let's do packet reduction
+            ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
             if (PacketSize==2)
               ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
             if (MaxBlockRows==8)
@@ -239,6 +238,7 @@ static void ei_cache_friendly_product(
           }
           else
           {
+            // not aligned => per coeff packet reduction
            localRes[0] += ei_predux(dst[0]);
            localRes[1] += ei_predux(dst[1]);
            localRes[2] += ei_predux(dst[2]);
@@ -262,32 +262,16 @@ static void ei_cache_friendly_product(
         asm("#eigen begin dynkernel");
         for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
         {
-          const Scalar* __restrict__ rhsColumn = &(rhs[l1j*rhsStride]);
-
-          // copy unaligned rhs data
-          if (PacketSize>1 && size_t(rhsColumn)%16)
-          {
-            int count = 0;
-            // FIXME this loop get vectorized by the compiler !
-            for (int k = l2k; k<l2blockSizeEnd; ++k)
-            {
-              rhsCopy[count++] = rhsColumn[k];
-            }
-            rhsColumn = &(rhsCopy[-l2k]);
-          }
+          const Scalar* __restrict__ rhsColumn;
+          if (needRhsCopy)
+            rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
+          else
+            rhsColumn = &(rhs[l1j*rhsStride]);
           PacketType dst[MaxBlockRows];
-          dst[0] = ei_pset1(Scalar(0.));
-          dst[1] = dst[0];
-          dst[2] = dst[0];
-          dst[3] = dst[0];
-          if (MaxBlockRows>4)
-          {
-            dst[4] = dst[0];
-            dst[5] = dst[0];
-            dst[6] = dst[0];
-            dst[7] = dst[0];
-          }
+          dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
+          if (MaxBlockRows==8)
+            dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
           // let's declare a few other temporary registers
           PacketType tmp;
@@ -300,7 +284,7 @@ static void ei_cache_friendly_product(
             if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]);
             if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
             if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
-            if (MaxBlockRows>4)
+            if (MaxBlockRows==8)
             {
               if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
               if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
@@ -316,7 +300,7 @@ static void ei_cache_friendly_product(
           if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
           if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
           if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
-          if (MaxBlockRows>4)
+          if (MaxBlockRows==8)
           {
             if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
             if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
@@ -573,16 +557,16 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
   enum { AllAligned, EvenAligned, FirstAligned, NoneAligned };
   const int rowsAtOnce = 4;
-//   const int peels = 2;
+  const int peels = 2;
   const int PacketAlignedMask = PacketSize-1;
-//   const int PeelAlignedMask = PacketSize*peels-1;
+  const int PeelAlignedMask = PacketSize*peels-1;
   const int size = rhsSize;
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type that is mandatory anyway.
   const int alignedStart = ei_alignmentOffset(rhs, size);
   const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
-  //const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
+  const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
   const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   int alignmentPattern = alignmentStep==0 ? AllAligned
@@ -650,7 +634,32 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
           _EIGEN_ACCUMULATE_PACKETS(,u,,);
        break;
      case FirstAligned:
-        for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+        if (peels>1)
+        {
+          Packet A01, A02, A03, b;
+          for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
+          {
+            b = ei_pload(&rhs[j]);
+            A01 = ei_ploadu(&lhs1[j]);
+            A02 = ei_ploadu(&lhs2[j]);
+            A03 = ei_ploadu(&lhs3[j]);
+
+            ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j]), ptmp0);
+            ptmp1 = ei_pmadd(b, A01, ptmp1);
+            A01 = ei_ploadu(&lhs1[j+PacketSize]);
+            ptmp2 = ei_pmadd(b, A02, ptmp2);
+            A02 = ei_ploadu(&lhs2[j+PacketSize]);
+            ptmp3 = ei_pmadd(b, A03, ptmp3);
+            A03 = ei_ploadu(&lhs3[j+PacketSize]);
+
+            b = ei_pload(&rhs[j+PacketSize]);
+            ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j+PacketSize]), ptmp0);
+            ptmp1 = ei_pmadd(b, A01, ptmp1);
+            ptmp2 = ei_pmadd(b, A02, ptmp2);
+            ptmp3 = ei_pmadd(b, A03, ptmp3);
+          }
+        }
+        for (int j = peeledSize; j<alignedSize; j+=PacketSize)
          _EIGEN_ACCUMULATE_PACKETS(,u,u,);
        break;
      default:
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index e6f8cdcbd..f8d924f7d 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -49,9 +49,7 @@ struct ei_traits<Map<MatrixType, Alignment> >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = MatrixType::Flags
-          & ( (HereditaryBits | LinearAccessBit | DirectAccessBit)
-            | (Alignment == Aligned ? PacketAccessBit : 0) ),
+    Flags = MatrixType::Flags,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 };
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 9deb19d32..28825ceee 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -192,12 +192,12 @@ inline __m128i ei_preduxp(const __m128i* vecs)
 }
 #if (defined __GNUC__)
-template <> inline __m128 ei_pmadd(const __m128& a, const __m128& b, const __m128& c)
-{
-  __m128 res = b;
-  asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
-  return res;
-}
+// template <> inline __m128 ei_pmadd(const __m128& a, const __m128& b, const __m128& c)
+// {
+//   __m128 res = b;
+//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
+//   return res;
+// }
 #endif
 #endif // EIGEN_PACKET_MATH_SSE_H
diff --git a/Eigen/src/QR/Tridiagonalization.h b/Eigen/src/QR/Tridiagonalization.h
index 7834c1aca..765a87130 100755
--- a/Eigen/src/QR/Tridiagonalization.h
+++ b/Eigen/src/QR/Tridiagonalization.h
@@ -34,7 +34,7 @@
   * \param MatrixType the type of the matrix of which we are performing the tridiagonalization
   *
   * This class performs a tridiagonal decomposition of a selfadjoint matrix \f$ A \f$ such that:
-  * \f$ A = Q T Q^* \f$ where \f$ Q \f$ is unitatry and \f$ T \f$ a real symmetric tridiagonal matrix
+  * \f$ A = Q T Q^* \f$ where \f$ Q \f$ is unitary and \f$ T \f$ a real symmetric tridiagonal matrix.
   *
   * \sa MatrixBase::tridiagonalize()
   */
@@ -45,12 +45,15 @@ template<typename _MatrixType> class Tridiagonalization
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef typename ei_packet_traits<Scalar>::type Packet;
     enum {
       Size = MatrixType::RowsAtCompileTime,
       SizeMinusOne = MatrixType::RowsAtCompileTime==Dynamic
-                   ? Dynamic
-                   : MatrixType::RowsAtCompileTime-1};
+                   ? Dynamic
+                   : MatrixType::RowsAtCompileTime-1,
+      PacketSize = ei_packet_traits<Scalar>::size
+    };
     typedef Matrix<Scalar, SizeMinusOne, 1> CoeffVectorType;
     typedef Matrix<RealScalar, Size, 1> DiagonalType;
@@ -59,8 +62,7 @@ template<typename _MatrixType> class Tridiagonalization
     typedef typename NestByValue<DiagonalCoeffs<MatrixType> >::RealReturnType DiagonalReturnType;
     typedef typename NestByValue<DiagonalCoeffs<
-        NestByValue<Block<
-          MatrixType,SizeMinusOne,SizeMinusOne> > > >::RealReturnType SubDiagonalReturnType;
+        NestByValue<Block<MatrixType,SizeMinusOne,SizeMinusOne> > > >::RealReturnType SubDiagonalReturnType;
     /** This constructor initializes a Tridiagonalization object for
       * further use with Tridiagonalization::compute()
@@ -103,7 +105,7 @@ template<typename _MatrixType> class Tridiagonalization
       * Householder coefficients returned by householderCoefficients(),
      * allows to reconstruct the matrix Q as follow:
      *  Q = H_{N-1} ... H_1 H_0
-      * where the matrices H are the Householder transformation:
+      * where the matrices H are the Householder transformations:
       *   H_i = (I - h_i * v_i * v_i')
      * where h_i == householderCoefficients()[i] and v_i is a Householder vector:
      *   v_i = [ 0, ..., 0, 1, M(i+2,i), ..., M(N-1,i) ]
@@ -157,8 +159,8 @@ template<typename MatrixType>
 typename Tridiagonalization<MatrixType>::MatrixType
 Tridiagonalization<MatrixType>::matrixT(void) const
 {
-  // FIXME should this function (and other similar) rather take a matrix as argument
-  // and fill it (avoids temporaries)
+  // FIXME should this function (and other similar ones) rather take a matrix as argument
+  // and fill it ? (to avoid temporaries)
   int n = m_matrix.rows();
   MatrixType matT = m_matrix;
   matT.corner(TopRight,n-1, n-1).diagonal() = subDiagonal().conjugate();
@@ -189,6 +191,7 @@ void Tridiagonalization<MatrixType>::_compute(MatrixType& matA, CoeffVectorType&
 {
   assert(matA.rows()==matA.cols());
   int n = matA.rows();
+//   std::cerr << matA << "\n\n";
   for (int i = 0; i<n-2; ++i)
   {
     // let's consider the vector v = i-th column starting at position i+1
@@ -216,22 +219,100 @@ void Tridiagonalization<MatrixType>::_compute(MatrixType& matA, CoeffVectorType&
     // i.e., A = H' A H where H = I - h v v' and v = matA.col(i).end(n-i-1)
     matA.col(i).coeffRef(i+1) = 1;
-    // let's use the end of hCoeffs to store temporary values
-    hCoeffs.end(n-i-1) = h * (matA.corner(BottomRight,n-i-1,n-i-1).template part<Lower|SelfAdjoint>()
-                              * matA.col(i).end(n-i-1));
-
-
+
+    /* This is the initial algorithm which minimize operation counts and maximize
+     * the use of Eigen's expression. Unfortunately, the first matrix-vector product
+     * using Part<Lower|Selfadjoint> is very very slow */
+    #ifdef EIGEN_NEVER_DEFINED
+    // matrix - vector product
+    hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part<Lower|SelfAdjoint>()
+                          * (h * matA.col(i).end(n-i-1))).lazy();
+    // simple axpy
     hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1)))
                           * matA.col(i).end(n-i-1);
-
-    matA.corner(BottomRight,n-i-1,n-i-1).template part<Lower>() =
-      matA.corner(BottomRight,n-i-1,n-i-1) - (
+    // rank-2 update
+    //Block<MatrixType,Dynamic,1> B(matA,i+1,i,n-i-1,1);
+    matA.corner(BottomRight,n-i-1,n-i-1).template part<Lower>() -=
        (matA.col(i).end(n-i-1) * hCoeffs.end(n-i-1).adjoint()).lazy()
-      + (hCoeffs.end(n-i-1) * matA.col(i).end(n-i-1).adjoint()).lazy() );
-    // FIXME check that the above expression does follow the lazy path (no temporary and
-    // only lower products are evaluated)
-    // FIXME can we avoid to evaluate twice the diagonal products ?
-    // (in a simple way otherwise it's overkill)
+      + (hCoeffs.end(n-i-1) * matA.col(i).end(n-i-1).adjoint()).lazy();
+    #endif
+    /* end initial algorithm */
+
+    /* If we still want to minimize operation count (i.e., perform operation on the lower part only)
+     * then we could provide the following algorithm for selfadjoint - vector product. However, a full
+     * matrix-vector product is still faster (at least for dynamic size, and not too small, did not check
+     * small matrices). The algo performs block matrix-vector and transposed matrix vector products. */
+    #ifdef EIGEN_NEVER_DEFINED
+    int n4 = (std::max(0,n-4)/4)*4;
+    hCoeffs.end(n-i-1).setZero();
+    for (int b=i+1; b<n4; b+=4)
+    {
+      // the ?x4 part:
+      hCoeffs.end(b-4) +=
+        Block<MatrixType,Dynamic,4>(matA,b+4,b,n-b-4,4) * matA.template block<4,1>(b,i);
+      // the respective transposed part:
+      Block<CoeffVectorType,4,1>(hCoeffs, b, 0, 4,1) +=
+        Block<MatrixType,Dynamic,4>(matA,b+4,b,n-b-4,4).adjoint() * Block<MatrixType,Dynamic,1>(matA,b+4,i,n-b-4,1);
+      // the 4x4 block diagonal:
+      Block<CoeffVectorType,4,1>(hCoeffs, b, 0, 4,1) +=
+        (Block<MatrixType,4,4>(matA,b,b,4,4).template part<Lower|SelfAdjoint>()
+         * (h * Block<MatrixType,4,1>(matA,b,i,4,1))).lazy();
+    }
+    #endif
+    // todo: handle the remaining part
+    /* end optimized selfadjoint - vector product */
+
+    /* Another interesting note: the above rank-2 update is much slower than the following hand written loop.
+     * After an analyse of the ASM, it seems GCC (4.2) generate poor code because of the Block. Moreover,
+     * if we remove the specialization of Block for Matrix then it is even worse, much worse ! */
+    #ifdef EIGEN_NEVER_DEFINED
+    for (int j1=i+1; j1<n; ++j1)
+      for (int i1=j1; i1<n; i1++)
+        matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
+                              + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));
+    #endif
+    /* end hand writen partial rank-2 update */
+
+    /* The current fastest implementation: the full matrix is used, no "optimization" to use/compute
+     * only half of the matrix. Custom vectorization of the inner col -= alpha X + beta Y such that access
+     * to col are always aligned. Once we support that in Assign, then the algorithm could be rewriten as
+     * a single compact expression. This code is therefore a good benchmark when will do that. */
+
+    // let's use the end of hCoeffs to store temporary values:
+    hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1) * (h * matA.col(i).end(n-i-1))).lazy();
+    // FIXME in the above expr a temporary is created because of the scalar multiple by h
+
+    hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1)))
+                          * matA.col(i).end(n-i-1);
+
+    const Scalar* __restrict__ pb = &matA.coeffRef(0,i);
+    const Scalar* __restrict__ pa = (&hCoeffs.coeffRef(0)) - 1;
+    for (int j1=i+1; j1<n; ++j1)
+    {
+      int starti = i+1;
+      int alignedEnd = starti;
+      if (PacketSize>1)
+      {
+        int alignedStart = (starti) + ei_alignmentOffset(&matA.coeffRef(starti,j1), n-starti);
+        alignedEnd = alignedStart + ((n-alignedStart)/PacketSize)*PacketSize;
+
+        for (int i1=starti; i1<alignedStart; ++i1)
+          matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
+                                + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));
+
+        Packet tmp0 = ei_pset1(hCoeffs.coeff(j1-1));
+        Packet tmp1 = ei_pset1(matA.coeff(j1,i));
+        Scalar* pc = &matA.coeffRef(0,j1);
+        for (int i1=alignedStart ; i1<alignedEnd; i1+=PacketSize)
+          ei_pstore(pc+i1,ei_psub(ei_pload(pc+i1),
+            ei_padd(ei_pmul(tmp0, ei_ploadu(pb+i1)),
+                    ei_pmul(tmp1, ei_ploadu(pa+i1)))));
+      }
+      for (int i1=alignedEnd; i1<n; ++i1)
+        matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1))
+                              + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i));
+    }
+    /* end optimized implemenation */
     // note: at that point matA(i+1,i+1) is the (i+1)-th element of the final diagonal
     // note: the sequence of the beta values leads to the subdiagonal entries
@@ -286,7 +367,6 @@ void Tridiagonalization<MatrixType>::decomposeInPlace(MatrixType& mat, DiagonalT
   ei_assert(mat.cols()==n && diag.size()==n && subdiag.size()==n-1);
   if (n==3 && (!NumTraits<Scalar>::IsComplex) )
   {
-    Tridiagonalization tridiag(mat);
     _decomposeInPlace3x3(mat, diag, subdiag, extractQ);
   }
   else
@@ -301,7 +381,7 @@ void Tridiagonalization<MatrixType>::decomposeInPlace(MatrixType& mat, DiagonalT
 /** \internal
   * Optimized path for 3x3 matrices.
-  * Especially usefull for plane fit.
+  * Especially useful for plane fitting.
   */
 template<typename MatrixType>
 void Tridiagonalization<MatrixType>::_decomposeInPlace3x3(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
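
Note on the rhs staging used in CacheFriendlyProduct.h above: the patch copies each rhs block once into an aligned scratch buffer (rhsCopy) so the innermost kernel can use packet loads without a per-column alignment branch. The snippet below is a minimal, self-contained sketch of that idea using raw SSE intrinsics rather than Eigen's ei_pload/ei_pmadd wrappers; the names dot_with_aligned_copy and scratch are illustrative only and not part of Eigen.

// Minimal sketch (not the Eigen kernel itself): stage a possibly unaligned rhs
// column once into a 16-byte aligned scratch buffer so the hot loop can use
// aligned packet loads and needs no per-iteration alignment test.
#include <xmmintrin.h>  // SSE intrinsics
#include <cstring>
#include <cstdio>

// n is assumed to be a multiple of 4; lhs and scratch must be 16-byte aligned.
float dot_with_aligned_copy(const float* lhs, const float* rhs, int n, float* scratch)
{
  std::memcpy(scratch, rhs, n * sizeof(float));  // one copy instead of many unaligned loads
  __m128 acc = _mm_setzero_ps();
  for (int k = 0; k < n; k += 4)
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(lhs + k),        // aligned load
                                     _mm_load_ps(scratch + k)));  // aligned load from the copy
  float tmp[4];
  _mm_storeu_ps(tmp, acc);
  return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}

int main()
{
  float lhs[8]     __attribute__((aligned(16))) = {1,2,3,4,5,6,7,8};
  float buffer[9]  __attribute__((aligned(16))) = {0,1,1,1,1,1,1,1,1};
  float scratch[8] __attribute__((aligned(16)));
  const float* rhs = buffer + 1;  // deliberately misaligned source
  std::printf("%f\n", dot_with_aligned_copy(lhs, rhs, 8, scratch));  // prints 36.000000
  return 0;
}

The copy costs one extra pass over the block, but it removes the alignment check from the innermost loop, which is the same trade-off the patch makes with needRhsCopy and the l2BlockSizeAligned-sized rhsCopy buffer.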