Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/Core | 8 |
-rw-r--r-- | Eigen/QR | 1 |
-rw-r--r-- | Eigen/src/Core/Block.h | 4 |
-rw-r--r-- | Eigen/src/Core/CacheFriendlyProduct.h | 353 |
-rw-r--r-- | Eigen/src/Core/Map.h | 2 |
-rw-r--r-- | Eigen/src/Core/MathFunctions.h | 2 |
-rw-r--r-- | Eigen/src/Core/Matrix.h | 8 |
-rw-r--r-- | Eigen/src/Core/MatrixBase.h | 9 |
-rw-r--r-- | Eigen/src/Core/Product.h | 18 |
-rw-r--r-- | Eigen/src/Core/ProductWIP.h | 380 |
-rw-r--r-- | Eigen/src/Core/Transpose.h | 2 |
-rwxr-xr-x | Eigen/src/Core/Triangular.h | 2 |
-rw-r--r-- | Eigen/src/Core/util/Constants.h | 6 |
-rw-r--r-- | Eigen/src/QR/EigenSolver.h | 848 |
-rw-r--r-- | Eigen/src/QR/QR.h | 7 |
15 files changed, 1286 insertions, 364 deletions
diff --git a/Eigen/Core b/Eigen/Core index f2a1ad1d8..a0eef26f7 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -18,9 +18,9 @@ #ifdef __SSE3__ #include <pmmintrin.h> #endif - //#ifdef __SSSE3__ - //#include <tmmintrin.h> - //#endif + #ifdef __SSSE3__ + #include <tmmintrin.h> + #endif #elif (defined __ALTIVEC__) #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_ALTIVEC @@ -69,7 +69,7 @@ namespace Eigen { #include "src/Core/CwiseBinaryOp.h" #include "src/Core/CwiseUnaryOp.h" #include "src/Core/CwiseNullaryOp.h" -#if (defined EIGEN_WIP_PRODUCT) && (defined EIGEN_VECTORIZE) +#if (defined EIGEN_WIP_PRODUCT) #include "src/Core/ProductWIP.h" #else #include "src/Core/Product.h" @@ -6,6 +6,7 @@ namespace Eigen { #include "Eigen/src/QR/QR.h" +#include "Eigen/src/QR/EigenSolver.h" } // namespace Eigen diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 00bb375a9..4b417224c 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -71,7 +71,7 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> > || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic)) ? ~LargeBit : ~(unsigned int)0, - Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | ReferencableBit) & FlagsMaskLargeBit, + Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | DirectAccessBit) & FlagsMaskLargeBit, CoeffReadCost = MatrixType::CoeffReadCost }; }; @@ -132,6 +132,8 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block int _rows() const { return m_blockRows.value(); } int _cols() const { return m_blockCols.value(); } + int _stride(void) const { return m_matrix.stride(); } + Scalar& _coeffRef(int row, int col) { return m_matrix.const_cast_derived() diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h new file mode 100644 index 000000000..b484b1786 --- /dev/null +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -0,0 +1,353 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. Eigen itself is part of the KDE project. +// +// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. 
+ +#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H +#define EIGEN_CACHE_FRIENDLY_PRODUCT_H + +template<typename Scalar> +static void ei_cache_friendly_product( + int _rows, int _cols, int depth, + bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, + bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, + bool resRowMajor, Scalar* res, int resStride) +{ + const Scalar* __restrict__ lhs; + const Scalar* __restrict__ rhs; + int lhsStride, rhsStride, rows, cols; + bool lhsRowMajor; + + if (resRowMajor) + { + lhs = _rhs; + rhs = _lhs; + lhsStride = _rhsStride; + rhsStride = _lhsStride; + cols = _rows; + rows = _cols; + lhsRowMajor = _rhsRowMajor; + ei_assert(_lhsRowMajor); + } + else + { + lhs = _lhs; + rhs = _rhs; + lhsStride = _lhsStride; + rhsStride = _rhsStride; + rows = _rows; + cols = _cols; + lhsRowMajor = _lhsRowMajor; + ei_assert(!_rhsRowMajor); + } + + typedef typename ei_packet_traits<Scalar>::type PacketType; + + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + #if (defined __i386__) + // i386 architecture provides only 8 xmm registers, + // so let's reduce the max number of rows processed at once. + MaxBlockRows = 4, + MaxBlockRows_ClampingMask = 0xFFFFFC, + #else + MaxBlockRows = 8, + MaxBlockRows_ClampingMask = 0xFFFFF8, + #endif + // maximal size of the blocks fitted in L2 cache + MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar) + }; + + + //const bool rhsIsAligned = (PacketSize==1) || (((rhsStride%PacketSize) == 0) && (size_t(rhs)%16==0)); + const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); + + const int remainingSize = depth % PacketSize; + const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries + const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize; + const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize; + const int l2BlockSize = MaxL2BlockSize > size ? 
size : MaxL2BlockSize; + Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size); + Scalar* __restrict__ rhsCopy = (Scalar*)alloca(sizeof(Scalar)*l2BlockSize); + + // loops on each L2 cache friendly blocks of the result + for(int l2i=0; l2i<rows; l2i+=l2BlockRows) + { + const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows); + const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw + const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows + //const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows); + + // build a cache friendly blocky matrix + int count = 0; + + // copy l2blocksize rows of m_lhs to blocks of ps x bw + asm("#eigen begin buildblocks"); + for(int l2k=0; l2k<size; l2k+=l2BlockSize) + { + const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size); + + for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows) + { + // TODO merge the if l2blockRemainingRows +// const int blockRows = std::min(i+MaxBlockRows, rows) - i; + + for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize) + { + // TODO write these loops using meta unrolling + // negligible for large matrices but useful for small ones + if (lhsRowMajor) + { + for (int w=0; w<MaxBlockRows; ++w) + for (int s=0; s<PacketSize; ++s) + block[count++] = lhs[(i+w)*lhsStride + (k+s)]; + } + else + { + for (int w=0; w<MaxBlockRows; ++w) + for (int s=0; s<PacketSize; ++s) + block[count++] = lhs[(i+w) + (k+s)*lhsStride]; + } + } + } + if (l2blockRemainingRows>0) + { + for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize) + { + if (lhsRowMajor) + { + for (int w=0; w<l2blockRemainingRows; ++w) + for (int s=0; s<PacketSize; ++s) + block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)]; + } + else + { + for (int w=0; w<l2blockRemainingRows; ++w) + for (int s=0; s<PacketSize; ++s) + block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride]; + } + } + } + } + asm("#eigen end buildblocks"); + + for(int l2j=0; l2j<cols; l2j+=l2BlockCols) + { + int l2blockColEnd = std::min(l2j+l2BlockCols, cols); + + for(int l2k=0; l2k<size; l2k+=l2BlockSize) + { + // acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res + int l2blockSizeEnd = std::min(l2k+l2BlockSize, size); + + // for each bw x 1 result's block + for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows) + { + for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1) + { + int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows; + const Scalar* __restrict__ localB = &block[offsetblock]; + + const Scalar* __restrict__ rhsColumn = &(rhs[l1j*rhsStride]); + + // copy unaligned rhs data + // YES it seems to be faster to copy some part of rhs multiple times + // to aligned memory rather than using unligned load. 
+ // Moreover this avoids a "if" in the most nested loop :) + if (PacketSize>1 && size_t(rhsColumn)%16) + { + int count = 0; + for (int k = l2k; k<l2blockSizeEnd; ++k) + { + rhsCopy[count++] = rhsColumn[k]; + } + rhsColumn = &(rhsCopy[-l2k]); + } + + PacketType dst[MaxBlockRows]; + dst[0] = ei_pset1(Scalar(0.)); + dst[1] = dst[0]; + dst[2] = dst[0]; + dst[3] = dst[0]; + if (MaxBlockRows==8) + { + dst[4] = dst[0]; + dst[5] = dst[0]; + dst[6] = dst[0]; + dst[7] = dst[0]; + } + + PacketType tmp; + + asm("#eigen begincore"); + for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize) + { + tmp = ei_pload(&rhsColumn[k]); + + dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows ])), dst[0]); + dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+ PacketSize])), dst[1]); + dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+2*PacketSize])), dst[2]); + dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+3*PacketSize])), dst[3]); + if (MaxBlockRows==8) + { + dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+4*PacketSize])), dst[4]); + dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+5*PacketSize])), dst[5]); + dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+6*PacketSize])), dst[6]); + dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+7*PacketSize])), dst[7]); + } + } + + Scalar* __restrict__ localRes = &(res[l1i + l1j*resStride]); + + if (PacketSize>1 && resIsAligned) + { + ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(dst))); + if (PacketSize==2) + ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2])))); + if (MaxBlockRows==8) + { + ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4])))); + if (PacketSize==2) + ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6])))); + } + } + else + { + localRes[0] += ei_predux(dst[0]); + localRes[1] += ei_predux(dst[1]); + localRes[2] += ei_predux(dst[2]); + localRes[3] += ei_predux(dst[3]); + if (MaxBlockRows==8) + { + localRes[4] += ei_predux(dst[4]); + localRes[5] += ei_predux(dst[5]); + localRes[6] += ei_predux(dst[6]); + localRes[7] += ei_predux(dst[7]); + } + } + asm("#eigen endcore"); + } + } + if (l2blockRemainingRows>0) + { + int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows; + const Scalar* localB = &block[offsetblock]; + + asm("#eigen begin dynkernel"); + for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1) + { + const Scalar* __restrict__ rhsColumn = &(rhs[l1j*rhsStride]); + + // copy unaligned rhs data + if (PacketSize>1 && size_t(rhsColumn)%16) + { + int count = 0; + for (int k = l2k; k<l2blockSizeEnd; ++k) + { + rhsCopy[count++] = rhsColumn[k]; + } + rhsColumn = &(rhsCopy[-l2k]); + } + + PacketType dst[MaxBlockRows]; + dst[0] = ei_pset1(Scalar(0.)); + dst[1] = dst[0]; + dst[2] = dst[0]; + dst[3] = dst[0]; + if (MaxBlockRows>4) + { + dst[4] = dst[0]; + dst[5] = dst[0]; + dst[6] = dst[0]; + dst[7] = dst[0]; + } + + // let's declare a few other temporary registers + PacketType tmp; + + for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize) + { + tmp = ei_pload(&rhsColumn[k]); + + dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows ])), dst[0]); + if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]); + if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]); + if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, 
ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]); + if (MaxBlockRows>4) + { + if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]); + if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]); + if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]); + if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]); + } + } + + Scalar* __restrict__ localRes = &(res[l2blockRowEndBW + l1j*resStride]); + + // process the remaining rows once at a time + localRes[0] += ei_predux(dst[0]); + if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]); + if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]); + if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]); + if (MaxBlockRows>4) + { + if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]); + if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]); + if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]); + if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]); + } + + asm("#eigen end dynkernel"); + } + } + } + } + } + if (PacketSize>1 && remainingSize) + { + if (lhsRowMajor) + { + for (int j=0; j<cols; ++j) + for (int i=0; i<rows; ++i) + { + Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size]; + for (int k=1; k<remainingSize; ++k) + tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k]; + res[i+j*resStride] += tmp; + } + } + else + { + for (int j=0; j<cols; ++j) + for (int i=0; i<rows; ++i) + { + Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size]; + for (int k=1; k<remainingSize; ++k) + tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k]; + res[i+j*resStride] += tmp; + } + } + } +} + + +#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index e9f65cd7a..1f9a0f244 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -47,7 +47,7 @@ struct ei_traits<Map<MatrixType> > ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Flags = MatrixType::Flags & (DefaultLostFlagMask | ReferencableBit), + Flags = MatrixType::Flags & (DefaultLostFlagMask | DirectAccessBit), CoeffReadCost = NumTraits<Scalar>::ReadCost }; }; diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 64ae7f97b..5ba37c076 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -46,7 +46,7 @@ inline int ei_log(int) { ei_assert(false); return 0; } inline int ei_sin(int) { ei_assert(false); return 0; } inline int ei_cos(int) { ei_assert(false); return 0; } -#if EIGEN_GNUC_AT_LEAST(4,3) +#if EIGEN_GNUC_AT_LEAST(4,2) inline int ei_pow(int x, int y) { return int(std::pow(double(x), y)); } #else inline int ei_pow(int x, int y) { return std::pow(x, y); } diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 922c3ddae..016a9ef06 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -100,6 +100,14 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Flags, _MaxRows, int _rows() const { return m_storage.rows(); } int _cols() const { return m_storage.cols(); } + int _stride(void) const + { + if(Flags & RowMajorBit) + return m_storage.cols(); + else + return m_storage.rows(); + } + const Scalar& _coeff(int 
row, int col) const { if(Flags & RowMajorBit) diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 1e46b10d2..5baa42261 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -185,7 +185,7 @@ template<typename Derived> class MatrixBase /** Overloaded for optimal product evaluation */ template<typename Derived1, typename Derived2> - Derived& lazyAssign(const Product<Derived1,Derived2,CacheOptimalProduct>& product); + Derived& lazyAssign(const Product<Derived1,Derived2,CacheFriendlyProduct>& product); CommaInitializer operator<< (const Scalar& s); @@ -419,6 +419,13 @@ template<typename Derived> class MatrixBase const Lazy<Derived> lazy() const; const Temporary<Derived> temporary() const; + + /** \returns number of elements to skip to pass from one row (resp. column) to another + * for a row-major (resp. column-major) matrix. + * Combined with coeffRef() and the compile times flags, it allows a direct access to the data + * of the underlying matrix. + */ + int stride(void) const { return derived()._stride(); } //@} /// \name Coefficient-wise operations diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 66c6d4d5b..dbd8b6cdd 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -60,6 +60,12 @@ struct ei_product_unroller<Index, 0, Lhs, Rhs> static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {} }; +template<typename Lhs, typename Rhs> +struct ei_product_unroller<0, Dynamic, Lhs, Rhs> +{ + static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {} +}; + template<bool RowMajor, int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar> struct ei_packet_product_unroller; @@ -113,6 +119,12 @@ struct ei_packet_product_unroller<false, Index, Dynamic, Lhs, Rhs, PacketScalar> static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {} }; +template<typename Lhs, typename Rhs, typename PacketScalar> +struct ei_packet_product_unroller<false, 0, Dynamic, Lhs, Rhs, PacketScalar> +{ + static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {} +}; + template<typename Product, bool RowMajor = true> struct ProductPacketCoeffImpl { inline static typename Product::PacketScalar execute(const Product& product, int row, int col) { return product._packetCoeffRowMajor(row,col); } @@ -142,7 +154,7 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit))) - ? CacheOptimalProduct : NormalProduct }; + ? CacheFriendlyProduct : NormalProduct }; }; template<typename Lhs, typename Rhs, int EvalMode> @@ -166,7 +178,7 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> > _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0), _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0, _RowMajor = (RhsFlags & RowMajorBit) - && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), + && (EvalMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), _LostBits = DefaultLostFlagMask & ~( (_RowMajor ? 0 : RowMajorBit) | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 
0 : LargeBit)), @@ -312,7 +324,7 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other) template<typename Derived> template<typename Lhs, typename Rhs> -Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProduct>& product) +Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFriendlyProduct>& product) { product.template _cacheOptimalEval<Derived, Aligned>(derived(), #ifdef EIGEN_VECTORIZE diff --git a/Eigen/src/Core/ProductWIP.h b/Eigen/src/Core/ProductWIP.h index a5fc9d298..0a7ef43e9 100644 --- a/Eigen/src/Core/ProductWIP.h +++ b/Eigen/src/Core/ProductWIP.h @@ -26,9 +26,7 @@ #ifndef EIGEN_PRODUCT_H #define EIGEN_PRODUCT_H -#ifndef EIGEN_VECTORIZE -#error you must enable vectorization to try this experimental product implementation -#endif +#include "CacheFriendlyProduct.h" template<int Index, int Size, typename Lhs, typename Rhs> struct ei_product_unroller @@ -145,7 +143,7 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode { enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - ? CacheOptimalProduct : NormalProduct }; + ? CacheFriendlyProduct : NormalProduct }; }; template<typename T> class ei_product_eval_to_column_major @@ -173,7 +171,22 @@ template<typename T, int n=1> struct ei_product_nested_rhs typename ei_meta_if< (ei_traits<T>::Flags & EvalBeforeNestingBit) || (ei_traits<T>::Flags & RowMajorBit) - || (!(ei_traits<T>::Flags & ReferencableBit)) + || (!(ei_traits<T>::Flags & DirectAccessBit)) + || (n+1) * NumTraits<typename ei_traits<T>::Scalar>::ReadCost < (n-1) * T::CoeffReadCost, + typename ei_product_eval_to_column_major<T>::type, + const T& + >::ret + >::ret type; +}; + +template<typename T, int n=1> struct ei_product_nested_lhs +{ + typedef typename ei_meta_if< + ei_is_temporary<T>::ret && !(ei_traits<T>::Flags & RowMajorBit), + T, + typename ei_meta_if< + (ei_traits<T>::Flags & EvalBeforeNestingBit) + || (!(ei_traits<T>::Flags & DirectAccessBit)) || (n+1) * NumTraits<typename ei_traits<T>::Scalar>::ReadCost < (n-1) * T::CoeffReadCost, typename ei_product_eval_to_column_major<T>::type, const T& @@ -187,9 +200,12 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> > typedef typename Lhs::Scalar Scalar; // the cache friendly product evals lhs once only // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ? - typedef typename ei_nested<Lhs, EvalMode==CacheOptimalProduct ? 0 : Rhs::ColsAtCompileTime>::type LhsNested; + typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct, + typename ei_product_nested_lhs<Rhs,0>::type, + typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type>::ret LhsNested; + // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation - typedef typename ei_meta_if<EvalMode==CacheOptimalProduct, + typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct, typename ei_product_nested_rhs<Rhs,Lhs::RowsAtCompileTime>::type, typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type>::ret RhsNested; typedef typename ei_unref<LhsNested>::type _LhsNested; @@ -209,7 +225,7 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> > _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0), _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 0 : 0, _RowMajor = (RhsFlags & RowMajorBit) - && (EvalMode==(int)CacheOptimalProduct ? 
(int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), + && (EvalMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), _LostBits = DefaultLostFlagMask & ~( (_RowMajor ? 0 : RowMajorBit) | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)), @@ -241,19 +257,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm typedef typename ei_traits<Product>::_RhsNested _RhsNested; enum { - PacketSize = ei_packet_traits<Scalar>::size, - #if (defined __i386__) - // i386 architectures provides only 8 xmmm register, - // so let's reduce the max number of rows processed at once. - // NOTE that so far the maximal supported value is 8. - MaxBlockRows = 4, - MaxBlockRows_ClampingMask = 0xFFFFFC, - #else - MaxBlockRows = 8, - MaxBlockRows_ClampingMask = 0xFFFFF8, - #endif - // maximal size of the blocks fitted in L2 cache - MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar) + PacketSize = ei_packet_traits<Scalar>::size }; Product(const Lhs& lhs, const Rhs& rhs) @@ -327,14 +331,8 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm } /** \internal */ - template<typename DestDerived, int RhsAlignment, int ResAlignment> - void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline)); - - /** \internal */ - template<typename DestDerived, int RhsAlignment, int ResAlignment, int BlockRows> - void _cacheFriendlyEvalKernel(DestDerived& res, - int l2i, int l2j, int l2k, int l1i, - int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE; + template<typename DestDerived, int RhsAlignment> + void _cacheFriendlyEvalImpl(DestDerived& res) const EIGEN_DONT_INLINE; protected: const LhsNested m_lhs; @@ -370,7 +368,7 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other) template<typename Derived> template<typename Lhs, typename Rhs> -Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProduct>& product) +Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFriendlyProduct>& product) { product._cacheFriendlyEval(derived()); return derived(); @@ -380,326 +378,16 @@ template<typename Lhs, typename Rhs, int EvalMode> template<typename DestDerived> void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEval(DestDerived& res) const { - const bool rhsIsAligned = (m_lhs.cols()%PacketSize == 0); - const bool resIsAligned = ((_rows()%PacketSize) == 0); - - if (rhsIsAligned && resIsAligned) - _cacheFriendlyEvalImpl<DestDerived, Aligned, Aligned>(res); - else if (rhsIsAligned && (!resIsAligned)) - _cacheFriendlyEvalImpl<DestDerived, Aligned, UnAligned>(res); - else if ((!rhsIsAligned) && resIsAligned) - _cacheFriendlyEvalImpl<DestDerived, UnAligned, Aligned>(res); - else - _cacheFriendlyEvalImpl<DestDerived, UnAligned, UnAligned>(res); - -} - -template<typename Lhs, typename Rhs, int EvalMode> -template<typename DestDerived, int RhsAlignment, int ResAlignment, int BlockRows> -void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res, - int l2i, int l2j, int l2k, int l1i, - int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const -{ - asm("#eigen begin kernel"); - - ei_internal_assert(BlockRows<=8); - - // NOTE: sounds like we cannot rely on meta-unrolling to access dst[I] without enforcing GCC - // to create the dst's elements in memory, hence killing the performance. 
- - for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1) - { - int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*BlockRows; - const Scalar* localB = &block[offsetblock]; - -// int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ? - Scalar* rhsColumn = &(m_rhs.const_cast_derived().coeffRef(0, l1j)); - - // don't worry, dst is a set of registers - PacketScalar dst[BlockRows]; - dst[0] = ei_pset1(Scalar(0.)); - switch(BlockRows) - { - case 8: dst[7] = dst[0]; - case 7: dst[6] = dst[0]; - case 6: dst[5] = dst[0]; - case 5: dst[4] = dst[0]; - case 4: dst[3] = dst[0]; - case 3: dst[2] = dst[0]; - case 2: dst[1] = dst[0]; - default: break; - } - - // let's declare a few other temporary registers - PacketScalar tmp, tmp1; - - // unaligned loads are expensive, therefore let's preload the next element in advance - if (RhsAlignment==UnAligned) - //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize+l2k]); - tmp1 = ei_ploadu(&rhsColumn[l2k]); - - for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize) - { - // FIXME if we don't cache l1j*m_lhs.cols() then the performance are poor, - // let's directly access to the data - //PacketScalar tmp = m_rhs.template packetCoeff<Aligned>(k, l1j); - if (RhsAlignment==Aligned) - { - //tmp = ei_pload(&m_rhs.data()[l1jsize + k]); - tmp = ei_pload(&rhsColumn[k]); - } - else - { - tmp = tmp1; - if (k+PacketSize<l2blockSizeEnd) - //tmp1 = ei_ploadu(&m_rhs.data()[l1jsize + k+PacketSize]); - tmp1 = ei_ploadu(&rhsColumn[k+PacketSize]); - } - - dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows ])), dst[0]); - if (BlockRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+ PacketSize])), dst[1]); - if (BlockRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+2*PacketSize])), dst[2]); - if (BlockRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+3*PacketSize])), dst[3]); - if (BlockRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+4*PacketSize])), dst[4]); - if (BlockRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+5*PacketSize])), dst[5]); - if (BlockRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+6*PacketSize])), dst[6]); - if (BlockRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+7*PacketSize])), dst[7]); - } - - enum { - // Number of rows we can reduce per packet - PacketRows = (ResAlignment==Aligned && PacketSize>1) ? 
(BlockRows / PacketSize) : 0, - // First row index from which we have to to do redux once at a time - RemainingStart = PacketSize * PacketRows - }; - - // we have up to 4 packets (for doubles: 8 rows / 2) - if (PacketRows>=1) - res.template writePacketCoeff<Aligned>(l1i, l1j, - ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_preduxp(&(dst[0])))); - if (PacketRows>=2) - res.template writePacketCoeff<Aligned>(l1i+PacketSize, l1j, - ei_padd(res.template packetCoeff<Aligned>(l1i+PacketSize, l1j), ei_preduxp(&(dst[PacketSize])))); - if (PacketRows>=3) - res.template writePacketCoeff<Aligned>(l1i+2*PacketSize, l1j, - ei_padd(res.template packetCoeff<Aligned>(l1i+2*PacketSize, l1j), ei_preduxp(&(dst[2*PacketSize])))); - if (PacketRows>=4) - res.template writePacketCoeff<Aligned>(l1i+3*PacketSize, l1j, - ei_padd(res.template packetCoeff<Aligned>(l1i+3*PacketSize, l1j), ei_preduxp(&(dst[3*PacketSize])))); - - // process the remaining rows one at a time - if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]); - if (RemainingStart<=1 && BlockRows>=2) res.coeffRef(l1i+1, l1j) += ei_predux(dst[1]); - if (RemainingStart<=2 && BlockRows>=3) res.coeffRef(l1i+2, l1j) += ei_predux(dst[2]); - if (RemainingStart<=3 && BlockRows>=4) res.coeffRef(l1i+3, l1j) += ei_predux(dst[3]); - if (RemainingStart<=4 && BlockRows>=5) res.coeffRef(l1i+4, l1j) += ei_predux(dst[4]); - if (RemainingStart<=5 && BlockRows>=6) res.coeffRef(l1i+5, l1j) += ei_predux(dst[5]); - if (RemainingStart<=6 && BlockRows>=7) res.coeffRef(l1i+6, l1j) += ei_predux(dst[6]); - if (RemainingStart<=7 && BlockRows>=8) res.coeffRef(l1i+7, l1j) += ei_predux(dst[7]); - - asm("#eigen end kernel"); - } -} - -template<typename Lhs, typename Rhs, int EvalMode> -template<typename DestDerived, int RhsAlignment, int ResAlignment> -void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const -{ - // FIXME find a way to optimize: (an_xpr) + (a * b) - // then we don't need to clear res and avoid and additional mat-mat sum #ifndef EIGEN_WIP_PRODUCT_DIRTY -// std::cout << "wip product\n"; res.setZero(); #endif - const int rows = _rows(); - const int cols = _cols(); - const int remainingSize = m_lhs.cols()%PacketSize; - const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries - const int l2BlockRows = MaxL2BlockSize > _rows() ? _rows() : MaxL2BlockSize; - const int l2BlockCols = MaxL2BlockSize > _cols() ? _cols() : MaxL2BlockSize; - const int l2BlockSize = MaxL2BlockSize > size ? 
size : MaxL2BlockSize; - //Scalar* __restrict__ block = new Scalar[l2blocksize*size];; - Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size); - - // loops on each L2 cache friendly blocks of the result - for(int l2i=0; l2i<_rows(); l2i+=l2BlockRows) - { - const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows); - const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw - const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows - - // build a cache friendly block - int count = 0; - - // copy l2blocksize rows of m_lhs to blocks of ps x bw - for(int l2k=0; l2k<size; l2k+=l2BlockSize) - { - const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size); - - for (int i = l2i; i<l2blockRowEndBW; i+=MaxBlockRows) - { - for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize) - { - // TODO write these two loops using meta unrolling - // negligible for large matrices but useful for small ones - for (int w=0; w<MaxBlockRows; ++w) - for (int s=0; s<PacketSize; ++s) - block[count++] = m_lhs.coeff(i+w,k+s); - } - } - if (l2blockRemainingRows>0) - { - for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize) - { - for (int w=0; w<l2blockRemainingRows; ++w) - for (int s=0; s<PacketSize; ++s) - block[count++] = m_lhs.coeff(l2blockRowEndBW+w,k+s); - } - } - } - - for(int l2j=0; l2j<cols; l2j+=l2BlockCols) - { - int l2blockColEnd = std::min(l2j+l2BlockCols, cols); - - for(int l2k=0; l2k<size; l2k+=l2BlockSize) - { - // acumulate a bw rows of lhs time a single column of rhs to a bw x 1 block of res - int l2blockSizeEnd = std::min(l2k+l2BlockSize, size); - - // for each bw x 1 result's block - for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows) - { - _cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, MaxBlockRows>( - res, l2i, l2j, l2k, l1i, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); -#if 0 - for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1) - { - int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows; - const Scalar* localB = &block[offsetblock]; - - int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ? 
- - PacketScalar dst[bw]; - dst[0] = ei_pset1(Scalar(0.)); - dst[1] = dst[0]; - dst[2] = dst[0]; - dst[3] = dst[0]; - if (MaxBlockRows==8) - { - dst[4] = dst[0]; - dst[5] = dst[0]; - dst[6] = dst[0]; - dst[7] = dst[0]; - } - PacketScalar b0, b1, tmp; - // TODO in unaligned mode, preload the next element -// PacketScalar tmp1 = _mm_load_ps(&m_rhs.derived().data()[l1jsize+l2k]); - asm("#eigen begincore"); - for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize) - { -// PacketScalar tmp = m_rhs.template packetCoeff<Aligned>(k, l1j); - // TODO make this branching compile time (costly for doubles) - if (rhsIsAligned) - tmp = ei_pload(&m_rhs.derived().data()[l1jsize + k]); - else - tmp = ei_ploadu(&m_rhs.derived().data()[l1jsize + k]); - - b0 = ei_pload(&(localB[k*bw])); - b1 = ei_pload(&(localB[k*bw+ps])); - dst[0] = ei_pmadd(tmp, b0, dst[0]); - b0 = ei_pload(&(localB[k*bw+2*ps])); - dst[1] = ei_pmadd(tmp, b1, dst[1]); - b1 = ei_pload(&(localB[k*bw+3*ps])); - dst[2] = ei_pmadd(tmp, b0, dst[2]); - if (MaxBlockRows==8) - b0 = ei_pload(&(localB[k*bw+4*ps])); - dst[3] = ei_pmadd(tmp, b1, dst[3]); - if (MaxBlockRows==8) - { - b1 = ei_pload(&(localB[k*bw+5*ps])); - dst[4] = ei_pmadd(tmp, b0, dst[4]); - b0 = ei_pload(&(localB[k*bw+6*ps])); - dst[5] = ei_pmadd(tmp, b1, dst[5]); - b1 = ei_pload(&(localB[k*bw+7*ps])); - dst[6] = ei_pmadd(tmp, b0, dst[6]); - dst[7] = ei_pmadd(tmp, b1, dst[7]); - } - } - -// if (resIsAligned) - { - res.template writePacketCoeff<Aligned>(l1i, l1j, ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_preduxp(dst))); - if (PacketSize==2) - res.template writePacketCoeff<Aligned>(l1i+2,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+2,l1j), ei_preduxp(&(dst[2])))); - if (MaxBlockRows==8) - { - res.template writePacketCoeff<Aligned>(l1i+4,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+4,l1j), ei_preduxp(&(dst[4])))); - if (PacketSize==2) - res.template writePacketCoeff<Aligned>(l1i+6,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+6,l1j), ei_preduxp(&(dst[6])))); - } - } -// else -// { -// // TODO uncommenting this code kill the perf, even though it is never called !! -// // this is because dst cannot be a set of registers only -// // TODO optimize this loop -// // TODO is it better to do one redux at once or packet reduxes + unaligned store ? 
-// for (int w = 0; w<bw; ++w) -// res.coeffRef(l1i+w, l1j) += ei_predux(dst[w]); -// std::cout << "!\n"; -// } - - asm("#eigen endcore"); - } -#endif - } - if (l2blockRemainingRows>0) - { - // this is an attempt to build an array of kernels, but I did not manage to get it compiles -// typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*); -// Kernel kernels[8]; -// kernels[0] = (Kernel)(&Product<Lhs,Rhs,EvalMode>::template _cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 1>); -// kernels[l2blockRemainingRows](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); - - switch(l2blockRemainingRows) - { - case 1:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 1>( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 2:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 2>( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 3:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 3>( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 4:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 4>( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 5:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 5>( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 6:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 6>( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 7:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 7>( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - default: - ei_internal_assert(false && "internal error"); break; - } - } - } - } - } - - // handle the part which cannot be processed by the vectorized path - if (remainingSize) - { - res += Product< - Block<typename ei_unconst<_LhsNested>::type,Dynamic,Dynamic>, - Block<typename ei_unconst<_RhsNested>::type,Dynamic,Dynamic>, - NormalProduct>( - m_lhs.block(0,size, _rows(), remainingSize), - m_rhs.block(size,0, remainingSize, _cols())).lazy(); -// res += m_lhs.block(0,size, _rows(), remainingSize)._lazyProduct(m_rhs.block(size,0, remainingSize, _cols())); - } - -// delete[] block; + ei_cache_friendly_product<Scalar>( + _rows(), _cols(), m_lhs.cols(), + _LhsNested::Flags&RowMajorBit, &(m_lhs.const_cast_derived().coeffRef(0,0)), m_lhs.stride(), + _RhsNested::Flags&RowMajorBit, &(m_rhs.const_cast_derived().coeffRef(0,0)), m_rhs.stride(), + Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride() + ); } #endif // EIGEN_PRODUCT_H diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index e4af78c7c..d98d06597 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -69,6 +69,8 @@ template<typename MatrixType> class Transpose int _rows() const { return m_matrix.cols(); } int _cols() const { return m_matrix.rows(); } + int _stride(void) const { return m_matrix.stride(); } + Scalar& _coeffRef(int row, int col) { return m_matrix.const_cast_derived().coeffRef(col, row); diff --git a/Eigen/src/Core/Triangular.h b/Eigen/src/Core/Triangular.h index 7d5884de3..8a17a3fee 100755 --- a/Eigen/src/Core/Triangular.h +++ b/Eigen/src/Core/Triangular.h @@ -67,7 +67,7 @@ 
struct ei_traits<Triangular<Mode, MatrixType> > ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Flags = (_MatrixTypeNested::Flags & ~(VectorizableBit | Like1DArrayBit)) | Mode, + Flags = (_MatrixTypeNested::Flags & ~(VectorizableBit | Like1DArrayBit | DirectAccessBit)) | Mode, CoeffReadCost = _MatrixTypeNested::CoeffReadCost }; }; diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 48498bfae..8aa16aa81 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -43,18 +43,18 @@ const unsigned int NullDiagBit = 0x40; ///< means all diagonal coefficients const unsigned int UnitDiagBit = 0x80; ///< means all diagonal coefficients are equal to 1 const unsigned int NullLowerBit = 0x200; ///< means the strictly triangular lower part is 0 const unsigned int NullUpperBit = 0x400; ///< means the strictly triangular upper part is 0 -const unsigned int ReferencableBit = 0x800; ///< means the expression is writable through MatrixBase::coeffRef(int,int) +const unsigned int DirectAccessBit = 0x800; ///< means the underlying matrix data can be direclty accessed enum { Upper=NullLowerBit, Lower=NullUpperBit }; enum { Aligned=0, UnAligned=1 }; // list of flags that are lost by default -const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | ReferencableBit +const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | DirectAccessBit | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit); enum { ConditionalJumpCost = 5 }; enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight }; enum DirectionType { Vertical, Horizontal }; -enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct, LazyProduct}; +enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, LazyProduct}; #endif // EIGEN_CONSTANTS_H diff --git a/Eigen/src/QR/EigenSolver.h b/Eigen/src/QR/EigenSolver.h new file mode 100644 index 000000000..47199862f --- /dev/null +++ b/Eigen/src/QR/EigenSolver.h @@ -0,0 +1,848 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. Eigen itself is part of the KDE project. +// +// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. 
+ +#ifndef EIGEN_EIGENSOLVER_H +#define EIGEN_EIGENSOLVER_H + +/** \class EigenSolver + * + * \brief Eigen values/vectors solver + * + * \param MatrixType the type of the matrix of which we are computing the eigen decomposition + * + * \note this code was adapted from JAMA (public domain) + * + * \sa MatrixBase::eigenvalues() + */ +template<typename _MatrixType> class EigenSolver +{ + public: + + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType; + + EigenSolver(const MatrixType& matrix) + : m_eivec(matrix.rows(), matrix.cols()), + m_eivalr(matrix.cols()), m_eivali(matrix.cols()), + m_H(matrix.rows(), matrix.cols()), + m_ort(matrix.cols()) + { + _compute(matrix); + } + + MatrixType eigenvectors(void) const { return m_eivec; } + + VectorType eigenvalues(void) const { return m_eivalr; } + + private: + + void _compute(const MatrixType& matrix); + + void tridiagonalization(void); + void tql2(void); + + void orthes(void); + void hqr2(void); + + protected: + MatrixType m_eivec; + VectorType m_eivalr, m_eivali; + MatrixType m_H; + VectorType m_ort; + bool m_isSymmetric; +}; + +template<typename MatrixType> +void EigenSolver<MatrixType>::_compute(const MatrixType& matrix) +{ + assert(matrix.cols() == matrix.rows()); + + m_isSymmetric = true; + int n = matrix.cols(); + for (int j = 0; (j < n) && m_isSymmetric; j++) { + for (int i = 0; (i < j) && m_isSymmetric; i++) { + m_isSymmetric = (matrix(i,j) == matrix(j,i)); + } + } + + m_eivalr.resize(n,1); + m_eivali.resize(n,1); + + if (m_isSymmetric) + { + m_eivec = matrix; + + // Tridiagonalize. + tridiagonalization(); + + // Diagonalize. + tql2(); + } + else + { + m_H = matrix; + m_ort.resize(n, 1); + + // Reduce to Hessenberg form. + orthes(); + + // Reduce Hessenberg to real Schur form. + hqr2(); + } + std::cout << m_eivali.transpose() << "\n"; +} + + +// Symmetric Householder reduction to tridiagonal form. +template<typename MatrixType> +void EigenSolver<MatrixType>::tridiagonalization(void) +{ + +// This is derived from the Algol procedures tred2 by +// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for +// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding +// Fortran subroutine in EISPACK. + + int n = m_eivec.cols(); + m_eivalr = m_eivec.row(m_eivalr.size()-1); + + // Householder reduction to tridiagonal form. + for (int i = n-1; i > 0; i--) + { + // Scale to avoid under/overflow. + Scalar scale = 0.0; + Scalar h = 0.0; + scale = m_eivalr.start(i).cwiseAbs().sum(); + + if (scale == 0.0) + { + m_eivali[i] = m_eivalr[i-1]; + m_eivalr.start(i) = m_eivec.row(i-1).start(i); + m_eivec.corner(TopLeft, i, i) = m_eivec.corner(TopLeft, i, i).diagonal().asDiagonal(); + } + else + { + // Generate Householder vector. + m_eivalr.start(i) /= scale; + h = m_eivalr.start(i).cwiseAbs2().sum(); + + Scalar f = m_eivalr[i-1]; + Scalar g = ei_sqrt(h); + if (f > 0) + g = -g; + m_eivali[i] = scale * g; + h = h - f * g; + m_eivalr[i-1] = f - g; + m_eivali.start(i).setZero(); + + // Apply similarity transformation to remaining columns. 
+ for (int j = 0; j < i; j++) + { + f = m_eivalr[j]; + m_eivec(j,i) = f; + g = m_eivali[j] + m_eivec(j,j) * f; + int bSize = i-j-1; + if (bSize>0) + { + g += (m_eivec.col(j).block(j+1, bSize).transpose() * m_eivalr.block(j+1, bSize))(0,0); + m_eivali.block(j+1, bSize) += m_eivec.col(j).block(j+1, bSize) * f; + } + m_eivali[j] = g; + } + + f = (m_eivali.start(i).transpose() * m_eivalr.start(i))(0,0); + m_eivali.start(i) = (m_eivali.start(i) - (f / (h + h)) * m_eivalr.start(i))/h; + + m_eivec.corner(TopLeft, i, i).lower() -= + ( (m_eivali.start(i) * m_eivalr.start(i).transpose()).lazy() + + (m_eivalr.start(i) * m_eivali.start(i).transpose()).lazy()); + + m_eivalr.start(i) = m_eivec.row(i-1).start(i); + m_eivec.row(i).start(i).setZero(); + } + m_eivalr[i] = h; + } + + // Accumulate transformations. + for (int i = 0; i < n-1; i++) + { + m_eivec(n-1,i) = m_eivec(i,i); + m_eivec(i,i) = 1.0; + Scalar h = m_eivalr[i+1]; + // FIXME this does not looks very stable ;) + if (h != 0.0) + { + m_eivalr.start(i+1) = m_eivec.col(i+1).start(i+1) / h; + m_eivec.corner(TopLeft, i+1, i+1) -= m_eivalr.start(i+1) + * ( m_eivec.col(i+1).start(i+1).transpose() * m_eivec.corner(TopLeft, i+1, i+1) ); + } + m_eivec.col(i+1).start(i+1).setZero(); + } + m_eivalr = m_eivec.row(m_eivalr.size()-1); + m_eivec.row(m_eivalr.size()-1).setZero(); + m_eivec(n-1,n-1) = 1.0; + m_eivali[0] = 0.0; +} + + +// Symmetric tridiagonal QL algorithm. +template<typename MatrixType> +void EigenSolver<MatrixType>::tql2(void) +{ + +// This is derived from the Algol procedures tql2, by +// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for +// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding +// Fortran subroutine in EISPACK. + + int n = m_eivalr.size(); + + for (int i = 1; i < n; i++) { + m_eivali[i-1] = m_eivali[i]; + } + m_eivali[n-1] = 0.0; + + Scalar f = 0.0; + Scalar tst1 = 0.0; + Scalar eps = std::pow(2.0,-52.0); + for (int l = 0; l < n; l++) + { + // Find small subdiagonal element + tst1 = std::max(tst1,ei_abs(m_eivalr[l]) + ei_abs(m_eivali[l])); + int m = l; + + while ( (m < n) && (ei_abs(m_eivali[m]) > eps*tst1) ) + m++; + + // If m == l, m_eivalr[l] is an eigenvalue, + // otherwise, iterate. + if (m > l) + { + int iter = 0; + do + { + iter = iter + 1; + + // Compute implicit shift + Scalar g = m_eivalr[l]; + Scalar p = (m_eivalr[l+1] - g) / (2.0 * m_eivali[l]); + Scalar r = hypot(p,1.0); + if (p < 0) + r = -r; + + m_eivalr[l] = m_eivali[l] / (p + r); + m_eivalr[l+1] = m_eivali[l] * (p + r); + Scalar dl1 = m_eivalr[l+1]; + Scalar h = g - m_eivalr[l]; + if (l+2<n) + m_eivalr.end(n-l-2) -= VectorType::constant(n-l-2, h); + f = f + h; + + // Implicit QL transformation. + p = m_eivalr[m]; + Scalar c = 1.0; + Scalar c2 = c; + Scalar c3 = c; + Scalar el1 = m_eivali[l+1]; + Scalar s = 0.0; + Scalar s2 = 0.0; + for (int i = m-1; i >= l; i--) + { + c3 = c2; + c2 = c; + s2 = s; + g = c * m_eivali[i]; + h = c * p; + r = hypot(p,m_eivali[i]); + m_eivali[i+1] = s * r; + s = m_eivali[i] / r; + c = p / r; + p = c * m_eivalr[i] - s * g; + m_eivalr[i+1] = h + s * (c * g + s * m_eivalr[i]); + + // Accumulate transformation. + for (int k = 0; k < n; k++) + { + h = m_eivec(k,i+1); + m_eivec(k,i+1) = s * m_eivec(k,i) + c * h; + m_eivec(k,i) = c * m_eivec(k,i) - s * h; + } + } + p = -s * s2 * c3 * el1 * m_eivali[l] / dl1; + m_eivali[l] = s * p; + m_eivalr[l] = c * p; + + // Check for convergence. 
+ } while (ei_abs(m_eivali[l]) > eps*tst1); + } + m_eivalr[l] = m_eivalr[l] + f; + m_eivali[l] = 0.0; + } + + // Sort eigenvalues and corresponding vectors. + // TODO use a better sort algorithm !! + for (int i = 0; i < n-1; i++) + { + int k = i; + Scalar minValue = m_eivalr[i]; + for (int j = i+1; j < n; j++) + { + if (m_eivalr[j] < minValue) + { + k = j; + minValue = m_eivalr[j]; + } + } + if (k != i) + { + std::swap(m_eivalr[i], m_eivalr[k]); + m_eivec.col(i).swap(m_eivec.col(k)); + } + } +} + + +// Nonsymmetric reduction to Hessenberg form. +template<typename MatrixType> +void EigenSolver<MatrixType>::orthes(void) +{ + // This is derived from the Algol procedures orthes and ortran, + // by Martin and Wilkinson, Handbook for Auto. Comp., + // Vol.ii-Linear Algebra, and the corresponding + // Fortran subroutines in EISPACK. + + int n = m_eivec.cols(); + int low = 0; + int high = n-1; + + for (int m = low+1; m <= high-1; m++) + { + // Scale column. + Scalar scale = m_H.block(m, m-1, high-m+1, 1).cwiseAbs().sum(); + if (scale != 0.0) + { + // Compute Householder transformation. + Scalar h = 0.0; + // FIXME could be rewritten, but this one looks better wrt cache + for (int i = high; i >= m; i--) + { + m_ort[i] = m_H(i,m-1)/scale; + h += m_ort[i] * m_ort[i]; + } + Scalar g = ei_sqrt(h); + if (m_ort[m] > 0) + g = -g; + h = h - m_ort[m] * g; + m_ort[m] = m_ort[m] - g; + + // Apply Householder similarity transformation + // H = (I-u*u'/h)*H*(I-u*u')/h) + int bSize = high-m+1; + m_H.block(m, m, bSize, n-m) -= ((m_ort.block(m, bSize)/h) + * (m_ort.block(m, bSize).transpose() * m_H.block(m, m, bSize, n-m)).lazy()).lazy(); + + m_H.block(0, m, high+1, bSize) -= ((m_H.block(0, m, high+1, bSize) * m_ort.block(m, bSize)).lazy() + * (m_ort.block(m, bSize)/h).transpose()).lazy(); + + m_ort[m] = scale*m_ort[m]; + m_H(m,m-1) = scale*g; + } + } + + // Accumulate transformations (Algol's ortran). + m_eivec.setIdentity(); + + for (int m = high-1; m >= low+1; m--) + { + if (m_H(m,m-1) != 0.0) + { + m_ort.block(m+1, high-m) = m_H.col(m-1).block(m+1, high-m); + + int bSize = high-m+1; + m_eivec.block(m, m, bSize, bSize) += ( (m_ort.block(m, bSize) / (m_H(m,m-1) * m_ort[m] ) ) + * (m_ort.block(m, bSize).transpose() * m_eivec.block(m, m, bSize, bSize)).lazy()); + } + } +} + + +// Complex scalar division. +template<typename Scalar> +std::complex<Scalar> cdiv(Scalar xr, Scalar xi, Scalar yr, Scalar yi) +{ + Scalar r,d; + if (ei_abs(yr) > ei_abs(yi)) + { + r = yi/yr; + d = yr + r*yi; + return std::complex<Scalar>((xr + r*xi)/d, (xi - r*xr)/d); + } + else + { + r = yr/yi; + d = yi + r*yr; + return std::complex<Scalar>((r*xr + xi)/d, (r*xi - xr)/d); + } +} + + +// Nonsymmetric reduction from Hessenberg to real Schur form. +template<typename MatrixType> +void EigenSolver<MatrixType>::hqr2(void) +{ + // This is derived from the Algol procedure hqr2, + // by Martin and Wilkinson, Handbook for Auto. Comp., + // Vol.ii-Linear Algebra, and the corresponding + // Fortran subroutine in EISPACK. 
+ + // Initialize + int nn = m_eivec.cols(); + int n = nn-1; + int low = 0; + int high = nn-1; + Scalar eps = pow(2.0,-52.0); + Scalar exshift = 0.0; + Scalar p=0,q=0,r=0,s=0,z=0,t,w,x,y; + + // Store roots isolated by balanc and compute matrix norm + // FIXME to be efficient the following would requires a triangular reduxion code + // Scalar norm = m_H.upper().cwiseAbs().sum() + m_H.corner(BottomLeft,n,n).diagonal().cwiseAbs().sum(); + Scalar norm = 0.0; + for (int j = 0; j < nn; j++) + { + // FIXME what's the purpose of the following since the condition is always false + if ((j < low) || (j > high)) + { + m_eivalr[j] = m_H(j,j); + m_eivali[j] = 0.0; + } + norm += m_H.col(j).start(std::min(j+1,nn)).cwiseAbs().sum(); + } + + // Outer loop over eigenvalue index + int iter = 0; + while (n >= low) + { + // Look for single small sub-diagonal element + int l = n; + while (l > low) + { + s = ei_abs(m_H(l-1,l-1)) + ei_abs(m_H(l,l)); + if (s == 0.0) + s = norm; + if (ei_abs(m_H(l,l-1)) < eps * s) + break; + l--; + } + + // Check for convergence + // One root found + if (l == n) + { + m_H(n,n) = m_H(n,n) + exshift; + m_eivalr[n] = m_H(n,n); + m_eivali[n] = 0.0; + n--; + iter = 0; + } + else if (l == n-1) // Two roots found + { + w = m_H(n,n-1) * m_H(n-1,n); + p = (m_H(n-1,n-1) - m_H(n,n)) / 2.0; + q = p * p + w; + z = ei_sqrt(ei_abs(q)); + m_H(n,n) = m_H(n,n) + exshift; + m_H(n-1,n-1) = m_H(n-1,n-1) + exshift; + x = m_H(n,n); + + // Scalar pair + if (q >= 0) + { + if (p >= 0) + z = p + z; + else + z = p - z; + + m_eivalr[n-1] = x + z; + m_eivalr[n] = m_eivalr[n-1]; + if (z != 0.0) + m_eivalr[n] = x - w / z; + + m_eivali[n-1] = 0.0; + m_eivali[n] = 0.0; + x = m_H(n,n-1); + s = ei_abs(x) + ei_abs(z); + p = x / s; + q = z / s; + r = ei_sqrt(p * p+q * q); + p = p / r; + q = q / r; + + // Row modification + for (int j = n-1; j < nn; j++) + { + z = m_H(n-1,j); + m_H(n-1,j) = q * z + p * m_H(n,j); + m_H(n,j) = q * m_H(n,j) - p * z; + } + + // Column modification + for (int i = 0; i <= n; i++) + { + z = m_H(i,n-1); + m_H(i,n-1) = q * z + p * m_H(i,n); + m_H(i,n) = q * m_H(i,n) - p * z; + } + + // Accumulate transformations + for (int i = low; i <= high; i++) + { + z = m_eivec(i,n-1); + m_eivec(i,n-1) = q * z + p * m_eivec(i,n); + m_eivec(i,n) = q * m_eivec(i,n) - p * z; + } + } + else // Complex pair + { + m_eivalr[n-1] = x + p; + m_eivalr[n] = x + p; + m_eivali[n-1] = z; + m_eivali[n] = -z; + } + n = n - 2; + iter = 0; + } + else // No convergence yet + { + // Form shift + x = m_H(n,n); + y = 0.0; + w = 0.0; + if (l < n) + { + y = m_H(n-1,n-1); + w = m_H(n,n-1) * m_H(n-1,n); + } + + // Wilkinson's original ad hoc shift + if (iter == 10) + { + exshift += x; + for (int i = low; i <= n; i++) + m_H(i,i) -= x; + s = ei_abs(m_H(n,n-1)) + ei_abs(m_H(n-1,n-2)); + x = y = 0.75 * s; + w = -0.4375 * s * s; + } + + // MATLAB's new ad hoc shift + if (iter == 30) + { + s = (y - x) / 2.0; + s = s * s + w; + if (s > 0) + { + s = ei_sqrt(s); + if (y < x) + s = -s; + s = x - w / ((y - x) / 2.0 + s); + for (int i = low; i <= n; i++) + m_H(i,i) -= s; + exshift += s; + x = y = w = 0.964; + } + } + + iter = iter + 1; // (Could check iteration count here.) 
+ + // Look for two consecutive small sub-diagonal elements + int m = n-2; + while (m >= l) + { + z = m_H(m,m); + r = x - z; + s = y - z; + p = (r * s - w) / m_H(m+1,m) + m_H(m,m+1); + q = m_H(m+1,m+1) - z - r - s; + r = m_H(m+2,m+1); + s = ei_abs(p) + ei_abs(q) + ei_abs(r); + p = p / s; + q = q / s; + r = r / s; + if (m == l) { + break; + } + if (ei_abs(m_H(m,m-1)) * (ei_abs(q) + ei_abs(r)) < + eps * (ei_abs(p) * (ei_abs(m_H(m-1,m-1)) + ei_abs(z) + + ei_abs(m_H(m+1,m+1))))) + { + break; + } + m--; + } + + for (int i = m+2; i <= n; i++) + { + m_H(i,i-2) = 0.0; + if (i > m+2) + m_H(i,i-3) = 0.0; + } + + // Double QR step involving rows l:n and columns m:n + for (int k = m; k <= n-1; k++) + { + int notlast = (k != n-1); + if (k != m) { + p = m_H(k,k-1); + q = m_H(k+1,k-1); + r = (notlast ? m_H(k+2,k-1) : 0.0); + x = ei_abs(p) + ei_abs(q) + ei_abs(r); + if (x != 0.0) + { + p = p / x; + q = q / x; + r = r / x; + } + } + + if (x == 0.0) + break; + + s = ei_sqrt(p * p + q * q + r * r); + + if (p < 0) + s = -s; + + if (s != 0) + { + if (k != m) + m_H(k,k-1) = -s * x; + else if (l != m) + m_H(k,k-1) = -m_H(k,k-1); + + p = p + s; + x = p / s; + y = q / s; + z = r / s; + q = q / p; + r = r / p; + + // Row modification + for (int j = k; j < nn; j++) + { + p = m_H(k,j) + q * m_H(k+1,j); + if (notlast) + { + p = p + r * m_H(k+2,j); + m_H(k+2,j) = m_H(k+2,j) - p * z; + } + m_H(k,j) = m_H(k,j) - p * x; + m_H(k+1,j) = m_H(k+1,j) - p * y; + } + + // Column modification + for (int i = 0; i <= std::min(n,k+3); i++) + { + p = x * m_H(i,k) + y * m_H(i,k+1); + if (notlast) + { + p = p + z * m_H(i,k+2); + m_H(i,k+2) = m_H(i,k+2) - p * r; + } + m_H(i,k) = m_H(i,k) - p; + m_H(i,k+1) = m_H(i,k+1) - p * q; + } + + // Accumulate transformations + for (int i = low; i <= high; i++) + { + p = x * m_eivec(i,k) + y * m_eivec(i,k+1); + if (notlast) + { + p = p + z * m_eivec(i,k+2); + m_eivec(i,k+2) = m_eivec(i,k+2) - p * r; + } + m_eivec(i,k) = m_eivec(i,k) - p; + m_eivec(i,k+1) = m_eivec(i,k+1) - p * q; + } + } // (s != 0) + } // k loop + } // check convergence + } // while (n >= low) + + // Backsubstitute to find vectors of upper triangular form + if (norm == 0.0) + { + return; + } + + for (n = nn-1; n >= 0; n--) + { + p = m_eivalr[n]; + q = m_eivali[n]; + + // Scalar vector + if (q == 0) + { + int l = n; + m_H(n,n) = 1.0; + for (int i = n-1; i >= 0; i--) + { + w = m_H(i,i) - p; + r = (m_H.row(i).end(nn-l) * m_H.col(n).end(nn-l))(0,0); + + if (m_eivali[i] < 0.0) + { + z = w; + s = r; + } + else + { + l = i; + if (m_eivali[i] == 0.0) + { + if (w != 0.0) + m_H(i,n) = -r / w; + else + m_H(i,n) = -r / (eps * norm); + } + else // Solve real equations + { + x = m_H(i,i+1); + y = m_H(i+1,i); + q = (m_eivalr[i] - p) * (m_eivalr[i] - p) + m_eivali[i] * m_eivali[i]; + t = (x * s - z * r) / q; + m_H(i,n) = t; + if (ei_abs(x) > ei_abs(z)) + m_H(i+1,n) = (-r - w * t) / x; + else + m_H(i+1,n) = (-s - y * t) / z; + } + + // Overflow control + t = ei_abs(m_H(i,n)); + if ((eps * t) * t > 1) + m_H.col(n).end(nn-i) /= t; + } + } + } + else if (q < 0) // Complex vector + { + std::complex<Scalar> cc; + int l = n-1; + + // Last vector component imaginary so matrix is triangular + if (ei_abs(m_H(n,n-1)) > ei_abs(m_H(n-1,n))) + { + m_H(n-1,n-1) = q / m_H(n,n-1); + m_H(n-1,n) = -(m_H(n,n) - p) / m_H(n,n-1); + } + else + { + cc = cdiv<Scalar>(0.0,-m_H(n-1,n),m_H(n-1,n-1)-p,q); + m_H(n-1,n-1) = ei_real(cc); + m_H(n-1,n) = ei_imag(cc); + } + m_H(n,n-1) = 0.0; + m_H(n,n) = 1.0; + for (int i = n-2; i >= 0; i--) + { + Scalar ra,sa,vr,vi; + ra = 
(m_H.row(i).end(nn-l) * m_H.col(n-1).end(nn-l)).lazy()(0,0); + sa = (m_H.row(i).end(nn-l) * m_H.col(n).end(nn-l)).lazy()(0,0); + w = m_H(i,i) - p; + + if (m_eivali[i] < 0.0) + { + z = w; + r = ra; + s = sa; + } + else + { + l = i; + if (m_eivali[i] == 0) + { + cc = cdiv(-ra,-sa,w,q); + m_H(i,n-1) = ei_real(cc); + m_H(i,n) = ei_imag(cc); + } + else + { + // Solve complex equations + x = m_H(i,i+1); + y = m_H(i+1,i); + vr = (m_eivalr[i] - p) * (m_eivalr[i] - p) + m_eivali[i] * m_eivali[i] - q * q; + vi = (m_eivalr[i] - p) * 2.0 * q; + if ((vr == 0.0) && (vi == 0.0)) + vr = eps * norm * (ei_abs(w) + ei_abs(q) + ei_abs(x) + ei_abs(y) + ei_abs(z)); + + cc= cdiv(x*r-z*ra+q*sa,x*s-z*sa-q*ra,vr,vi); + m_H(i,n-1) = ei_real(cc); + m_H(i,n) = ei_imag(cc); + if (ei_abs(x) > (ei_abs(z) + ei_abs(q))) + { + m_H(i+1,n-1) = (-ra - w * m_H(i,n-1) + q * m_H(i,n)) / x; + m_H(i+1,n) = (-sa - w * m_H(i,n) - q * m_H(i,n-1)) / x; + } + else + { + cc = cdiv(-r-y*m_H(i,n-1),-s-y*m_H(i,n),z,q); + m_H(i+1,n-1) = ei_real(cc); + m_H(i+1,n) = ei_imag(cc); + } + } + + // Overflow control + t = std::max(ei_abs(m_H(i,n-1)),ei_abs(m_H(i,n))); + if ((eps * t) * t > 1) + m_H.block(i, n-1, nn-i, 2) /= t; + + } + } + } + } + + // Vectors of isolated roots + for (int i = 0; i < nn; i++) + { + // FIXME again what's the purpose of this test ? + // in this algo low==0 and high==nn-1 !! + if (i < low || i > high) + { + m_eivec.row(i).end(nn-i) = m_H.row(i).end(nn-i); + } + } + + // Back transformation to get eigenvectors of original matrix + int bRows = high-low+1; + for (int j = nn-1; j >= low; j--) + { + int bSize = std::min(j,high)-low+1; + m_eivec.col(j).block(low, bRows) = (m_eivec.block(low, low, bRows, bSize) * m_H.col(j).block(low, bSize)); + } +} + +#endif // EIGEN_EIGENSOLVER_H diff --git a/Eigen/src/QR/QR.h b/Eigen/src/QR/QR.h index d0121cc7a..d42153eb9 100644 --- a/Eigen/src/QR/QR.h +++ b/Eigen/src/QR/QR.h @@ -95,10 +95,11 @@ void QR<MatrixType>::_compute(const MatrixType& matrix) m_qr(k,k) += 1.0; // apply transformation to remaining columns - for (int j = k+1; j < cols; j++) + int remainingCols = cols - k -1; + if (remainingCols>0) { - Scalar s = -(m_qr.col(k).end(remainingSize).transpose() * m_qr.col(j).end(remainingSize))(0,0) / m_qr(k,k); - m_qr.col(j).end(remainingSize) += s * m_qr.col(k).end(remainingSize); + m_qr.corner(BottomRight, remainingSize, remainingCols) -= (1./m_qr(k,k)) * m_qr.col(k).end(remainingSize) + * (m_qr.col(k).end(remainingSize).transpose() * m_qr.corner(BottomRight, remainingSize, remainingCols)); } } m_norms[k] = -nrm;
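Two illustrative sketches of how the additions above are meant to be used; neither is part of the commit. The first calls the new low-level kernel from CacheFriendlyProduct.h directly on raw buffers. It is a minimal sketch under stated assumptions: ei_cache_friendly_product is a static template pulled in through Eigen/Core when EIGEN_WIP_PRODUCT is defined, all three matrices are column-major, the size n = 64 is made up, and the result buffer is zero-initialized by the caller because the kernel accumulates into res (matching the res.setZero() call in _cacheFriendlyEval).

    // Illustrative only: computes C += A * B on raw column-major buffers.
    #define EIGEN_WIP_PRODUCT
    #include <vector>
    #include <Eigen/Core>
    using namespace Eigen;

    int main()
    {
      const int n = 64;                                     // hypothetical problem size
      std::vector<float> A(n*n, 1.f), B(n*n, 2.f), C(n*n, 0.f);

      // Arguments: rows, cols, depth, then (rowMajor?, data, stride) for
      // lhs, rhs and res. The kernel asserts that rhs is column-major
      // whenever the result is column-major.
      ei_cache_friendly_product<float>(n, n, n,
                                       false, &A[0], n,
                                       false, &B[0], n,
                                       false, &C[0], n);   // C accumulates A*B
      return 0;
    }

The second sketch exercises the new EigenSolver on a small symmetric matrix, which takes the tridiagonalization()/tql2() path; eigenvalues() returns the real parts (m_eivalr), which tql2() leaves sorted in increasing order. The comma initializer used below is the operator<< declared in MatrixBase.h.

    // Illustrative only: eigen decomposition of a 3x3 symmetric matrix.
    #include <iostream>
    #include <Eigen/Core>
    #include <Eigen/QR>   // the QR module now also includes EigenSolver.h
    using namespace Eigen;

    int main()
    {
      Matrix<double,3,3> m;
      m << 2, 1, 0,
           1, 2, 1,
           0, 1, 2;       // symmetric input

      EigenSolver<Matrix<double,3,3> > solver(m);
      std::cout << "eigenvalues: " << solver.eigenvalues().transpose() << std::endl;
      std::cout << "eigenvectors:\n" << solver.eigenvectors() << std::endl;
      return 0;
    }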