From 45cda6704a067e73711f659ec6389fae7e36d1ad Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Mon, 12 May 2008 10:23:09 +0000
Subject: * Draft of an eigenvalue solver (does not support complex matrices
  and does not re-use the QR decomposition)
* Rewrite the cache friendly product to have only one instance per scalar
  type! This significantly speeds up compilation time and reduces executable
  size. The current drawback is that some trivial expressions, such as
  conjugate or negate, might be evaluated.
* Renamed "cache optimal" to "cache friendly"
* Added the ability to directly access the matrix data of some expressions via:
  - the stride()/_stride() methods
  - the DirectAccessBit flag (replaces ReferencableBit)
---
 Eigen/src/Core/Block.h                |   4 +-
 Eigen/src/Core/CacheFriendlyProduct.h | 353 ++++++++++++++
 Eigen/src/Core/Map.h                  |   2 +-
 Eigen/src/Core/MathFunctions.h        |   2 +-
 Eigen/src/Core/Matrix.h               |   8 +
 Eigen/src/Core/MatrixBase.h           |   9 +-
 Eigen/src/Core/Product.h              |  18 +-
 Eigen/src/Core/ProductWIP.h           | 380 ++-------------
 Eigen/src/Core/Transpose.h            |   2 +
 Eigen/src/Core/Triangular.h           |   2 +-
 Eigen/src/Core/util/Constants.h       |   6 +-
 Eigen/src/QR/EigenSolver.h            | 848 ++++++++++++++++++++++++++++++++++
 Eigen/src/QR/QR.h                     |   7 +-
 13 files changed, 1281 insertions(+), 360 deletions(-)
 create mode 100644 Eigen/src/Core/CacheFriendlyProduct.h
 create mode 100644 Eigen/src/QR/EigenSolver.h

(limited to 'Eigen/src')

diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 00bb375a9..4b417224c 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -71,7 +71,7 @@ struct ei_traits >
        || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
          ? ~LargeBit : ~(unsigned int)0,
-    Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | ReferencableBit) & FlagsMaskLargeBit,
+    Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | DirectAccessBit) & FlagsMaskLargeBit,
     CoeffReadCost = MatrixType::CoeffReadCost
   };
 };
@@ -132,6 +132,8 @@ template class Block
     int _rows() const { return m_blockRows.value(); }
     int _cols() const { return m_blockCols.value(); }

+    int _stride(void) const { return m_matrix.stride(); }
+
     Scalar& _coeffRef(int row, int col)
     {
       return m_matrix.const_cast_derived()
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
new file mode 100644
index 000000000..b484b1786
--- /dev/null
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -0,0 +1,353 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H +#define EIGEN_CACHE_FRIENDLY_PRODUCT_H + +template +static void ei_cache_friendly_product( + int _rows, int _cols, int depth, + bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, + bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, + bool resRowMajor, Scalar* res, int resStride) +{ + const Scalar* __restrict__ lhs; + const Scalar* __restrict__ rhs; + int lhsStride, rhsStride, rows, cols; + bool lhsRowMajor; + + if (resRowMajor) + { + lhs = _rhs; + rhs = _lhs; + lhsStride = _rhsStride; + rhsStride = _lhsStride; + cols = _rows; + rows = _cols; + lhsRowMajor = _rhsRowMajor; + ei_assert(_lhsRowMajor); + } + else + { + lhs = _lhs; + rhs = _rhs; + lhsStride = _lhsStride; + rhsStride = _rhsStride; + rows = _rows; + cols = _cols; + lhsRowMajor = _lhsRowMajor; + ei_assert(!_rhsRowMajor); + } + + typedef typename ei_packet_traits::type PacketType; + + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + #if (defined __i386__) + // i386 architecture provides only 8 xmm registers, + // so let's reduce the max number of rows processed at once. + MaxBlockRows = 4, + MaxBlockRows_ClampingMask = 0xFFFFFC, + #else + MaxBlockRows = 8, + MaxBlockRows_ClampingMask = 0xFFFFF8, + #endif + // maximal size of the blocks fitted in L2 cache + MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar) + }; + + + //const bool rhsIsAligned = (PacketSize==1) || (((rhsStride%PacketSize) == 0) && (size_t(rhs)%16==0)); + const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); + + const int remainingSize = depth % PacketSize; + const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries + const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize; + const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize; + const int l2BlockSize = MaxL2BlockSize > size ? 
size : MaxL2BlockSize; + Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size); + Scalar* __restrict__ rhsCopy = (Scalar*)alloca(sizeof(Scalar)*l2BlockSize); + + // loops on each L2 cache friendly blocks of the result + for(int l2i=0; l2i0) + { + for (int k=l2k; k1 && size_t(rhsColumn)%16) + { + int count = 0; + for (int k = l2k; k1 && resIsAligned) + { + ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(dst))); + if (PacketSize==2) + ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2])))); + if (MaxBlockRows==8) + { + ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4])))); + if (PacketSize==2) + ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6])))); + } + } + else + { + localRes[0] += ei_predux(dst[0]); + localRes[1] += ei_predux(dst[1]); + localRes[2] += ei_predux(dst[2]); + localRes[3] += ei_predux(dst[3]); + if (MaxBlockRows==8) + { + localRes[4] += ei_predux(dst[4]); + localRes[5] += ei_predux(dst[5]); + localRes[6] += ei_predux(dst[6]); + localRes[7] += ei_predux(dst[7]); + } + } + asm("#eigen endcore"); + } + } + if (l2blockRemainingRows>0) + { + int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows; + const Scalar* localB = &block[offsetblock]; + + asm("#eigen begin dynkernel"); + for(int l1j=l2j; l1j1 && size_t(rhsColumn)%16) + { + int count = 0; + for (int k = l2k; k4) + { + dst[4] = dst[0]; + dst[5] = dst[0]; + dst[6] = dst[0]; + dst[7] = dst[0]; + } + + // let's declare a few other temporary registers + PacketType tmp; + + for(int k=l2k; k=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]); + if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]); + if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]); + if (MaxBlockRows>4) + { + if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]); + if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]); + if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]); + if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]); + } + } + + Scalar* __restrict__ localRes = &(res[l2blockRowEndBW + l1j*resStride]); + + // process the remaining rows once at a time + localRes[0] += ei_predux(dst[0]); + if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]); + if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]); + if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]); + if (MaxBlockRows>4) + { + if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]); + if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]); + if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]); + if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]); + } + + asm("#eigen end dynkernel"); + } + } + } + } + } + if (PacketSize>1 && remainingSize) + { + if (lhsRowMajor) + { + for (int j=0; j > ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Flags = MatrixType::Flags & (DefaultLostFlagMask | ReferencableBit), + Flags = 
MatrixType::Flags & (DefaultLostFlagMask | DirectAccessBit), CoeffReadCost = NumTraits::ReadCost }; }; diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 64ae7f97b..5ba37c076 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -46,7 +46,7 @@ inline int ei_log(int) { ei_assert(false); return 0; } inline int ei_sin(int) { ei_assert(false); return 0; } inline int ei_cos(int) { ei_assert(false); return 0; } -#if EIGEN_GNUC_AT_LEAST(4,3) +#if EIGEN_GNUC_AT_LEAST(4,2) inline int ei_pow(int x, int y) { return int(std::pow(double(x), y)); } #else inline int ei_pow(int x, int y) { return std::pow(x, y); } diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 922c3ddae..016a9ef06 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -100,6 +100,14 @@ class Matrix : public MatrixBase class MatrixBase /** Overloaded for optimal product evaluation */ template - Derived& lazyAssign(const Product& product); + Derived& lazyAssign(const Product& product); CommaInitializer operator<< (const Scalar& s); @@ -419,6 +419,13 @@ template class MatrixBase const Lazy lazy() const; const Temporary temporary() const; + + /** \returns number of elements to skip to pass from one row (resp. column) to another + * for a row-major (resp. column-major) matrix. + * Combined with coeffRef() and the compile times flags, it allows a direct access to the data + * of the underlying matrix. + */ + int stride(void) const { return derived()._stride(); } //@} /// \name Coefficient-wise operations diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 66c6d4d5b..dbd8b6cdd 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -60,6 +60,12 @@ struct ei_product_unroller static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {} }; +template +struct ei_product_unroller<0, Dynamic, Lhs, Rhs> +{ + static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {} +}; + template struct ei_packet_product_unroller; @@ -113,6 +119,12 @@ struct ei_packet_product_unroller static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {} }; +template +struct ei_packet_product_unroller +{ + static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {} +}; + template struct ProductPacketCoeffImpl { inline static typename Product::PacketScalar execute(const Product& product, int row, int col) { return product._packetCoeffRowMajor(row,col); } @@ -142,7 +154,7 @@ template struct ei_product_eval_mode enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit))) - ? CacheOptimalProduct : NormalProduct }; + ? CacheFriendlyProduct : NormalProduct }; }; template @@ -166,7 +178,7 @@ struct ei_traits > _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits::size == 0), _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0, _RowMajor = (RhsFlags & RowMajorBit) - && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), + && (EvalMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), _LostBits = DefaultLostFlagMask & ~( (_RowMajor ? 0 : RowMajorBit) | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 
0 : LargeBit)), @@ -312,7 +324,7 @@ MatrixBase::operator*=(const MatrixBase &other) template template -Derived& MatrixBase::lazyAssign(const Product& product) +Derived& MatrixBase::lazyAssign(const Product& product) { product.template _cacheOptimalEval(derived(), #ifdef EIGEN_VECTORIZE diff --git a/Eigen/src/Core/ProductWIP.h b/Eigen/src/Core/ProductWIP.h index a5fc9d298..0a7ef43e9 100644 --- a/Eigen/src/Core/ProductWIP.h +++ b/Eigen/src/Core/ProductWIP.h @@ -26,9 +26,7 @@ #ifndef EIGEN_PRODUCT_H #define EIGEN_PRODUCT_H -#ifndef EIGEN_VECTORIZE -#error you must enable vectorization to try this experimental product implementation -#endif +#include "CacheFriendlyProduct.h" template struct ei_product_unroller @@ -145,7 +143,7 @@ template struct ei_product_eval_mode { enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD - ? CacheOptimalProduct : NormalProduct }; + ? CacheFriendlyProduct : NormalProduct }; }; template class ei_product_eval_to_column_major @@ -173,7 +171,22 @@ template struct ei_product_nested_rhs typename ei_meta_if< (ei_traits::Flags & EvalBeforeNestingBit) || (ei_traits::Flags & RowMajorBit) - || (!(ei_traits::Flags & ReferencableBit)) + || (!(ei_traits::Flags & DirectAccessBit)) + || (n+1) * NumTraits::Scalar>::ReadCost < (n-1) * T::CoeffReadCost, + typename ei_product_eval_to_column_major::type, + const T& + >::ret + >::ret type; +}; + +template struct ei_product_nested_lhs +{ + typedef typename ei_meta_if< + ei_is_temporary::ret && !(ei_traits::Flags & RowMajorBit), + T, + typename ei_meta_if< + (ei_traits::Flags & EvalBeforeNestingBit) + || (!(ei_traits::Flags & DirectAccessBit)) || (n+1) * NumTraits::Scalar>::ReadCost < (n-1) * T::CoeffReadCost, typename ei_product_eval_to_column_major::type, const T& @@ -187,9 +200,12 @@ struct ei_traits > typedef typename Lhs::Scalar Scalar; // the cache friendly product evals lhs once only // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ? - typedef typename ei_nested::type LhsNested; + typedef typename ei_meta_if::type, + typename ei_nested::type>::ret LhsNested; + // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation - typedef typename ei_meta_if::type, typename ei_nested::type>::ret RhsNested; typedef typename ei_unref::type _LhsNested; @@ -209,7 +225,7 @@ struct ei_traits > _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits::size == 0), _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 0 : 0, _RowMajor = (RhsFlags & RowMajorBit) - && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), + && (EvalMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)), _LostBits = DefaultLostFlagMask & ~( (_RowMajor ? 0 : RowMajorBit) | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)), @@ -241,19 +257,7 @@ template class Product : ei_no_assignm typedef typename ei_traits::_RhsNested _RhsNested; enum { - PacketSize = ei_packet_traits::size, - #if (defined __i386__) - // i386 architectures provides only 8 xmmm register, - // so let's reduce the max number of rows processed at once. - // NOTE that so far the maximal supported value is 8. 
- MaxBlockRows = 4, - MaxBlockRows_ClampingMask = 0xFFFFFC, - #else - MaxBlockRows = 8, - MaxBlockRows_ClampingMask = 0xFFFFF8, - #endif - // maximal size of the blocks fitted in L2 cache - MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar) + PacketSize = ei_packet_traits::size }; Product(const Lhs& lhs, const Rhs& rhs) @@ -327,14 +331,8 @@ template class Product : ei_no_assignm } /** \internal */ - template - void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline)); - - /** \internal */ - template - void _cacheFriendlyEvalKernel(DestDerived& res, - int l2i, int l2j, int l2k, int l1i, - int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE; + template + void _cacheFriendlyEvalImpl(DestDerived& res) const EIGEN_DONT_INLINE; protected: const LhsNested m_lhs; @@ -370,7 +368,7 @@ MatrixBase::operator*=(const MatrixBase &other) template template -Derived& MatrixBase::lazyAssign(const Product& product) +Derived& MatrixBase::lazyAssign(const Product& product) { product._cacheFriendlyEval(derived()); return derived(); @@ -380,326 +378,16 @@ template template void Product::_cacheFriendlyEval(DestDerived& res) const { - const bool rhsIsAligned = (m_lhs.cols()%PacketSize == 0); - const bool resIsAligned = ((_rows()%PacketSize) == 0); - - if (rhsIsAligned && resIsAligned) - _cacheFriendlyEvalImpl(res); - else if (rhsIsAligned && (!resIsAligned)) - _cacheFriendlyEvalImpl(res); - else if ((!rhsIsAligned) && resIsAligned) - _cacheFriendlyEvalImpl(res); - else - _cacheFriendlyEvalImpl(res); - -} - -template -template -void Product::_cacheFriendlyEvalKernel(DestDerived& res, - int l2i, int l2j, int l2k, int l1i, - int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const -{ - asm("#eigen begin kernel"); - - ei_internal_assert(BlockRows<=8); - - // NOTE: sounds like we cannot rely on meta-unrolling to access dst[I] without enforcing GCC - // to create the dst's elements in memory, hence killing the performance. - - for(int l1j=l2j; l1j(k, l1j); - if (RhsAlignment==Aligned) - { - //tmp = ei_pload(&m_rhs.data()[l1jsize + k]); - tmp = ei_pload(&rhsColumn[k]); - } - else - { - tmp = tmp1; - if (k+PacketSize=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+ PacketSize])), dst[1]); - if (BlockRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+2*PacketSize])), dst[2]); - if (BlockRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+3*PacketSize])), dst[3]); - if (BlockRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+4*PacketSize])), dst[4]); - if (BlockRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+5*PacketSize])), dst[5]); - if (BlockRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+6*PacketSize])), dst[6]); - if (BlockRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+7*PacketSize])), dst[7]); - } - - enum { - // Number of rows we can reduce per packet - PacketRows = (ResAlignment==Aligned && PacketSize>1) ? 
(BlockRows / PacketSize) : 0, - // First row index from which we have to to do redux once at a time - RemainingStart = PacketSize * PacketRows - }; - - // we have up to 4 packets (for doubles: 8 rows / 2) - if (PacketRows>=1) - res.template writePacketCoeff(l1i, l1j, - ei_padd(res.template packetCoeff(l1i, l1j), ei_preduxp(&(dst[0])))); - if (PacketRows>=2) - res.template writePacketCoeff(l1i+PacketSize, l1j, - ei_padd(res.template packetCoeff(l1i+PacketSize, l1j), ei_preduxp(&(dst[PacketSize])))); - if (PacketRows>=3) - res.template writePacketCoeff(l1i+2*PacketSize, l1j, - ei_padd(res.template packetCoeff(l1i+2*PacketSize, l1j), ei_preduxp(&(dst[2*PacketSize])))); - if (PacketRows>=4) - res.template writePacketCoeff(l1i+3*PacketSize, l1j, - ei_padd(res.template packetCoeff(l1i+3*PacketSize, l1j), ei_preduxp(&(dst[3*PacketSize])))); - - // process the remaining rows one at a time - if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]); - if (RemainingStart<=1 && BlockRows>=2) res.coeffRef(l1i+1, l1j) += ei_predux(dst[1]); - if (RemainingStart<=2 && BlockRows>=3) res.coeffRef(l1i+2, l1j) += ei_predux(dst[2]); - if (RemainingStart<=3 && BlockRows>=4) res.coeffRef(l1i+3, l1j) += ei_predux(dst[3]); - if (RemainingStart<=4 && BlockRows>=5) res.coeffRef(l1i+4, l1j) += ei_predux(dst[4]); - if (RemainingStart<=5 && BlockRows>=6) res.coeffRef(l1i+5, l1j) += ei_predux(dst[5]); - if (RemainingStart<=6 && BlockRows>=7) res.coeffRef(l1i+6, l1j) += ei_predux(dst[6]); - if (RemainingStart<=7 && BlockRows>=8) res.coeffRef(l1i+7, l1j) += ei_predux(dst[7]); - - asm("#eigen end kernel"); - } -} - -template -template -void Product::_cacheFriendlyEvalImpl(DestDerived& res) const -{ - // FIXME find a way to optimize: (an_xpr) + (a * b) - // then we don't need to clear res and avoid and additional mat-mat sum #ifndef EIGEN_WIP_PRODUCT_DIRTY -// std::cout << "wip product\n"; res.setZero(); #endif - const int rows = _rows(); - const int cols = _cols(); - const int remainingSize = m_lhs.cols()%PacketSize; - const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries - const int l2BlockRows = MaxL2BlockSize > _rows() ? _rows() : MaxL2BlockSize; - const int l2BlockCols = MaxL2BlockSize > _cols() ? _cols() : MaxL2BlockSize; - const int l2BlockSize = MaxL2BlockSize > size ? 
size : MaxL2BlockSize; - //Scalar* __restrict__ block = new Scalar[l2blocksize*size];; - Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size); - - // loops on each L2 cache friendly blocks of the result - for(int l2i=0; l2i<_rows(); l2i+=l2BlockRows) - { - const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows); - const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw - const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows - - // build a cache friendly block - int count = 0; - - // copy l2blocksize rows of m_lhs to blocks of ps x bw - for(int l2k=0; l2k0) - { - for (int k=l2k; k( - res, l2i, l2j, l2k, l1i, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); -#if 0 - for(int l1j=l2j; l1j(k, l1j); - // TODO make this branching compile time (costly for doubles) - if (rhsIsAligned) - tmp = ei_pload(&m_rhs.derived().data()[l1jsize + k]); - else - tmp = ei_ploadu(&m_rhs.derived().data()[l1jsize + k]); - - b0 = ei_pload(&(localB[k*bw])); - b1 = ei_pload(&(localB[k*bw+ps])); - dst[0] = ei_pmadd(tmp, b0, dst[0]); - b0 = ei_pload(&(localB[k*bw+2*ps])); - dst[1] = ei_pmadd(tmp, b1, dst[1]); - b1 = ei_pload(&(localB[k*bw+3*ps])); - dst[2] = ei_pmadd(tmp, b0, dst[2]); - if (MaxBlockRows==8) - b0 = ei_pload(&(localB[k*bw+4*ps])); - dst[3] = ei_pmadd(tmp, b1, dst[3]); - if (MaxBlockRows==8) - { - b1 = ei_pload(&(localB[k*bw+5*ps])); - dst[4] = ei_pmadd(tmp, b0, dst[4]); - b0 = ei_pload(&(localB[k*bw+6*ps])); - dst[5] = ei_pmadd(tmp, b1, dst[5]); - b1 = ei_pload(&(localB[k*bw+7*ps])); - dst[6] = ei_pmadd(tmp, b0, dst[6]); - dst[7] = ei_pmadd(tmp, b1, dst[7]); - } - } - -// if (resIsAligned) - { - res.template writePacketCoeff(l1i, l1j, ei_padd(res.template packetCoeff(l1i, l1j), ei_preduxp(dst))); - if (PacketSize==2) - res.template writePacketCoeff(l1i+2,l1j, ei_padd(res.template packetCoeff(l1i+2,l1j), ei_preduxp(&(dst[2])))); - if (MaxBlockRows==8) - { - res.template writePacketCoeff(l1i+4,l1j, ei_padd(res.template packetCoeff(l1i+4,l1j), ei_preduxp(&(dst[4])))); - if (PacketSize==2) - res.template writePacketCoeff(l1i+6,l1j, ei_padd(res.template packetCoeff(l1i+6,l1j), ei_preduxp(&(dst[6])))); - } - } -// else -// { -// // TODO uncommenting this code kill the perf, even though it is never called !! -// // this is because dst cannot be a set of registers only -// // TODO optimize this loop -// // TODO is it better to do one redux at once or packet reduxes + unaligned store ? 
-// for (int w = 0; w0) - { - // this is an attempt to build an array of kernels, but I did not manage to get it compiles -// typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*); -// Kernel kernels[8]; -// kernels[0] = (Kernel)(&Product::template _cacheFriendlyEvalKernel); -// kernels[l2blockRemainingRows](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); - - switch(l2blockRemainingRows) - { - case 1:_cacheFriendlyEvalKernel( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 2:_cacheFriendlyEvalKernel( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 3:_cacheFriendlyEvalKernel( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 4:_cacheFriendlyEvalKernel( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 5:_cacheFriendlyEvalKernel( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 6:_cacheFriendlyEvalKernel( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - case 7:_cacheFriendlyEvalKernel( - res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break; - default: - ei_internal_assert(false && "internal error"); break; - } - } - } - } - } - - // handle the part which cannot be processed by the vectorized path - if (remainingSize) - { - res += Product< - Block::type,Dynamic,Dynamic>, - Block::type,Dynamic,Dynamic>, - NormalProduct>( - m_lhs.block(0,size, _rows(), remainingSize), - m_rhs.block(size,0, remainingSize, _cols())).lazy(); -// res += m_lhs.block(0,size, _rows(), remainingSize)._lazyProduct(m_rhs.block(size,0, remainingSize, _cols())); - } - -// delete[] block; + ei_cache_friendly_product( + _rows(), _cols(), m_lhs.cols(), + _LhsNested::Flags&RowMajorBit, &(m_lhs.const_cast_derived().coeffRef(0,0)), m_lhs.stride(), + _RhsNested::Flags&RowMajorBit, &(m_rhs.const_cast_derived().coeffRef(0,0)), m_rhs.stride(), + Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride() + ); } #endif // EIGEN_PRODUCT_H diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index e4af78c7c..d98d06597 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -69,6 +69,8 @@ template class Transpose int _rows() const { return m_matrix.cols(); } int _cols() const { return m_matrix.rows(); } + int _stride(void) const { return m_matrix.stride(); } + Scalar& _coeffRef(int row, int col) { return m_matrix.const_cast_derived().coeffRef(col, row); diff --git a/Eigen/src/Core/Triangular.h b/Eigen/src/Core/Triangular.h index 7d5884de3..8a17a3fee 100755 --- a/Eigen/src/Core/Triangular.h +++ b/Eigen/src/Core/Triangular.h @@ -67,7 +67,7 @@ struct ei_traits > ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Flags = (_MatrixTypeNested::Flags & ~(VectorizableBit | Like1DArrayBit)) | Mode, + Flags = (_MatrixTypeNested::Flags & ~(VectorizableBit | Like1DArrayBit | DirectAccessBit)) | Mode, CoeffReadCost = _MatrixTypeNested::CoeffReadCost }; }; diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 48498bfae..8aa16aa81 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -43,18 
+43,18 @@ const unsigned int NullDiagBit = 0x40; ///< means all diagonal coefficients
 const unsigned int UnitDiagBit = 0x80; ///< means all diagonal coefficients are equal to 1
 const unsigned int NullLowerBit = 0x200; ///< means the strictly triangular lower part is 0
 const unsigned int NullUpperBit = 0x400; ///< means the strictly triangular upper part is 0
-const unsigned int ReferencableBit = 0x800; ///< means the expression is writable through MatrixBase::coeffRef(int,int)
+const unsigned int DirectAccessBit = 0x800; ///< means the underlying matrix data can be directly accessed

 enum { Upper=NullLowerBit, Lower=NullUpperBit };
 enum { Aligned=0, UnAligned=1 };

 // list of flags that are lost by default
-const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | ReferencableBit
+const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | DirectAccessBit
                                            | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit);

 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
-enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct, LazyProduct};
+enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, LazyProduct};

 #endif // EIGEN_CONSTANTS_H
diff --git a/Eigen/src/QR/EigenSolver.h b/Eigen/src/QR/EigenSolver.h
new file mode 100644
index 000000000..47199862f
--- /dev/null
+++ b/Eigen/src/QR/EigenSolver.h
@@ -0,0 +1,848 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+ +#ifndef EIGEN_EIGENSOLVER_H +#define EIGEN_EIGENSOLVER_H + +/** \class EigenSolver + * + * \brief Eigen values/vectors solver + * + * \param MatrixType the type of the matrix of which we are computing the eigen decomposition + * + * \note this code was adapted from JAMA (public domain) + * + * \sa MatrixBase::eigenvalues() + */ +template class EigenSolver +{ + public: + + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef Matrix VectorType; + + EigenSolver(const MatrixType& matrix) + : m_eivec(matrix.rows(), matrix.cols()), + m_eivalr(matrix.cols()), m_eivali(matrix.cols()), + m_H(matrix.rows(), matrix.cols()), + m_ort(matrix.cols()) + { + _compute(matrix); + } + + MatrixType eigenvectors(void) const { return m_eivec; } + + VectorType eigenvalues(void) const { return m_eivalr; } + + private: + + void _compute(const MatrixType& matrix); + + void tridiagonalization(void); + void tql2(void); + + void orthes(void); + void hqr2(void); + + protected: + MatrixType m_eivec; + VectorType m_eivalr, m_eivali; + MatrixType m_H; + VectorType m_ort; + bool m_isSymmetric; +}; + +template +void EigenSolver::_compute(const MatrixType& matrix) +{ + assert(matrix.cols() == matrix.rows()); + + m_isSymmetric = true; + int n = matrix.cols(); + for (int j = 0; (j < n) && m_isSymmetric; j++) { + for (int i = 0; (i < j) && m_isSymmetric; i++) { + m_isSymmetric = (matrix(i,j) == matrix(j,i)); + } + } + + m_eivalr.resize(n,1); + m_eivali.resize(n,1); + + if (m_isSymmetric) + { + m_eivec = matrix; + + // Tridiagonalize. + tridiagonalization(); + + // Diagonalize. + tql2(); + } + else + { + m_H = matrix; + m_ort.resize(n, 1); + + // Reduce to Hessenberg form. + orthes(); + + // Reduce Hessenberg to real Schur form. + hqr2(); + } + std::cout << m_eivali.transpose() << "\n"; +} + + +// Symmetric Householder reduction to tridiagonal form. +template +void EigenSolver::tridiagonalization(void) +{ + +// This is derived from the Algol procedures tred2 by +// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for +// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding +// Fortran subroutine in EISPACK. + + int n = m_eivec.cols(); + m_eivalr = m_eivec.row(m_eivalr.size()-1); + + // Householder reduction to tridiagonal form. + for (int i = n-1; i > 0; i--) + { + // Scale to avoid under/overflow. + Scalar scale = 0.0; + Scalar h = 0.0; + scale = m_eivalr.start(i).cwiseAbs().sum(); + + if (scale == 0.0) + { + m_eivali[i] = m_eivalr[i-1]; + m_eivalr.start(i) = m_eivec.row(i-1).start(i); + m_eivec.corner(TopLeft, i, i) = m_eivec.corner(TopLeft, i, i).diagonal().asDiagonal(); + } + else + { + // Generate Householder vector. + m_eivalr.start(i) /= scale; + h = m_eivalr.start(i).cwiseAbs2().sum(); + + Scalar f = m_eivalr[i-1]; + Scalar g = ei_sqrt(h); + if (f > 0) + g = -g; + m_eivali[i] = scale * g; + h = h - f * g; + m_eivalr[i-1] = f - g; + m_eivali.start(i).setZero(); + + // Apply similarity transformation to remaining columns. 
+ for (int j = 0; j < i; j++) + { + f = m_eivalr[j]; + m_eivec(j,i) = f; + g = m_eivali[j] + m_eivec(j,j) * f; + int bSize = i-j-1; + if (bSize>0) + { + g += (m_eivec.col(j).block(j+1, bSize).transpose() * m_eivalr.block(j+1, bSize))(0,0); + m_eivali.block(j+1, bSize) += m_eivec.col(j).block(j+1, bSize) * f; + } + m_eivali[j] = g; + } + + f = (m_eivali.start(i).transpose() * m_eivalr.start(i))(0,0); + m_eivali.start(i) = (m_eivali.start(i) - (f / (h + h)) * m_eivalr.start(i))/h; + + m_eivec.corner(TopLeft, i, i).lower() -= + ( (m_eivali.start(i) * m_eivalr.start(i).transpose()).lazy() + + (m_eivalr.start(i) * m_eivali.start(i).transpose()).lazy()); + + m_eivalr.start(i) = m_eivec.row(i-1).start(i); + m_eivec.row(i).start(i).setZero(); + } + m_eivalr[i] = h; + } + + // Accumulate transformations. + for (int i = 0; i < n-1; i++) + { + m_eivec(n-1,i) = m_eivec(i,i); + m_eivec(i,i) = 1.0; + Scalar h = m_eivalr[i+1]; + // FIXME this does not looks very stable ;) + if (h != 0.0) + { + m_eivalr.start(i+1) = m_eivec.col(i+1).start(i+1) / h; + m_eivec.corner(TopLeft, i+1, i+1) -= m_eivalr.start(i+1) + * ( m_eivec.col(i+1).start(i+1).transpose() * m_eivec.corner(TopLeft, i+1, i+1) ); + } + m_eivec.col(i+1).start(i+1).setZero(); + } + m_eivalr = m_eivec.row(m_eivalr.size()-1); + m_eivec.row(m_eivalr.size()-1).setZero(); + m_eivec(n-1,n-1) = 1.0; + m_eivali[0] = 0.0; +} + + +// Symmetric tridiagonal QL algorithm. +template +void EigenSolver::tql2(void) +{ + +// This is derived from the Algol procedures tql2, by +// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for +// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding +// Fortran subroutine in EISPACK. + + int n = m_eivalr.size(); + + for (int i = 1; i < n; i++) { + m_eivali[i-1] = m_eivali[i]; + } + m_eivali[n-1] = 0.0; + + Scalar f = 0.0; + Scalar tst1 = 0.0; + Scalar eps = std::pow(2.0,-52.0); + for (int l = 0; l < n; l++) + { + // Find small subdiagonal element + tst1 = std::max(tst1,ei_abs(m_eivalr[l]) + ei_abs(m_eivali[l])); + int m = l; + + while ( (m < n) && (ei_abs(m_eivali[m]) > eps*tst1) ) + m++; + + // If m == l, m_eivalr[l] is an eigenvalue, + // otherwise, iterate. + if (m > l) + { + int iter = 0; + do + { + iter = iter + 1; + + // Compute implicit shift + Scalar g = m_eivalr[l]; + Scalar p = (m_eivalr[l+1] - g) / (2.0 * m_eivali[l]); + Scalar r = hypot(p,1.0); + if (p < 0) + r = -r; + + m_eivalr[l] = m_eivali[l] / (p + r); + m_eivalr[l+1] = m_eivali[l] * (p + r); + Scalar dl1 = m_eivalr[l+1]; + Scalar h = g - m_eivalr[l]; + if (l+2= l; i--) + { + c3 = c2; + c2 = c; + s2 = s; + g = c * m_eivali[i]; + h = c * p; + r = hypot(p,m_eivali[i]); + m_eivali[i+1] = s * r; + s = m_eivali[i] / r; + c = p / r; + p = c * m_eivalr[i] - s * g; + m_eivalr[i+1] = h + s * (c * g + s * m_eivalr[i]); + + // Accumulate transformation. + for (int k = 0; k < n; k++) + { + h = m_eivec(k,i+1); + m_eivec(k,i+1) = s * m_eivec(k,i) + c * h; + m_eivec(k,i) = c * m_eivec(k,i) - s * h; + } + } + p = -s * s2 * c3 * el1 * m_eivali[l] / dl1; + m_eivali[l] = s * p; + m_eivalr[l] = c * p; + + // Check for convergence. + } while (ei_abs(m_eivali[l]) > eps*tst1); + } + m_eivalr[l] = m_eivalr[l] + f; + m_eivali[l] = 0.0; + } + + // Sort eigenvalues and corresponding vectors. + // TODO use a better sort algorithm !! 
+ for (int i = 0; i < n-1; i++) + { + int k = i; + Scalar minValue = m_eivalr[i]; + for (int j = i+1; j < n; j++) + { + if (m_eivalr[j] < minValue) + { + k = j; + minValue = m_eivalr[j]; + } + } + if (k != i) + { + std::swap(m_eivalr[i], m_eivalr[k]); + m_eivec.col(i).swap(m_eivec.col(k)); + } + } +} + + +// Nonsymmetric reduction to Hessenberg form. +template +void EigenSolver::orthes(void) +{ + // This is derived from the Algol procedures orthes and ortran, + // by Martin and Wilkinson, Handbook for Auto. Comp., + // Vol.ii-Linear Algebra, and the corresponding + // Fortran subroutines in EISPACK. + + int n = m_eivec.cols(); + int low = 0; + int high = n-1; + + for (int m = low+1; m <= high-1; m++) + { + // Scale column. + Scalar scale = m_H.block(m, m-1, high-m+1, 1).cwiseAbs().sum(); + if (scale != 0.0) + { + // Compute Householder transformation. + Scalar h = 0.0; + // FIXME could be rewritten, but this one looks better wrt cache + for (int i = high; i >= m; i--) + { + m_ort[i] = m_H(i,m-1)/scale; + h += m_ort[i] * m_ort[i]; + } + Scalar g = ei_sqrt(h); + if (m_ort[m] > 0) + g = -g; + h = h - m_ort[m] * g; + m_ort[m] = m_ort[m] - g; + + // Apply Householder similarity transformation + // H = (I-u*u'/h)*H*(I-u*u')/h) + int bSize = high-m+1; + m_H.block(m, m, bSize, n-m) -= ((m_ort.block(m, bSize)/h) + * (m_ort.block(m, bSize).transpose() * m_H.block(m, m, bSize, n-m)).lazy()).lazy(); + + m_H.block(0, m, high+1, bSize) -= ((m_H.block(0, m, high+1, bSize) * m_ort.block(m, bSize)).lazy() + * (m_ort.block(m, bSize)/h).transpose()).lazy(); + + m_ort[m] = scale*m_ort[m]; + m_H(m,m-1) = scale*g; + } + } + + // Accumulate transformations (Algol's ortran). + m_eivec.setIdentity(); + + for (int m = high-1; m >= low+1; m--) + { + if (m_H(m,m-1) != 0.0) + { + m_ort.block(m+1, high-m) = m_H.col(m-1).block(m+1, high-m); + + int bSize = high-m+1; + m_eivec.block(m, m, bSize, bSize) += ( (m_ort.block(m, bSize) / (m_H(m,m-1) * m_ort[m] ) ) + * (m_ort.block(m, bSize).transpose() * m_eivec.block(m, m, bSize, bSize)).lazy()); + } + } +} + + +// Complex scalar division. +template +std::complex cdiv(Scalar xr, Scalar xi, Scalar yr, Scalar yi) +{ + Scalar r,d; + if (ei_abs(yr) > ei_abs(yi)) + { + r = yi/yr; + d = yr + r*yi; + return std::complex((xr + r*xi)/d, (xi - r*xr)/d); + } + else + { + r = yr/yi; + d = yi + r*yr; + return std::complex((r*xr + xi)/d, (r*xi - xr)/d); + } +} + + +// Nonsymmetric reduction from Hessenberg to real Schur form. +template +void EigenSolver::hqr2(void) +{ + // This is derived from the Algol procedure hqr2, + // by Martin and Wilkinson, Handbook for Auto. Comp., + // Vol.ii-Linear Algebra, and the corresponding + // Fortran subroutine in EISPACK. 
+ + // Initialize + int nn = m_eivec.cols(); + int n = nn-1; + int low = 0; + int high = nn-1; + Scalar eps = pow(2.0,-52.0); + Scalar exshift = 0.0; + Scalar p=0,q=0,r=0,s=0,z=0,t,w,x,y; + + // Store roots isolated by balanc and compute matrix norm + // FIXME to be efficient the following would requires a triangular reduxion code + // Scalar norm = m_H.upper().cwiseAbs().sum() + m_H.corner(BottomLeft,n,n).diagonal().cwiseAbs().sum(); + Scalar norm = 0.0; + for (int j = 0; j < nn; j++) + { + // FIXME what's the purpose of the following since the condition is always false + if ((j < low) || (j > high)) + { + m_eivalr[j] = m_H(j,j); + m_eivali[j] = 0.0; + } + norm += m_H.col(j).start(std::min(j+1,nn)).cwiseAbs().sum(); + } + + // Outer loop over eigenvalue index + int iter = 0; + while (n >= low) + { + // Look for single small sub-diagonal element + int l = n; + while (l > low) + { + s = ei_abs(m_H(l-1,l-1)) + ei_abs(m_H(l,l)); + if (s == 0.0) + s = norm; + if (ei_abs(m_H(l,l-1)) < eps * s) + break; + l--; + } + + // Check for convergence + // One root found + if (l == n) + { + m_H(n,n) = m_H(n,n) + exshift; + m_eivalr[n] = m_H(n,n); + m_eivali[n] = 0.0; + n--; + iter = 0; + } + else if (l == n-1) // Two roots found + { + w = m_H(n,n-1) * m_H(n-1,n); + p = (m_H(n-1,n-1) - m_H(n,n)) / 2.0; + q = p * p + w; + z = ei_sqrt(ei_abs(q)); + m_H(n,n) = m_H(n,n) + exshift; + m_H(n-1,n-1) = m_H(n-1,n-1) + exshift; + x = m_H(n,n); + + // Scalar pair + if (q >= 0) + { + if (p >= 0) + z = p + z; + else + z = p - z; + + m_eivalr[n-1] = x + z; + m_eivalr[n] = m_eivalr[n-1]; + if (z != 0.0) + m_eivalr[n] = x - w / z; + + m_eivali[n-1] = 0.0; + m_eivali[n] = 0.0; + x = m_H(n,n-1); + s = ei_abs(x) + ei_abs(z); + p = x / s; + q = z / s; + r = ei_sqrt(p * p+q * q); + p = p / r; + q = q / r; + + // Row modification + for (int j = n-1; j < nn; j++) + { + z = m_H(n-1,j); + m_H(n-1,j) = q * z + p * m_H(n,j); + m_H(n,j) = q * m_H(n,j) - p * z; + } + + // Column modification + for (int i = 0; i <= n; i++) + { + z = m_H(i,n-1); + m_H(i,n-1) = q * z + p * m_H(i,n); + m_H(i,n) = q * m_H(i,n) - p * z; + } + + // Accumulate transformations + for (int i = low; i <= high; i++) + { + z = m_eivec(i,n-1); + m_eivec(i,n-1) = q * z + p * m_eivec(i,n); + m_eivec(i,n) = q * m_eivec(i,n) - p * z; + } + } + else // Complex pair + { + m_eivalr[n-1] = x + p; + m_eivalr[n] = x + p; + m_eivali[n-1] = z; + m_eivali[n] = -z; + } + n = n - 2; + iter = 0; + } + else // No convergence yet + { + // Form shift + x = m_H(n,n); + y = 0.0; + w = 0.0; + if (l < n) + { + y = m_H(n-1,n-1); + w = m_H(n,n-1) * m_H(n-1,n); + } + + // Wilkinson's original ad hoc shift + if (iter == 10) + { + exshift += x; + for (int i = low; i <= n; i++) + m_H(i,i) -= x; + s = ei_abs(m_H(n,n-1)) + ei_abs(m_H(n-1,n-2)); + x = y = 0.75 * s; + w = -0.4375 * s * s; + } + + // MATLAB's new ad hoc shift + if (iter == 30) + { + s = (y - x) / 2.0; + s = s * s + w; + if (s > 0) + { + s = ei_sqrt(s); + if (y < x) + s = -s; + s = x - w / ((y - x) / 2.0 + s); + for (int i = low; i <= n; i++) + m_H(i,i) -= s; + exshift += s; + x = y = w = 0.964; + } + } + + iter = iter + 1; // (Could check iteration count here.) 
+ + // Look for two consecutive small sub-diagonal elements + int m = n-2; + while (m >= l) + { + z = m_H(m,m); + r = x - z; + s = y - z; + p = (r * s - w) / m_H(m+1,m) + m_H(m,m+1); + q = m_H(m+1,m+1) - z - r - s; + r = m_H(m+2,m+1); + s = ei_abs(p) + ei_abs(q) + ei_abs(r); + p = p / s; + q = q / s; + r = r / s; + if (m == l) { + break; + } + if (ei_abs(m_H(m,m-1)) * (ei_abs(q) + ei_abs(r)) < + eps * (ei_abs(p) * (ei_abs(m_H(m-1,m-1)) + ei_abs(z) + + ei_abs(m_H(m+1,m+1))))) + { + break; + } + m--; + } + + for (int i = m+2; i <= n; i++) + { + m_H(i,i-2) = 0.0; + if (i > m+2) + m_H(i,i-3) = 0.0; + } + + // Double QR step involving rows l:n and columns m:n + for (int k = m; k <= n-1; k++) + { + int notlast = (k != n-1); + if (k != m) { + p = m_H(k,k-1); + q = m_H(k+1,k-1); + r = (notlast ? m_H(k+2,k-1) : 0.0); + x = ei_abs(p) + ei_abs(q) + ei_abs(r); + if (x != 0.0) + { + p = p / x; + q = q / x; + r = r / x; + } + } + + if (x == 0.0) + break; + + s = ei_sqrt(p * p + q * q + r * r); + + if (p < 0) + s = -s; + + if (s != 0) + { + if (k != m) + m_H(k,k-1) = -s * x; + else if (l != m) + m_H(k,k-1) = -m_H(k,k-1); + + p = p + s; + x = p / s; + y = q / s; + z = r / s; + q = q / p; + r = r / p; + + // Row modification + for (int j = k; j < nn; j++) + { + p = m_H(k,j) + q * m_H(k+1,j); + if (notlast) + { + p = p + r * m_H(k+2,j); + m_H(k+2,j) = m_H(k+2,j) - p * z; + } + m_H(k,j) = m_H(k,j) - p * x; + m_H(k+1,j) = m_H(k+1,j) - p * y; + } + + // Column modification + for (int i = 0; i <= std::min(n,k+3); i++) + { + p = x * m_H(i,k) + y * m_H(i,k+1); + if (notlast) + { + p = p + z * m_H(i,k+2); + m_H(i,k+2) = m_H(i,k+2) - p * r; + } + m_H(i,k) = m_H(i,k) - p; + m_H(i,k+1) = m_H(i,k+1) - p * q; + } + + // Accumulate transformations + for (int i = low; i <= high; i++) + { + p = x * m_eivec(i,k) + y * m_eivec(i,k+1); + if (notlast) + { + p = p + z * m_eivec(i,k+2); + m_eivec(i,k+2) = m_eivec(i,k+2) - p * r; + } + m_eivec(i,k) = m_eivec(i,k) - p; + m_eivec(i,k+1) = m_eivec(i,k+1) - p * q; + } + } // (s != 0) + } // k loop + } // check convergence + } // while (n >= low) + + // Backsubstitute to find vectors of upper triangular form + if (norm == 0.0) + { + return; + } + + for (n = nn-1; n >= 0; n--) + { + p = m_eivalr[n]; + q = m_eivali[n]; + + // Scalar vector + if (q == 0) + { + int l = n; + m_H(n,n) = 1.0; + for (int i = n-1; i >= 0; i--) + { + w = m_H(i,i) - p; + r = (m_H.row(i).end(nn-l) * m_H.col(n).end(nn-l))(0,0); + + if (m_eivali[i] < 0.0) + { + z = w; + s = r; + } + else + { + l = i; + if (m_eivali[i] == 0.0) + { + if (w != 0.0) + m_H(i,n) = -r / w; + else + m_H(i,n) = -r / (eps * norm); + } + else // Solve real equations + { + x = m_H(i,i+1); + y = m_H(i+1,i); + q = (m_eivalr[i] - p) * (m_eivalr[i] - p) + m_eivali[i] * m_eivali[i]; + t = (x * s - z * r) / q; + m_H(i,n) = t; + if (ei_abs(x) > ei_abs(z)) + m_H(i+1,n) = (-r - w * t) / x; + else + m_H(i+1,n) = (-s - y * t) / z; + } + + // Overflow control + t = ei_abs(m_H(i,n)); + if ((eps * t) * t > 1) + m_H.col(n).end(nn-i) /= t; + } + } + } + else if (q < 0) // Complex vector + { + std::complex cc; + int l = n-1; + + // Last vector component imaginary so matrix is triangular + if (ei_abs(m_H(n,n-1)) > ei_abs(m_H(n-1,n))) + { + m_H(n-1,n-1) = q / m_H(n,n-1); + m_H(n-1,n) = -(m_H(n,n) - p) / m_H(n,n-1); + } + else + { + cc = cdiv(0.0,-m_H(n-1,n),m_H(n-1,n-1)-p,q); + m_H(n-1,n-1) = ei_real(cc); + m_H(n-1,n) = ei_imag(cc); + } + m_H(n,n-1) = 0.0; + m_H(n,n) = 1.0; + for (int i = n-2; i >= 0; i--) + { + Scalar ra,sa,vr,vi; + ra = (m_H.row(i).end(nn-l) 
* m_H.col(n-1).end(nn-l)).lazy()(0,0); + sa = (m_H.row(i).end(nn-l) * m_H.col(n).end(nn-l)).lazy()(0,0); + w = m_H(i,i) - p; + + if (m_eivali[i] < 0.0) + { + z = w; + r = ra; + s = sa; + } + else + { + l = i; + if (m_eivali[i] == 0) + { + cc = cdiv(-ra,-sa,w,q); + m_H(i,n-1) = ei_real(cc); + m_H(i,n) = ei_imag(cc); + } + else + { + // Solve complex equations + x = m_H(i,i+1); + y = m_H(i+1,i); + vr = (m_eivalr[i] - p) * (m_eivalr[i] - p) + m_eivali[i] * m_eivali[i] - q * q; + vi = (m_eivalr[i] - p) * 2.0 * q; + if ((vr == 0.0) && (vi == 0.0)) + vr = eps * norm * (ei_abs(w) + ei_abs(q) + ei_abs(x) + ei_abs(y) + ei_abs(z)); + + cc= cdiv(x*r-z*ra+q*sa,x*s-z*sa-q*ra,vr,vi); + m_H(i,n-1) = ei_real(cc); + m_H(i,n) = ei_imag(cc); + if (ei_abs(x) > (ei_abs(z) + ei_abs(q))) + { + m_H(i+1,n-1) = (-ra - w * m_H(i,n-1) + q * m_H(i,n)) / x; + m_H(i+1,n) = (-sa - w * m_H(i,n) - q * m_H(i,n-1)) / x; + } + else + { + cc = cdiv(-r-y*m_H(i,n-1),-s-y*m_H(i,n),z,q); + m_H(i+1,n-1) = ei_real(cc); + m_H(i+1,n) = ei_imag(cc); + } + } + + // Overflow control + t = std::max(ei_abs(m_H(i,n-1)),ei_abs(m_H(i,n))); + if ((eps * t) * t > 1) + m_H.block(i, n-1, nn-i, 2) /= t; + + } + } + } + } + + // Vectors of isolated roots + for (int i = 0; i < nn; i++) + { + // FIXME again what's the purpose of this test ? + // in this algo low==0 and high==nn-1 !! + if (i < low || i > high) + { + m_eivec.row(i).end(nn-i) = m_H.row(i).end(nn-i); + } + } + + // Back transformation to get eigenvectors of original matrix + int bRows = high-low+1; + for (int j = nn-1; j >= low; j--) + { + int bSize = std::min(j,high)-low+1; + m_eivec.col(j).block(low, bRows) = (m_eivec.block(low, low, bRows, bSize) * m_H.col(j).block(low, bSize)); + } +} + +#endif // EIGEN_EIGENSOLVER_H diff --git a/Eigen/src/QR/QR.h b/Eigen/src/QR/QR.h index d0121cc7a..d42153eb9 100644 --- a/Eigen/src/QR/QR.h +++ b/Eigen/src/QR/QR.h @@ -95,10 +95,11 @@ void QR::_compute(const MatrixType& matrix) m_qr(k,k) += 1.0; // apply transformation to remaining columns - for (int j = k+1; j < cols; j++) + int remainingCols = cols - k -1; + if (remainingCols>0) { - Scalar s = -(m_qr.col(k).end(remainingSize).transpose() * m_qr.col(j).end(remainingSize))(0,0) / m_qr(k,k); - m_qr.col(j).end(remainingSize) += s * m_qr.col(k).end(remainingSize); + m_qr.corner(BottomRight, remainingSize, remainingCols) -= (1./m_qr(k,k)) * m_qr.col(k).end(remainingSize) + * (m_qr.col(k).end(remainingSize).transpose() * m_qr.corner(BottomRight, remainingSize, remainingCols)); } } m_norms[k] = -nrm; -- cgit v1.2.3
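
A note on the final QR.h hunk: the removed loop and the new block expression perform the same Householder update. Writing v = m_qr.col(k).end(remainingSize) and v0 = m_qr(k,k) (both already modified in place at this point of _compute), the old code updated one column at a time as a_j <- a_j - (v^T a_j / v0) * v, while the new code applies the equivalent rank-1 update

    A <- A - (1/v0) * v * (v^T * A)

to the whole BottomRight corner, so the update is evaluated as a single matrix expression.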
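
The DirectAccessBit/stride() contract introduced by this commit can be read off the way _cacheFriendlyEval calls ei_cache_friendly_product: each operand is described by &(xpr.const_cast_derived().coeffRef(0,0)), xpr.stride() and Flags&RowMajorBit. The following is a sketch of the assumed addressing rule; the helper direct_coeff is hypothetical, not code from this patch:

template<typename Expr>
typename Expr::Scalar direct_coeff(const Expr& xpr, int i, int j)
{
  // only valid when Expr::Flags contains DirectAccessBit, i.e. the
  // coefficients live in one strided array reachable via coeffRef(0,0)
  const typename Expr::Scalar* data = &xpr.const_cast_derived().coeffRef(0,0);
  return (Expr::Flags & RowMajorBit)
       ? data[i * xpr.stride() + j]   // row-major: stride() steps between rows
       : data[i + j * xpr.stride()];  // column-major: stride() steps between columns
}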
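
The blocked kernel in CacheFriendlyProduct.h is easiest to audit against the unblocked computation it must reproduce. The reference sketch below (standalone C++, not part of the patch) uses the same (rowMajor, pointer, stride) operand description; like ei_cache_friendly_product it accumulates into res, which is why _cacheFriendlyEval first clears the result with res.setZero() unless EIGEN_WIP_PRODUCT_DIRTY is defined:

template<typename Scalar>
void naive_product_reference(int rows, int cols, int depth,
                             bool lhsRowMajor, const Scalar* lhs, int lhsStride,
                             bool rhsRowMajor, const Scalar* rhs, int rhsStride,
                             bool resRowMajor, Scalar* res, int resStride)
{
  // res(i,j) += sum_k lhs(i,k) * rhs(k,j); the optimized kernel reorders
  // these loops and copies L2-sized blocks, but must compute exactly this
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
    {
      Scalar sum = Scalar(0);
      for (int k = 0; k < depth; ++k)
        sum += (lhsRowMajor ? lhs[i*lhsStride + k] : lhs[i + k*lhsStride])
             * (rhsRowMajor ? rhs[k*rhsStride + j] : rhs[k + j*rhsStride]);
      if (resRowMajor) res[i*resStride + j] += sum;
      else             res[i + j*resStride] += sum;
    }
}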
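
Finally, a usage sketch for the draft EigenSolver. The solver handles real matrices only: symmetric input takes the tridiagonalization/tql2 path (eigenvalues come out sorted in increasing order), anything else goes through orthes/hqr2, and eigenvalues() currently returns only the real parts (m_eivalr); note also the leftover debug std::cout of m_eivali in _compute. The snippet assumes the usual Matrix3d typedef and the comma initializer declared in MatrixBase.h; it is an illustration, not test code from the patch:

Matrix3d A;
A << 2, 1, 0,
     1, 2, 1,
     0, 1, 2;                    // symmetric, so the tql2 path is taken
EigenSolver<Matrix3d> eig(A);    // the whole decomposition runs in the constructor
Matrix3d V = eig.eigenvectors(); // eigenvectors stored as columns
EigenSolver<Matrix3d>::VectorType lambda = eig.eigenvalues(); // real parts only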