Diffstat (limited to 'third_party/eigen3/Eigen/src/Core/products')
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h                  454
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h           2197
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h                465
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h      285
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h  146
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h            118
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h                618
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h            131
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/Parallelizer.h                       158
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h            523
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h        295
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h            281
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h        114
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointProduct.h                 123
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h              93
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h             434
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h         309
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h             354
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h         247
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h             331
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h         155
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularSolverVector.h             145
22 files changed, 7976 insertions, 0 deletions
diff --git a/third_party/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h b/third_party/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h
new file mode 100644
index 0000000000..35a6e36e81
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h
@@ -0,0 +1,454 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COEFFBASED_PRODUCT_H
+#define EIGEN_COEFFBASED_PRODUCT_H
+
+namespace Eigen {
+
+namespace internal {
+
+/*********************************************************************************
+* Coefficient based product implementation.
+* It is designed for the following use cases:
+* - small fixed sizes
+* - lazy products
+*********************************************************************************/
+
+/* Since all the dimensions of the product are small, here we can rely
+ * on the generic Assign mechanism to evaluate the product per coeff (or packet).
+ *
+ * Note that here the inner-loops should always be unrolled.
+ */
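A minimal illustrative sketch of what this coefficient-wise evaluation amounts to at the user level (the matrices A, B, C are hypothetical; the comment shows the fully unrolled 3x3 expansion):

  #include <Eigen/Core>

  void lazy_product_sketch()
  {
    Eigen::Matrix3f A = Eigen::Matrix3f::Random();
    Eigen::Matrix3f B = Eigen::Matrix3f::Random();

    // Small fixed-size lazy product: no temporary and no call to the blocked
    // GEMM kernels; each coefficient of C is evaluated by the Assign mechanism.
    Eigen::Matrix3f C = A.lazyProduct(B);

    // Conceptually, for every (i,j) the assignment computes
    //   C(i,j) = A(i,0)*B(0,j) + A(i,1)*B(1,j) + A(i,2)*B(2,j);
    // with the inner sum unrolled at compile time.
    (void)C;
  }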
+
+template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl;
+
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl;
+
+template<typename LhsNested, typename RhsNested, int NestingFlags>
+struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
+{
+ typedef MatrixXpr XprKind;
+ typedef typename remove_all<LhsNested>::type _LhsNested;
+ typedef typename remove_all<RhsNested>::type _RhsNested;
+ typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
+ typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
+ typename traits<_RhsNested>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<_LhsNested>::Index,
+ typename traits<_RhsNested>::Index>::type Index;
+
+ enum {
+ LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+ RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+ LhsFlags = _LhsNested::Flags,
+ RhsFlags = _RhsNested::Flags,
+
+ RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+ ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+ InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+ MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+ MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+ LhsRowMajor = LhsFlags & RowMajorBit,
+ RhsRowMajor = RhsFlags & RowMajorBit,
+
+ SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
+
+ CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
+ && (ColsAtCompileTime == Dynamic
+ || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
+ && (RhsFlags&AlignedBit)
+ )
+ ),
+
+ CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
+ && (RowsAtCompileTime == Dynamic
+ || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
+ && (LhsFlags&AlignedBit)
+ )
+ ),
+
+ EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+ : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+ : (RhsRowMajor && !CanVectorizeLhs),
+
+ Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
+ | (EvalToRowMajor ? RowMajorBit : 0)
+ | NestingFlags
+ | (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
+ | (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
+ // TODO enable vectorization for mixed types
+ | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
+
+ CoeffReadCost = InnerSize == Dynamic ? Dynamic
+ : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+ + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+
+ /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+ * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+ * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, which doesn't affect the ABI.
+ */
+ CanVectorizeInner = SameType
+ && LhsRowMajor
+ && (!RhsRowMajor)
+ && (LhsFlags & RhsFlags & ActualPacketAccessBit)
+ && (LhsFlags & RhsFlags & AlignedBit)
+ && (InnerSize % packet_traits<Scalar>::size == 0)
+ };
+};
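As a worked example of the CoeffReadCost formula above: for a hypothetical 3x3 float product, InnerSize is 3, each operand coefficient read costs 1, and NumTraits<float>::MulCost and AddCost are both 1, so CoeffReadCost = 3 * (1 + 1 + 1) + (3 - 1) * 1 = 11. That is well below the default EIGEN_UNROLLING_LIMIT of 100, which is what lets the CoeffBasedProduct class below enable full unrolling (its Unroll enum) for such small products.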
+
+} // end namespace internal
+
+template<typename LhsNested, typename RhsNested, int NestingFlags>
+class CoeffBasedProduct
+ : internal::no_assignment_operator,
+ public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
+{
+ public:
+
+ typedef MatrixBase<CoeffBasedProduct> Base;
+ EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
+ typedef typename Base::PlainObject PlainObject;
+
+ private:
+
+ typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
+ typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;
+
+ enum {
+ PacketSize = internal::packet_traits<Scalar>::size,
+ InnerSize = internal::traits<CoeffBasedProduct>::InnerSize,
+ Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+ CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
+ };
+
+ typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
+ Unroll ? InnerSize-1 : Dynamic,
+ _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
+
+ typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
+
+ public:
+
+ EIGEN_DEVICE_FUNC
+ inline CoeffBasedProduct(const CoeffBasedProduct& other)
+ : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
+ {}
+
+ template<typename Lhs, typename Rhs>
+ EIGEN_DEVICE_FUNC
+ inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
+ : m_lhs(lhs), m_rhs(rhs)
+ {
+ // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
+      // We still allow mixing T and complex<T>.
+ EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined),
+ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ eigen_assert(lhs.cols() == rhs.rows()
+ && "invalid matrix product"
+ && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+ {
+ Scalar res;
+ ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+ return res;
+ }
+
+ /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
+ * which is why we don't set the LinearAccessBit.
+ */
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+ {
+ Scalar res;
+ const Index row = RowsAtCompileTime == 1 ? 0 : index;
+ const Index col = RowsAtCompileTime == 1 ? index : 0;
+ ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+ return res;
+ }
+
+ template<int LoadMode>
+ EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
+ {
+ PacketScalar res;
+ internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
+ Unroll ? InnerSize-1 : Dynamic,
+ _LhsNested, _RhsNested, PacketScalar, LoadMode>
+ ::run(row, col, m_lhs, m_rhs, res);
+ return res;
+ }
+
+ // Implicit conversion to the nested type (trigger the evaluation of the product)
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE operator const PlainObject& () const
+ {
+ m_result.lazyAssign(*this);
+ return m_result;
+ }
+
+ EIGEN_DEVICE_FUNC const _LhsNested& lhs() const { return m_lhs; }
+ EIGEN_DEVICE_FUNC const _RhsNested& rhs() const { return m_rhs; }
+
+ EIGEN_DEVICE_FUNC
+ const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
+ { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
+
+ template<int DiagonalIndex>
+ EIGEN_DEVICE_FUNC
+ const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
+ { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
+
+ EIGEN_DEVICE_FUNC
+ const Diagonal<const LazyCoeffBasedProductType, DynamicIndex> diagonal(Index index) const {
+ return Diagonal<const LazyCoeffBasedProductType, DynamicIndex>(
+ reinterpret_cast<const LazyCoeffBasedProductType&>(*this), index);
+ }
+
+ protected:
+ typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
+ typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
+
+ mutable PlainObject m_result;
+};
+
+namespace internal {
+
+// here we need to overload the nested rule for products
+// such that the nested type is a const reference to a plain matrix
+template<typename Lhs, typename Rhs, int N, typename PlainObject>
+struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
+{
+ typedef PlainObject const& type;
+};
+
+/***************************************************************************
+* Normal product .coeff() implementation (with meta-unrolling)
+***************************************************************************/
+
+/**************************************
+*** Scalar path - no vectorization ***
+**************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::Index Index;
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+ {
+ product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
+ res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::Index Index;
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+ {
+ res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::Index Index;
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
+ {
+ eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
+ res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+ for(Index i = 1; i < lhs.cols(); ++i)
+ res += lhs.coeff(row, i) * rhs.coeff(i, col);
+ }
+};
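For reference, a hand-expanded sketch of what the recursion above generates when the unrolled path is taken with InnerSize == 3 (so the outermost UnrollingIndex is 2); the function name and int indices are hypothetical:

  template<typename Lhs, typename Rhs, typename Scalar>
  inline void unrolled_coeff_sketch(int row, int col,
                                    const Lhs& lhs, const Rhs& rhs, Scalar& res)
  {
    res  = lhs.coeff(row, 0) * rhs.coeff(0, col);  // base case (UnrollingIndex == 0)
    res += lhs.coeff(row, 1) * rhs.coeff(1, col);  // UnrollingIndex == 1
    res += lhs.coeff(row, 2) * rhs.coeff(2, col);  // UnrollingIndex == 2 (top-level call)
  }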
+
+/*******************************************
+*** Scalar path with inner vectorization ***
+*******************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
+struct product_coeff_vectorized_unroller
+{
+ typedef typename Lhs::Index Index;
+ enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
+ {
+ product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
+ pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet>
+struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
+ {
+ pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
+ }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::PacketScalar Packet;
+ typedef typename Lhs::Index Index;
+ enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+ {
+ Packet pres;
+ product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
+ res = predux(pres);
+ }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
+struct product_coeff_vectorized_dyn_selector
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
+ }
+};
+
+// NOTE the following 3 specializations exist because taking .col(0) on a vector is a bit slower
+// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
+template<typename Lhs, typename Rhs, int RhsCols>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
+ }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
+ }
+};
+
+template<typename Lhs, typename Rhs>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ res = lhs.transpose().cwiseProduct(rhs).sum();
+ }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
+ }
+};
+
+/*******************
+*** Packet path ***
+*******************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+ {
+ product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
+ res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
+ }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+ {
+ product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
+ res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+ {
+ res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+ {
+ res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
+ {
+ eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
+ res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+ for(Index i = 1; i < lhs.cols(); ++i)
+ res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
+ {
+ eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
+ res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+ for(Index i = 1; i < lhs.cols(); ++i)
+ res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COEFFBASED_PRODUCT_H
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/third_party/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
new file mode 100644
index 0000000000..80bd6aa0e6
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -0,0 +1,2197 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
+#define EIGEN_GENERAL_BLOCK_PANEL_H
+
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+class gebp_traits;
+
+
+/** \internal \returns b if a<=0, and returns a otherwise. */
+inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
+{
+ return a<=0 ? b : a;
+}
+
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+#else
+const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+#endif
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
+{
+ static bool m_cache_sizes_initialized = false;
+ static std::ptrdiff_t m_l1CacheSize = 0;
+ static std::ptrdiff_t m_l2CacheSize = 0;
+ static std::ptrdiff_t m_l3CacheSize = 0;
+
+ if(EIGEN_UNLIKELY(!m_cache_sizes_initialized))
+ {
+ int l1CacheSize, l2CacheSize, l3CacheSize;
+ queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+ m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+ m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+ m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
+ m_cache_sizes_initialized = true;
+ }
+
+ if(EIGEN_UNLIKELY(action==SetAction))
+ {
+    // set the CPU cache sizes (in bytes) that will be used to compute the block sizes
+ eigen_internal_assert(l1!=0 && l2!=0);
+ m_l1CacheSize = *l1;
+ m_l2CacheSize = *l2;
+ m_l3CacheSize = *l3;
+ }
+ else if(EIGEN_LIKELY(action==GetAction))
+ {
+ eigen_internal_assert(l1!=0 && l2!=0);
+ *l1 = m_l1CacheSize;
+ *l2 = m_l2CacheSize;
+ *l3 = m_l3CacheSize;
+ }
+ else
+ {
+ eigen_internal_assert(false);
+ }
+}
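A short usage sketch for the helper above, assuming the caller lives in (or qualifies) the Eigen::internal namespace as this file does; the byte values are only examples:

  std::ptrdiff_t l1, l2, l3;
  // Read the cache sizes currently used by the blocking heuristics.
  manage_caching_sizes(GetAction, &l1, &l2, &l3);

  // Override them, e.g. for a machine with a 48KB L1 and 1MB L2.
  std::ptrdiff_t newL1 = 48*1024, newL2 = 1024*1024, newL3 = 8*1024*1024;
  manage_caching_sizes(SetAction, &newL1, &newL2, &newL3);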
+
+#define CEIL(a, b) (((a)+(b)-1)/(b))
+
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depend on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+ // Explanations:
+  // Let's recall that the product algorithms form kc x nc horizontal panels B' on the rhs and
+ // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
+ // per kc x nr vertical small panels where nr is the blocking size along the n dimension
+  // at the register level. For vectorization purposes, these small vertical panels are unpacked,
+  // i.e., each coefficient is replicated to fit a packet. This small vertical panel has to
+ // stay in L1 cache.
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+ enum {
+ kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+ ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+ k_mask = (0xffffffff/8)*8,
+
+ mr = Traits::mr,
+ mr_mask = (0xffffffff/mr)*mr,
+
+ nr = Traits::nr,
+ nr_mask = (0xffffffff/nr)*nr
+ };
+
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+ // Increasing k gives us more time to prefetch the content of the "C"
+ // registers. However once the latency is hidden there is no point in
+ // increasing the value of k, so we'll cap it at 320 (value determined
+ // experimentally).
+ const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
+ if (k_cache < k) {
+ k = k_cache & k_mask;
+ eigen_assert(k > 0);
+ }
+
+ const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+ Index n_per_thread = CEIL(n, num_threads);
+ if (n_cache <= n_per_thread) {
+ // Don't exceed the capacity of the l2 cache.
+ if (n_cache < nr) {
+ n = nr;
+ } else {
+ n = n_cache & nr_mask;
+ eigen_assert(n > 0);
+ }
+ } else {
+ n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
+ }
+
+ if (l3 > l2) {
+ // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+ const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+ const Index m_per_thread = CEIL(m, num_threads);
+ if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+ m = m_cache & mr_mask;
+ eigen_assert(m > 0);
+ } else {
+ m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
+ }
+ }
+}
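To make the k computation above concrete, here is a worked example under stated assumptions (float lhs/rhs/result, 4-wide SSE-style packets, 16 registers, the non-FMA mr, KcFactor = 1, a 32KB L1): mr = (16/2/4)*4 = 8 and nr = 4, so kdiv = 8*4 + 4*4 = 48 bytes per unit of k, ksub = 8*4*4 = 128 bytes, and (32768 - 128)/48 = 680, which the 320 cap reduces to kc = 320 (already a multiple of 8, so the masking keeps it). With wider packets or the FMA path's mr = 3*LhsPacketSize, the numbers change accordingly.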
+
+template <typename Index>
+bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+ if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+ k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+ m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+ n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+ return true;
+ }
+#else
+ EIGEN_UNUSED_VARIABLE(k)
+ EIGEN_UNUSED_VARIABLE(m)
+ EIGEN_UNUSED_VARIABLE(n)
+#endif
+ return false;
+}
+
+/** \brief Computes the blocking parameters for an m x k times k x n matrix product
+ *
+ * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+ * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+ * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
+ *
+ * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms.
+ *
+ * The blocking size parameters may be evaluated:
+ * - either by a heuristic based on cache sizes;
+ * - or using fixed prescribed values (for testing purposes).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+ if (!k || !m || !n) {
+ return;
+ }
+
+ if (!useSpecificBlockingSizes(k, m, n)) {
+ evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
+ }
+
+#if !EIGEN_ARCH_i386_OR_x86_64
+ // The following code rounds k,m,n down to the nearest multiple of register-level blocking sizes.
+ // We should always do that, and in upstream Eigen we always do that.
+ // Unfortunately, we can't do that in Google3 on x86[-64] because this makes tiny differences in results and
+  // we have some unfortunate tests that require very specific relative errors and fail because of that,
+ // at least //learning/laser/algorithms/wals:wals_batch_solver_test.
+ // Note that this wouldn't make any difference if we had been using only correctly rounded values,
+ // but we've not! See how in evaluateProductBlockingSizesHeuristic, we do the rounding down by
+ // bit-masking, e.g. mr_mask = (0xffffffff/mr)*mr, implicitly assuming that mr is always a power of
+ // two, which is not the case with the 3px4 kernel.
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ enum {
+ kr = 8,
+ mr = Traits::mr,
+ nr = Traits::nr
+ };
+ if (k > kr) k -= k % kr;
+ if (m > mr) m -= m % mr;
+ if (n > nr) n -= n % nr;
+#endif
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Index>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads)
+{
+ computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
+}
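Usage sketch for computeProductBlockingSizes (hypothetical problem sizes; on return the same variables hold the blocking sizes instead of the problem sizes):

  std::ptrdiff_t k = 4096, m = 4096, n = 4096;   // depth, rows, cols of the product
  computeProductBlockingSizes<float, float, 1>(k, m, n);   // KcFactor = 1, num_threads defaults to 1
  // k, m, n now contain kc, mc, nc: the panel/block sizes that the packing
  // routines and the gebp kernel will use so the working set fits the caches.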
+
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+ #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
+#else
+
+ // FIXME (a bit overkill maybe ?)
+
+ template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
+ EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
+ {
+ c = cj.pmadd(a,b,c);
+ }
+ };
+
+ template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
+ EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
+ {
+ t = b; t = cj.pmul(a,t); c = padd(c,t);
+ }
+ };
+
+ template<typename CJ, typename A, typename B, typename C, typename T>
+ EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
+ {
+ gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
+ }
+
+ #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
+// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
+#endif
+
+/* Vectorization logic
+ * real*real: unpack rhs to constant packets, ...
+ *
+ * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
+ * storing each res packet into two packets (2x2),
+ * at the end combine them: swap the second and addsub them
+ * cf*cf : same but with 2x4 blocks
+ * cplx*real : unpack rhs to constant packets, ...
+ * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
+ */
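A brief sketch of the algebra behind the cd*cd scheme described above (commentary, not code from the file): writing a = a_r + i*a_i and b = b_r + i*b_i, the product is a*b = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r). The kernel keeps the lhs as an interleaved real packet (a_r, a_i, ...) and broadcasts the rhs into two real packets (b_r, b_r, ...) and (b_i, b_i, ...), accumulating c.first += a*b_r = (a_r*b_r, a_i*b_r, ...) and c.second += a*b_i = (a_r*b_i, a_i*b_i, ...). At the end (see the acc() overload for DoublePacket further down), conjugating c.second and swapping its real/imaginary lanes turns it into (-a_i*b_i, a_r*b_i, ...), so that adding it to c.first yields exactly the real and imaginary parts above.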
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits
+{
+public:
+ typedef _LhsScalar LhsScalar;
+ typedef _RhsScalar RhsScalar;
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+ // register block size along the N direction must be 1 or 4
+ nr = 4,
+
+ // register block size along the M direction (currently, this one cannot be modified)
+ default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+ // we assume 16 registers
+ mr = Vectorizable ? 3*LhsPacketSize : default_mr,
+#else
+ mr = default_mr,
+#endif
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ pbroadcast4(b, b0, b1, b2, b3);
+ }
+
+// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+// {
+// pbroadcast2(b, b0, b1);
+// }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ dest = pset1<RhsPacketType>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = ploadquad<RhsPacket>(b);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = pload<LhsPacketType>(a);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = ploadu<LhsPacketType>(a);
+ }
+
+ template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+ {
+ // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
+ // let gcc allocate the register in which to store the result of the pmul
+ // (in the case where there is no FMA) gcc fails to figure out how to avoid
+    // spilling registers.
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c = pmadd(a,b,c);
+#else
+ tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = pmadd(c,alpha,r);
+ }
+
+ template<typename ResPacketHalf>
+ EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
+ {
+ r = pmadd(c,alpha,r);
+ }
+
+protected:
+// conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+// conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
+};
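A compressed sketch of how the gebp kernel further down drives this traits API for a single register-level update; blockA/blockB point into packed panels, r0 maps one result column, and all parameters of the sketch are hypothetical:

  template<typename DataMapper>
  void gebp_traits_usage_sketch(const float* blockA, const float* blockB,
                                typename DataMapper::LinearMapper r0, float alpha)
  {
    typedef gebp_traits<float,float> Traits;
    Traits traits;

    Traits::AccPacket C0;       traits.initAcc(C0);           // C0 = 0
    Traits::LhsPacket A0;       traits.loadLhs(blockA, A0);   // packed lhs chunk
    Traits::RhsPacket B_0, T0;  traits.loadRhs(blockB, B_0);  // broadcast rhs coefficient
    traits.madd(A0, B_0, C0, T0);                             // C0 += A0 * B_0

    Traits::ResPacket alphav = pset1<Traits::ResPacket>(alpha);
    Traits::ResPacket R = r0.loadPacket(0);                   // existing result values
    traits.acc(C0, alphav, R);                                // R += alpha * C0
    r0.storePacket(0, R);
  }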
+
+template<typename RealScalar, bool _ConjLhs>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+{
+public:
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef RealScalar RhsScalar;
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = false,
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+ nr = 4,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+ // we assume 16 registers
+ mr = 3*LhsPacketSize,
+#else
+ mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#endif
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = pset1<RhsPacket>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = pset1<RhsPacket>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = pload<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploadu<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ pbroadcast4(b, b0, b1, b2, b3);
+ }
+
+// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+// {
+// pbroadcast2(b, b0, b1);
+// }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+ {
+ madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+ {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c.v = pmadd(a.v,b,c.v);
+#else
+ tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+ {
+ c += a * b;
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = cj.pmadd(c,alpha,r);
+ }
+
+protected:
+ conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
+};
+
+template<typename Packet>
+struct DoublePacket
+{
+ Packet first;
+ Packet second;
+};
+
+template<typename Packet>
+DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+{
+ DoublePacket<Packet> res;
+ res.first = padd(a.first, b.first);
+ res.second = padd(a.second,b.second);
+ return res;
+}
+
+template<typename Packet>
+const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
+{
+ return a;
+}
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
+// template<typename Packet>
+// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+// {
+// DoublePacket<Packet> res;
+// res.first = padd(a.first, b.first);
+// res.second = padd(a.second,b.second);
+// return res;
+// }
+
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+{
+public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef std::complex<RealScalar> RhsScalar;
+ typedef std::complex<RealScalar> ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = packet_traits<RealScalar>::Vectorizable
+ && packet_traits<Scalar>::Vectorizable,
+ RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+
+ // FIXME: should depend on NumberOfRegisters
+ nr = 4,
+ mr = ResPacketSize,
+
+ LhsProgress = ResPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<RealScalar>::type RealPacket;
+ typedef typename packet_traits<Scalar>::type ScalarPacket;
+ typedef DoublePacket<RealPacket> DoublePacketType;
+
+ typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
+
+ EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
+ {
+ p.first = pset1<RealPacket>(RealScalar(0));
+ p.second = pset1<RealPacket>(RealScalar(0));
+ }
+
+ // Scalar path
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
+ {
+ dest = pset1<ResPacket>(*b);
+ }
+
+ // Vectorized path
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
+ {
+ dest.first = pset1<RealPacket>(real(*b));
+ dest.second = pset1<RealPacket>(imag(*b));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+ {
+ eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ // FIXME not sure that's the best way to implement it!
+ loadRhs(b+0, b0);
+ loadRhs(b+1, b1);
+ loadRhs(b+2, b2);
+ loadRhs(b+3, b3);
+ }
+
+ // Vectorized path
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+ {
+ // FIXME not sure that's the best way to implement it!
+ loadRhs(b+0, b0);
+ loadRhs(b+1, b1);
+ }
+
+ // Scalar path
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+ {
+ // FIXME not sure that's the best way to implement it!
+ loadRhs(b+0, b0);
+ loadRhs(b+1, b1);
+ }
+
+ // nothing special here
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+ }
+
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
+ {
+ c.first = padd(pmul(a,b.first), c.first);
+ c.second = padd(pmul(a,b.second),c.second);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+ {
+ c = cj.pmadd(a,b,c);
+ }
+
+ EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
+
+ EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ // assemble c
+ ResPacket tmp;
+ if((!ConjLhs)&&(!ConjRhs))
+ {
+ tmp = pcplxflip(pconj(ResPacket(c.second)));
+ tmp = padd(ResPacket(c.first),tmp);
+ }
+ else if((!ConjLhs)&&(ConjRhs))
+ {
+ tmp = pconj(pcplxflip(ResPacket(c.second)));
+ tmp = padd(ResPacket(c.first),tmp);
+ }
+ else if((ConjLhs)&&(!ConjRhs))
+ {
+ tmp = pcplxflip(ResPacket(c.second));
+ tmp = padd(pconj(ResPacket(c.first)),tmp);
+ }
+ else if((ConjLhs)&&(ConjRhs))
+ {
+ tmp = pcplxflip(ResPacket(c.second));
+ tmp = psub(pconj(ResPacket(c.first)),tmp);
+ }
+
+ r = pmadd(tmp,alpha,r);
+ }
+
+protected:
+ conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+};
+
+template<typename RealScalar, bool _ConjRhs>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+{
+public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef RealScalar LhsScalar;
+ typedef Scalar RhsScalar;
+ typedef Scalar ResScalar;
+
+ enum {
+ ConjLhs = false,
+ ConjRhs = _ConjRhs,
+ Vectorizable = packet_traits<RealScalar>::Vectorizable
+ && packet_traits<Scalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+ // FIXME: should depend on NumberOfRegisters
+ nr = 4,
+ mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
+
+ LhsProgress = ResPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = pset1<RhsPacket>(*b);
+ }
+
+ void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ pbroadcast4(b, b0, b1, b2, b3);
+ }
+
+// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+// {
+// // FIXME not sure that's the best way to implement it!
+// b0 = pload1<RhsPacket>(b+0);
+// b1 = pload1<RhsPacket>(b+1);
+// }
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploaddup<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploaddup<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+ {
+ madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+ {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c.v = pmadd(a,b.v,c.v);
+#else
+ tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+#endif
+
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+ {
+ c += a * b;
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = cj.pmadd(alpha,c,r);
+ }
+
+protected:
+ conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+};
+
+// helper for the rotating kernel below
+template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
+struct PossiblyRotatingKernelHelper
+{
+ // default implementation, not rotating
+
+ typedef typename GebpKernel::Traits Traits;
+ typedef typename Traits::RhsScalar RhsScalar;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::AccPacket AccPacket;
+
+ const Traits& traits;
+ EIGEN_ALWAYS_INLINE PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+
+ template <size_t K, size_t Index> EIGEN_ALWAYS_INLINE
+ void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+ {
+ traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
+ }
+
+ EIGEN_ALWAYS_INLINE void unrotateResult(AccPacket&,
+ AccPacket&,
+ AccPacket&,
+ AccPacket&)
+ {
+ }
+};
+
+// rotating implementation
+template <typename GebpKernel>
+struct PossiblyRotatingKernelHelper<GebpKernel, true>
+{
+ typedef typename GebpKernel::Traits Traits;
+ typedef typename Traits::RhsScalar RhsScalar;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::AccPacket AccPacket;
+
+ const Traits& traits;
+ EIGEN_ALWAYS_INLINE PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+ template <size_t K, size_t Index> EIGEN_ALWAYS_INLINE
+ void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+ {
+ if (Index == 0) {
+ to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
+ } else {
+ EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
+ to = protate<1>(to);
+ }
+ }
+
+ EIGEN_ALWAYS_INLINE void unrotateResult(AccPacket& res0,
+ AccPacket& res1,
+ AccPacket& res2,
+ AccPacket& res3)
+ {
+ PacketBlock<AccPacket> resblock;
+ resblock.packet[0] = res0;
+ resblock.packet[1] = res1;
+ resblock.packet[2] = res2;
+ resblock.packet[3] = res3;
+ ptranspose(resblock);
+ resblock.packet[3] = protate<1>(resblock.packet[3]);
+ resblock.packet[2] = protate<2>(resblock.packet[2]);
+ resblock.packet[1] = protate<3>(resblock.packet[1]);
+ ptranspose(resblock);
+ res0 = resblock.packet[0];
+ res1 = resblock.packet[1];
+ res2 = resblock.packet[2];
+ res3 = resblock.packet[3];
+ }
+};
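A short summary of what the rotating variant above buys (a reading of the code, not a statement from the Eigen documentation): for Index 0 the helper loads the full 4-wide rhs packet once, and for Index 1..3 it rotates that register by one lane instead of issuing three more broadcast loads, saving loads and register pressure on ARM. The price is that accumulator j then holds lane-wise products against a rotated rhs (lane i sees b_((i+j) mod 4), up to rotation direction), and unrotateResult undoes that permutation with its transpose / per-row protate / transpose sequence before the results are blended into the output.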
+
+/* optimized GEneral packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ * | A | B | comments
+ *   |real |cplx | no vectorization yet, would require packing A with duplication
+ * |cplx |real | easy vectorization
+ */
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel
+{
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
+ typedef typename Traits::AccPacket AccPacket;
+
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+ typedef typename SwappedTraits::ResScalar SResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+
+ typedef typename DataMapper::LinearMapper LinearMapper;
+
+ enum {
+ Vectorizable = Traits::Vectorizable,
+ LhsProgress = Traits::LhsProgress,
+ RhsProgress = Traits::RhsProgress,
+ ResPacketSize = Traits::ResPacketSize
+ };
+
+ EIGEN_DONT_INLINE
+ void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, ResScalar alpha,
+ Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+
+ static const bool UseRotatingKernel =
+ EIGEN_ARCH_ARM &&
+ internal::is_same<LhsScalar, float>::value &&
+ internal::is_same<RhsScalar, float>::value &&
+ internal::is_same<ResScalar, float>::value &&
+ Traits::LhsPacketSize == 4 &&
+ Traits::RhsPacketSize == 4 &&
+ Traits::ResPacketSize == 4;
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE
+void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+ ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, ResScalar alpha,
+ Index strideA, Index strideB, Index offsetA, Index offsetB)
+ {
+ Traits traits;
+ SwappedTraits straits;
+
+ if(strideA==-1) strideA = depth;
+ if(strideB==-1) strideB = depth;
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+ const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
+ const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
+ const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+ enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+ const Index peeled_kc = depth & ~(pk-1);
+ const Index prefetch_res_offset = 0;
+// const Index depth2 = depth & ~1;
+
+ //---------- Process 3 * LhsProgress rows at once ----------
+ // This corresponds to 3*LhsProgress x nr register blocks.
+  // Usually makes sense only with FMA
+ if(mr>=3*Traits::LhsProgress)
+ {
+ PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
+
+ // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
+ for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
+ {
+ // loops on each largest micro vertical panel of rhs (depth * nr)
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // We select a 3*Traits::LhsProgress x nr micro block of res which is entirely
+ // stored into 3 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+ LhsPacket A0, A1;
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3,
+ C4, C5, C6, C7,
+ C8, C9, C10, C11;
+ traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+ traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+ traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(0);
+ r1.prefetch(0);
+ r2.prefetch(0);
+ r3.prefetch(0);
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
+ RhsPacket B_0, T0;
+ LhsPacket A2;
+
+#define EIGEN_GEBP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ internal::prefetch(blA+(3*K+16)*LhsProgress); \
+ if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
+ traits.madd(A0, B_0, C0, T0); \
+ traits.madd(A1, B_0, C4, T0); \
+ traits.madd(A2, B_0, C8, B_0); \
+ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
+ traits.madd(A0, B_0, C1, T0); \
+ traits.madd(A1, B_0, C5, T0); \
+ traits.madd(A2, B_0, C9, B_0); \
+ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
+ traits.madd(A0, B_0, C2, T0); \
+ traits.madd(A1, B_0, C6, T0); \
+ traits.madd(A2, B_0, C10, B_0); \
+ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
+ traits.madd(A0, B_0, C3 , T0); \
+ traits.madd(A1, B_0, C7, T0); \
+ traits.madd(A2, B_0, C11, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+ } while(false)
+
+ internal::prefetch(blB);
+ EIGEN_GEBP_ONESTEP(0);
+ EIGEN_GEBP_ONESTEP(1);
+ EIGEN_GEBP_ONESTEP(2);
+ EIGEN_GEBP_ONESTEP(3);
+ EIGEN_GEBP_ONESTEP(4);
+ EIGEN_GEBP_ONESTEP(5);
+ EIGEN_GEBP_ONESTEP(6);
+ EIGEN_GEBP_ONESTEP(7);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*3*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
+ }
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, T0;
+ LhsPacket A2;
+ EIGEN_GEBP_ONESTEP(0);
+ blB += 4*RhsProgress;
+ blA += 3*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBP_ONESTEP
+
+ possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
+ possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
+ possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
+
+ ResPacket R0, R1, R2;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C8, alphav, R2);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r1.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r1.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C1, alphav, R0);
+ traits.acc(C5, alphav, R1);
+ traits.acc(C9, alphav, R2);
+ r1.storePacket(0 * Traits::ResPacketSize, R0);
+ r1.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C6, alphav, R1);
+ traits.acc(C10, alphav, R2);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r3.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r3.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C3, alphav, R0);
+ traits.acc(C7, alphav, R1);
+ traits.acc(C11, alphav, R2);
+ r3.storePacket(0 * Traits::ResPacketSize, R0);
+ r3.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ prefetch(&blB[0]);
+ // gets res block as register
+ AccPacket C0, C4, C8;
+ traits.initAcc(C0);
+ traits.initAcc(C4);
+ traits.initAcc(C8);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ r0.prefetch(0);
+ LhsPacket A0, A1, A2;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
+ RhsPacket B_0;
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0); \
+ traits.madd(A1, B_0, C4, B_0); \
+ traits.madd(A2, B_0, C8, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*RhsProgress;
+ blA += pk*3*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += 3*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0, R1, R2;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C8, alphav, R2);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
+ }
+ }
+ }
+
+ //---------- Process 2 * LhsProgress rows at once ----------
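+  // Rows in [peeled_mc3, peeled_mc2) are handled here: first against full 4-column panels
+  // of the packed rhs, then (further below) against any leftover rhs columns, one at a time.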
+ if(mr>=2*Traits::LhsProgress)
+ {
+ // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth)
+ for(Index i=peeled_mc3; i<peeled_mc2; i+=2*LhsProgress)
+ {
+ // loops on each largest micro vertical panel of rhs (depth * nr)
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // We select a 2*Traits::LhsProgress x nr micro block of res which is entirely
+ // stored into 2 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3,
+ C4, C5, C6, C7;
+ traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+ traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
+
+ LhsPacket A0, A1;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
+ RhsPacket B_0, B1, B2, B3, T0;
+
+          // The two asm comments in the #define are intended to prevent gcc
+          // from optimizing the code across steps, since it ends up spilling
+          // registers in this case.
+ #define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+ traits.madd(A0, B_0, C0, T0); \
+ traits.madd(A1, B_0, C4, B_0); \
+ traits.madd(A0, B1, C1, T0); \
+ traits.madd(A1, B1, C5, B1); \
+ traits.madd(A0, B2, C2, T0); \
+ traits.madd(A1, B2, C6, B2); \
+ traits.madd(A0, B3, C3, T0); \
+ traits.madd(A1, B3, C7, B3); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+ } while(false)
+
+ prefetch(&blB[pk*4*RhsProgress]);
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*(2*Traits::LhsProgress);
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
+ }
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, B1, B2, B3, T0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += 4*RhsProgress;
+ blA += 2*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+
+ ResPacket R0, R1, R2, R3;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r1.loadPacket(0 * Traits::ResPacketSize);
+ R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C1, alphav, R2);
+ traits.acc(C5, alphav, R3);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(0 * Traits::ResPacketSize, R2);
+ r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r3.loadPacket(0 * Traits::ResPacketSize);
+ R3 = r3.loadPacket(1 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C6, alphav, R1);
+ traits.acc(C3, alphav, R2);
+ traits.acc(C7, alphav, R3);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(0 * Traits::ResPacketSize, R2);
+ r3.storePacket(1 * Traits::ResPacketSize, R3);
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ prefetch(&blB[0]);
+
+ // gets res block as register
+ AccPacket C0, C4;
+ traits.initAcc(C0);
+ traits.initAcc(C4);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ r0.prefetch(prefetch_res_offset);
+ LhsPacket A0, A1;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+ RhsPacket B_0, B1;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B1); \
+ traits.madd(A1, B_0, C4, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*RhsProgress;
+ blA += pk*2*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, B1;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += 2*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0, R1;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ }
+ }
+ }
+ //---------- Process 1 * LhsProgress rows at once ----------
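+  // Rows in [peeled_mc2, peeled_mc1) are handled with a single lhs packet per step,
+  // accumulating a 1 x 4 block of packets (C0..C3) against 4-column rhs panels, then
+  // against any leftover rhs columns, one at a time.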
+ if(mr>=1*Traits::LhsProgress)
+ {
+ // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
+ for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
+ {
+ // loops on each largest micro vertical panel of rhs (depth * nr)
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
+ // stored into 1 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ traits.initAcc(C2);
+ traits.initAcc(C3);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
+ LhsPacket A0;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
+ RhsPacket B_0, B1, B2, B3;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+ traits.madd(A0, B_0, C0, B_0); \
+ traits.madd(A0, B1, C1, B1); \
+ traits.madd(A0, B2, C2, B2); \
+ traits.madd(A0, B3, C3, B3); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*1*LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
+ }
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, B1, B2, B3;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += 4*RhsProgress;
+ blA += 1*LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+
+ ResPacket R0, R1;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r1.loadPacket(0 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C1, alphav, R1);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r1.storePacket(0 * Traits::ResPacketSize, R1);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r3.loadPacket(0 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C3, alphav, R1);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r3.storePacket(0 * Traits::ResPacketSize, R1);
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ prefetch(&blB[0]);
+
+ // gets res block as register
+ AccPacket C0;
+ traits.initAcc(C0);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ LhsPacket A0;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
+ RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+      EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0); \
+      EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*RhsProgress;
+ blA += pk*1*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += 1*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ }
+ }
+ }
+ //---------- Process remaining rows, 1 by 1 ----------
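+  // Leftover rows (fewer than one LhsProgress) are handled one at a time. When the swapped
+  // traits expose a packet size that is a multiple of 4, the 1 x 4 result is computed by
+  // vectorizing along the packed rhs (the roles of lhs and rhs are swapped); otherwise a
+  // plain scalar loop based on CJMADD is used.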
+ for(Index i=peeled_mc1; i<rows; i+=1)
+ {
+ // loop on each panel of the rhs
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+
+ if( (SwappedTraits::LhsProgress % 4)==0 )
+ {
+        // NOTE: The following piece of code won't work for 512-bit registers
+ SAccPacket C0, C1, C2, C3;
+ straits.initAcc(C0);
+ straits.initAcc(C1);
+ straits.initAcc(C2);
+ straits.initAcc(C3);
+
+ const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
+ const Index endk = (depth/spk)*spk;
+ const Index endk4 = (depth/(spk*4))*(spk*4);
+
+ Index k=0;
+ for(; k<endk4; k+=4*spk)
+ {
+ prefetch(&blB[4*SwappedTraits::LhsProgress]);
+
+ SLhsPacket A0,A1,A2,A3;
+ SRhsPacket B_0,B_1,B_2,B_3;
+
+ straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
+ straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
+ straits.loadRhsQuad(blA+0*spk, B_0);
+ straits.loadRhsQuad(blA+1*spk, B_1);
+ straits.madd(A0,B_0,C0,B_0);
+ straits.madd(A1,B_1,C1,B_1);
+
+ straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A2);
+ straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A3);
+ straits.loadRhsQuad(blA+2*spk, B_2);
+ straits.loadRhsQuad(blA+3*spk, B_3);
+ straits.madd(A2,B_2,C2,B_2);
+ straits.madd(A3,B_3,C3,B_3);
+
+ blB += 4*SwappedTraits::LhsProgress;
+ blA += 4*spk;
+ }
+ C0 = padd(padd(C0,C1),padd(C2,C3));
+ for(; k<endk; k+=spk)
+ {
+ SLhsPacket A0;
+ SRhsPacket B_0;
+
+ straits.loadLhsUnaligned(blB, A0);
+ straits.loadRhsQuad(blA, B_0);
+ straits.madd(A0,B_0,C0,B_0);
+
+ blB += SwappedTraits::LhsProgress;
+ blA += spk;
+ }
+ if(SwappedTraits::LhsProgress==8)
+ {
+ // Special case where we have to first reduce the accumulation register C0
+ typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
+
+ SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
+ SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
+
+ if(depth-endk>0)
+ {
+ // We have to handle the last row of the rhs which corresponds to a half-packet
+ SLhsPacketHalf a0;
+ SRhsPacketHalf b0;
+ straits.loadLhsUnaligned(blB, a0);
+ straits.loadRhs(blA, b0);
+ SAccPacketHalf c0 = predux4(C0);
+ straits.madd(a0,b0,c0,b0);
+ straits.acc(c0, alphav, R);
+ }
+ else
+ {
+ straits.acc(predux4(C0), alphav, R);
+ }
+ res.scatterPacket(i, j2, R);
+ }
+ else
+ {
+ SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
+ SResPacket alphav = pset1<SResPacket>(alpha);
+ straits.acc(C0, alphav, R);
+ res.scatterPacket(i, j2, R);
+ }
+ }
+ else // scalar path
+ {
+ // get a 1 x 4 res block as registers
+ ResScalar C0(0), C1(0), C2(0), C3(0);
+
+ for(Index k=0; k<depth; k++)
+ {
+ LhsScalar A0 = blA[k];
+ RhsScalar B_0 = blB[0];
+ RhsScalar B_1 = blB[1];
+ CJMADD(cj,A0,B_0,C0, B_0);
+ CJMADD(cj,A0,B_1,C1, B_1);
+ RhsScalar B_2 = blB[2];
+ RhsScalar B_3 = blB[3];
+ CJMADD(cj,A0,B_2,C2, B_2);
+ CJMADD(cj,A0,B_3,C3, B_3);
+
+ blB += 4;
+ }
+ res(i, j2 + 0) += alpha * C0;
+ res(i, j2 + 1) += alpha * C1;
+ res(i, j2 + 2) += alpha * C2;
+ res(i, j2 + 3) += alpha * C3;
+ }
+ }
+
+ // remaining columns
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ // prefetch(blA);
+ // gets a 1 x 1 res block as registers
+ ResScalar C0(0);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ for(Index k=0; k<depth; k++)
+ {
+ LhsScalar A0 = blA[k];
+ RhsScalar B_0 = blB[k];
+ CJMADD(cj, A0, B_0, C0, B_0);
+ }
+ res(i, j2) += alpha * C0;
+ }
+ }
+ }
+
+
+#undef CJMADD
+
+// pack a block of the lhs
+// The traversal is as follows (mr==4):
+// 0 4 8 12 ...
+// 1 5 9 13 ...
+// 2 6 10 14 ...
+// 3 7 11 15 ...
+//
+// 16 20 24 28 ...
+// 17 21 25 29 ...
+// 18 22 26 30 ...
+// 19 23 27 31 ...
+//
+// 32 33 34 35 ...
+// 36 37 38 39 ...
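+//
+// Ignoring vectorization, conjugation and the PanelMode offsets, packing one group of
+// rows of width P (P = 3*PacketSize, 2*PacketSize, PacketSize, or Pack2, depending on how
+// many rows remain) boils down to the following scalar sketch:
+//
+//   for(Index k=0; k<depth; k++)
+//     for(Index w=0; w<P; w++)
+//       *blockA++ = lhs(i+w, k);   // P consecutive rows become contiguous for each k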
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+{
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ enum { PacketSize = packet_traits<Scalar>::size };
+
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+
+ const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+ const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
+ : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
+
+ Index i=0;
+
+ // Pack 3 packets
+ if(Pack1>=3*PacketSize)
+ {
+ if(PanelMode)
+ {
+ for(; i<peeled_mc3; i+=3*PacketSize)
+ {
+ blockA += (3*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A, B, C;
+ A = lhs.loadPacket(i+0*PacketSize, k);
+ B = lhs.loadPacket(i+1*PacketSize, k);
+ C = lhs.loadPacket(i+2*PacketSize, k);
+ pstore(blockA+0*PacketSize, cj.pconj(A));
+ pstore(blockA+1*PacketSize, cj.pconj(B));
+ pstore(blockA+2*PacketSize, cj.pconj(C));
+ blockA += 3*PacketSize;
+ }
+ blockA += (3*PacketSize) * (stride-offset-depth);
+ }
+ }
+ else
+ {
+ // Read the data from DRAM as sequentially as possible. We're writing to
+ // SRAM so the order of the writes shouldn't impact performance.
+ for(Index k=0; k<depth; k++)
+ {
+ Scalar* localBlockA = blockA + 3*PacketSize*k;
+ for(Index local_i = i; local_i<peeled_mc3; local_i+=3*PacketSize)
+ {
+ Packet A, B, C;
+ A = lhs.loadPacket(local_i+0*PacketSize, k);
+ B = lhs.loadPacket(local_i+1*PacketSize, k);
+ C = lhs.loadPacket(local_i+2*PacketSize, k);
+ pstore(localBlockA+0*PacketSize, cj.pconj(A));
+ pstore(localBlockA+1*PacketSize, cj.pconj(B));
+ pstore(localBlockA+2*PacketSize, cj.pconj(C));
+ localBlockA += 3*PacketSize*depth;
+ }
+ }
+ blockA += depth*peeled_mc3;
+ i = peeled_mc3;
+ }
+ }
+ // Pack 2 packets
+ if(Pack1>=2*PacketSize)
+ {
+ if(PanelMode)
+ {
+ for(; i<peeled_mc2; i+=2*PacketSize)
+ {
+ blockA += (2*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A, B;
+ A = lhs.loadPacket(i+0*PacketSize, k);
+ B = lhs.loadPacket(i+1*PacketSize, k);
+ pstore(blockA+0*PacketSize, cj.pconj(A));
+ pstore(blockA+1*PacketSize, cj.pconj(B));
+ blockA += 2*PacketSize;
+ }
+ blockA += (2*PacketSize) * (stride-offset-depth);
+ }
+ }
+ else
+ {
+ // Read the data from RAM as sequentially as possible.
+ for(Index k=0; k<depth; k++)
+ {
+ Scalar* localBlockA = blockA + 2*PacketSize*k;
+ for(Index local_i = i; local_i<peeled_mc2; local_i+=2*PacketSize)
+ {
+ Packet A, B;
+ A = lhs.loadPacket(local_i+0*PacketSize, k);
+ B = lhs.loadPacket(local_i+1*PacketSize, k);
+ pstore(localBlockA+0*PacketSize, cj.pconj(A));
+ pstore(localBlockA+1*PacketSize, cj.pconj(B));
+ localBlockA += 2*PacketSize*depth;
+ }
+ }
+ blockA += depth*(peeled_mc2-i);
+ i = peeled_mc2;
+ }
+ }
+  // Pack 1 packet
+ if(Pack1>=1*PacketSize)
+ {
+ if(PanelMode)
+ {
+ for(; i<peeled_mc1; i+=1*PacketSize)
+ {
+ blockA += (1*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A;
+ A = lhs.loadPacket(i+0*PacketSize, k);
+ pstore(blockA, cj.pconj(A));
+ blockA+=PacketSize;
+ }
+ blockA += (1*PacketSize) * (stride-offset-depth);
+ }
+ }
+ else
+ {
+ // Read the data from RAM as sequentially as possible.
+ for(Index k=0; k<depth; k++)
+ {
+ Scalar* localBlockA = blockA + PacketSize*k;
+ for(Index local_i = i; local_i<peeled_mc1; local_i+=1*PacketSize)
+ {
+ Packet A;
+ A = lhs.loadPacket(local_i+0*PacketSize, k);
+ pstore(localBlockA, cj.pconj(A));
+ localBlockA += PacketSize*depth;
+ }
+ }
+ blockA += depth*(peeled_mc1-i);
+ i = peeled_mc1;
+ }
+ }
+ // Pack scalars
+ if(Pack2<PacketSize && Pack2>1)
+ {
+ for(; i<peeled_mc0; i+=Pack2)
+ {
+ if (PanelMode) {
+ blockA += Pack2 * offset;
+ }
+
+ for(Index k=0; k<depth; k++) {
+ const LinearMapper dm0 = lhs.getLinearMapper(i, k);
+ for(Index w=0; w<Pack2; w++) {
+ *blockA = cj(dm0(w));
+ blockA += 1;
+ }
+ }
+
+ if(PanelMode) blockA += Pack2 * (stride-offset-depth);
+ }
+ }
+ for(; i<rows; i++)
+ {
+ if(PanelMode) blockA += offset;
+ for(Index k=0; k<depth; k++) {
+ *blockA = cj(lhs(i, k));
+ blockA += 1;
+ }
+ if(PanelMode) blockA += (stride-offset-depth);
+ }
+}
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+{
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ enum { PacketSize = packet_traits<Scalar>::size };
+
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+
+// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
+ int pack = Pack1;
+ Index i = 0;
+ while(pack>0)
+ {
+ Index remaining_rows = rows-i;
+ Index peeled_mc = i+(remaining_rows/pack)*pack;
+ for(; i<peeled_mc; i+=pack)
+ {
+ if(PanelMode) blockA += pack * offset;
+
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
+ Index k=0;
+ if(pack>=PacketSize)
+ {
+ for(; k<peeled_k; k+=PacketSize)
+ {
+ for (Index m = 0; m < pack; m += PacketSize)
+ {
+ PacketBlock<Packet> kernel;
+ for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
+ ptranspose(kernel);
+ for (int p = 0; p < PacketSize; ++p) pstore(blockA+m+(pack)*p, cj.pconj(kernel.packet[p]));
+ }
+ blockA += PacketSize*pack;
+ }
+ }
+ for(; k<depth; k++)
+ {
+ Index w=0;
+ for(; w<pack-3; w+=4)
+ {
+ Scalar a(cj(lhs(i+w+0, k))),
+ b(cj(lhs(i+w+1, k))),
+ c(cj(lhs(i+w+2, k))),
+ d(cj(lhs(i+w+3, k)));
+ blockA[0] = a;
+ blockA[1] = b;
+ blockA[2] = c;
+ blockA[3] = d;
+ blockA += 4;
+ }
+ if(pack%4)
+ for(;w<pack;++w) {
+ *blockA = cj(lhs(i+w, k));
+ blockA += 1;
+ }
+ }
+
+ if(PanelMode) blockA += pack * (stride-offset-depth);
+ }
+
+ pack -= PacketSize;
+ if(pack<Pack2 && (pack+PacketSize)!=Pack2)
+ pack = Pack2;
+ }
+
+ for(; i<rows; i++)
+ {
+ if(PanelMode) blockA += offset;
+ for(Index k=0; k<depth; k++) {
+ *blockA = cj(lhs(i, k));
+ blockA += 1;
+ }
+ if(PanelMode) blockA += (stride-offset-depth);
+ }
+}
+
+// copy a complete panel of the rhs
+// this version is optimized for column major matrices
+// The traversal order is as follows (nr==4):
+// 0 1 2 3 12 13 14 15 24 27
+// 4 5 6 7 16 17 18 19 25 28
+// 8 9 10 11 20 21 22 23 26 29
+// . . . . . . . . . .
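+//
+// Ignoring vectorization, conjugation and the PanelMode offsets, packing one group of
+// nr==4 columns boils down to the following scalar sketch (the last columns of the
+// diagram above are the remainder, handled one column at a time):
+//
+//   for(Index k=0; k<depth; k++)
+//     for(Index j=0; j<4; j++)
+//       *blockB++ = rhs(k, j2+j);   // the 4 column values are interleaved along k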
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ enum { PacketSize = packet_traits<Scalar>::size };
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
+// if(nr>=8)
+// {
+// for(Index j2=0; j2<packet_cols8; j2+=8)
+// {
+// // skip what we have before
+// if(PanelMode) count += 8 * offset;
+// const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+// const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+// const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+// const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+// const Scalar* b4 = &rhs[(j2+4)*rhsStride];
+// const Scalar* b5 = &rhs[(j2+5)*rhsStride];
+// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
+// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
+// Index k=0;
+//     if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+// {
+// for(; k<peeled_k; k+=PacketSize) {
+// PacketBlock<Packet> kernel;
+// for (int p = 0; p < PacketSize; ++p) {
+// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+// }
+// ptranspose(kernel);
+// for (int p = 0; p < PacketSize; ++p) {
+// pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+// count+=PacketSize;
+// }
+// }
+// }
+// for(; k<depth; k++)
+// {
+// blockB[count+0] = cj(b0[k]);
+// blockB[count+1] = cj(b1[k]);
+// blockB[count+2] = cj(b2[k]);
+// blockB[count+3] = cj(b3[k]);
+// blockB[count+4] = cj(b4[k]);
+// blockB[count+5] = cj(b5[k]);
+// blockB[count+6] = cj(b6[k]);
+// blockB[count+7] = cj(b7[k]);
+// count += 8;
+// }
+// // skip what we have after
+// if(PanelMode) count += 8 * (stride-offset-depth);
+// }
+// }
+
+ if(nr>=4)
+ {
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+ {
+ // skip what we have before
+ if(PanelMode) blockB += 4 * offset;
+
+ // TODO: each of these makes a copy of the stride :(
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+ const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+ const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+ const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+ Index k=0;
+ if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
+ {
+ for(; k<peeled_k; k+=PacketSize) {
+ PacketBlock<Packet, 4> kernel;
+ kernel.packet[0] = dm0.loadPacket(k);
+ kernel.packet[1] = dm1.loadPacket(k);
+ kernel.packet[2] = dm2.loadPacket(k);
+ kernel.packet[3] = dm3.loadPacket(k);
+ ptranspose(kernel);
+ pstoreu(blockB+0*PacketSize, cj.pconj(kernel.packet[0]));
+ pstoreu(blockB+1*PacketSize, cj.pconj(kernel.packet[1]));
+ pstoreu(blockB+2*PacketSize, cj.pconj(kernel.packet[2]));
+ pstoreu(blockB+3*PacketSize, cj.pconj(kernel.packet[3]));
+ blockB+=4*PacketSize;
+ }
+ }
+ for(; k<depth; k++)
+ {
+ blockB[0] = cj(dm0(k));
+ blockB[1] = cj(dm1(k));
+ blockB[2] = cj(dm2(k));
+ blockB[3] = cj(dm3(k));
+ blockB += 4;
+ }
+ // skip what we have after
+ if(PanelMode) blockB += 4 * (stride-offset-depth);
+ }
+ }
+
+ // copy the remaining columns one at a time (nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
+ if(PanelMode) blockB += offset;
+ for(Index k=0; k<depth; k++)
+ {
+ *blockB = cj(dm0(k));
+ blockB += 1;
+ }
+ if(PanelMode) blockB += (stride-offset-depth);
+ }
+}
+
+// this version is optimized for row major matrices
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ enum {
+ PacketSize = packet_traits<Scalar>::size,
+ HalfPacketSize = packet_traits<Scalar>::HasHalfPacket ? unpacket_traits<typename packet_traits<Scalar>::half>::size : 0
+ };
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+
+// if(nr>=8)
+// {
+// for(Index j2=0; j2<packet_cols8; j2+=8)
+// {
+// // skip what we have before
+// if(PanelMode) count += 8 * offset;
+// for(Index k=0; k<depth; k++)
+// {
+// if (PacketSize==8) {
+// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+// pstoreu(blockB+count, cj.pconj(A));
+// } else if (PacketSize==4) {
+// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+// pstoreu(blockB+count, cj.pconj(A));
+// pstoreu(blockB+count+PacketSize, cj.pconj(B));
+// } else {
+// const Scalar* b0 = &rhs[k*rhsStride + j2];
+// blockB[count+0] = cj(b0[0]);
+// blockB[count+1] = cj(b0[1]);
+// blockB[count+2] = cj(b0[2]);
+// blockB[count+3] = cj(b0[3]);
+// blockB[count+4] = cj(b0[4]);
+// blockB[count+5] = cj(b0[5]);
+// blockB[count+6] = cj(b0[6]);
+// blockB[count+7] = cj(b0[7]);
+// }
+// count += 8;
+// }
+// // skip what we have after
+// if(PanelMode) count += 8 * (stride-offset-depth);
+// }
+// }
+ if(nr>=4)
+ {
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+ {
+ // skip what we have before
+ if(PanelMode) blockB += 4 * offset;
+ for(Index k=0; k<depth; k++)
+ {
+ if (PacketSize==4) {
+ Packet A = rhs.loadPacket(k, j2);
+ pstore(blockB, cj.pconj(A));
+ blockB += PacketSize;
+ }
+ else if (HalfPacketSize==4) {
+ HalfPacket A = rhs.loadHalfPacket(k, j2);
+ pstore<Scalar, HalfPacket>(blockB, cj.pconj(A));
+ blockB += HalfPacketSize;
+ }
+ else {
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+ blockB[0] = cj(dm0(0));
+ blockB[1] = cj(dm0(1));
+ blockB[2] = cj(dm0(2));
+ blockB[3] = cj(dm0(3));
+ blockB += 4;
+ }
+ }
+ // skip what we have after
+ if(PanelMode) blockB += 4 * (stride-offset-depth);
+ }
+ }
+ // copy the remaining columns one at a time (nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ if(PanelMode) blockB += offset;
+ for(Index k=0; k<depth; k++)
+ {
+ *blockB = cj(rhs(k, j2));
+ blockB += 1;
+ }
+ if(PanelMode) blockB += stride-offset-depth;
+ }
+}
+
+} // end namespace internal
+
+/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l1CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l1;
+}
+
+/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l2CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l2;
+}
+
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l3CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l3;
+}
+
+/** Set the cpu L1, L2, and L3 cache sizes (in bytes).
+ * These values are used to adjust the size of the blocks
+ * for the blocked algorithms.
+ *
+ * \sa computeProductBlockingSizes */
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
+{
+ internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
+}
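+
+// Usage sketch (the cache sizes below are only an example for a hypothetical CPU):
+//   std::ptrdiff_t l1 = Eigen::l1CacheSize();                 // query the current settings
+//   std::ptrdiff_t l2 = Eigen::l2CacheSize();
+//   std::ptrdiff_t l3 = Eigen::l3CacheSize();
+//   Eigen::setCpuCacheSizes(32*1024, 256*1024, 8*1024*1024);  // override L1/L2/L3 (in bytes)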
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_BLOCK_PANEL_H
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h
new file mode 100644
index 0000000000..c3715b1a39
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -0,0 +1,465 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
+
+/* Specialization for a row-major destination matrix => simple transposition of the product */
+template<
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
+{
+ typedef gebp_traits<RhsScalar,LhsScalar> Traits;
+
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols, Index depth,
+ const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsStride,
+ ResScalar* res, Index resStride,
+ ResScalar alpha,
+ level3_blocking<RhsScalar,LhsScalar>& blocking,
+ GemmParallelInfo<Index>* info = 0)
+ {
+ // transpose the product such that the result is column major
+ general_matrix_matrix_product<Index,
+ RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+ LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+ ColMajor>
+ ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
+ }
+};
+
+/* Specialization for a col-major destination matrix
+ * => Blocking algorithm following Goto's paper */
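+/* Schematically, the sequential path below performs:
+ *   for each mc x kc block A' of the lhs (i2 and k2 loops):
+ *     pack A' into blockA                       (contiguous, cache-friendly storage)
+ *     for each kc x nc block B' of the rhs (j2 loop):
+ *       pack B' into blockB                     (re-packed only when needed)
+ *       res(i2..,j2..) += alpha * A' * B'       (gebp kernel)
+ */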
+template<
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
+{
+
+typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+static void run(Index rows, Index cols, Index depth,
+ const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride,
+ ResScalar* _res, Index resStride,
+ ResScalar alpha,
+ level3_blocking<LhsScalar,RhsScalar>& blocking,
+ GemmParallelInfo<Index>* info = 0)
+{
+ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
+
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+
+#ifdef EIGEN_HAS_OPENMP
+ if(info)
+ {
+ // this is the parallel version!
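+    // Each thread packs its own horizontal slice A'_i of the lhs into the shared blockA and
+    // works with a private copy of B'; the info[] array carries the per-thread synchronization
+    // state (the 'users' counter and the 'sync' index tested below).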
+ Index tid = omp_get_thread_num();
+ Index threads = omp_get_num_threads();
+
+ LhsScalar* blockA = blocking.blockA();
+ eigen_internal_assert(blockA!=0);
+
+ std::size_t sizeB = kc*nc;
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
+
+ // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
+ for(Index k=0; k<depth; k+=kc)
+ {
+      const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B' and cols of A'
+
+ // In order to reduce the chance that a thread has to wait for the other,
+ // let's start by packing B'.
+ pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
+
+ // Pack A_k to A' in a parallel fashion:
+ // each thread packs the sub block A_k,i to A'_i where i is the thread id.
+
+ // However, before copying to A'_i, we have to make sure that no other thread is still using it,
+ // i.e., we test that info[tid].users equals 0.
+ // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
+ while(info[tid].users!=0) {}
+ info[tid].users += threads;
+
+ pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
+
+ // Notify the other threads that the part A'_i is ready to go.
+ info[tid].sync = k;
+
+ // Computes C_i += A' * B' per A'_i
+ for(Index shift=0; shift<threads; ++shift)
+ {
+ Index i = (tid+shift)%threads;
+
+        // At this point we have to make sure that A'_i has been updated by thread i;
+        // the busy wait on info[i].sync mimics a volatile access to that update.
+ // However, no need to wait for the B' part which has been updated by the current thread!
+ if (shift>0) {
+ while(info[i].sync!=k) {
+ }
+ }
+
+ gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+ }
+
+ // Then keep going as usual with the remaining B'
+ for(Index j=nc; j<cols; j+=nc)
+ {
+ const Index actual_nc = (std::min)(j+nc,cols)-j;
+
+ // pack B_k,j to B'
+ pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
+
+ // C_j += A' * B'
+ gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
+ }
+
+ // Release all the sub blocks A'_i of A' for the current thread,
+ // i.e., we simply decrement the number of users by 1
+ #pragma omp critical
+ {
+ for(Index i=0; i<threads; ++i)
+ #pragma omp atomic
+ --(info[i].users);
+ }
+ }
+ }
+ else
+#endif // EIGEN_HAS_OPENMP
+ {
+ EIGEN_UNUSED_VARIABLE(info);
+
+ // this is the sequential version!
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*nc;
+
+ ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+ const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
+
+ // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+
+ for(Index k2=0; k2<depth; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+ // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+ // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+ // Note that this panel will be read as many times as the number of blocks in the rhs's
+ // horizontal panel which is, in practice, a very low number.
+ pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+ // For each kc x nc block of the rhs's horizontal panel...
+ for(Index j2=0; j2<cols; j2+=nc)
+ {
+ const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+ // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panels of the packed lhs panel (e.g., rows/12 times).
+ if((!pack_rhs_once) || i2==0)
+ pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+
+ // Everything is packed, we can now call the panel * block kernel:
+ gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+ }
+ }
+ }
+ }
+}
+
+};
+
+/*********************************************************************************
+* Specialization of GeneralProduct<> for "large" GEMM, i.e.,
+* implementation of the high level wrapper to general_matrix_matrix_product
+**********************************************************************************/
+
+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
+ : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
+{};
+
+template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
+struct gemm_functor
+{
+ gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
+ : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
+ {}
+
+ void initParallelSession() const
+ {
+ m_blocking.allocateA();
+ }
+
+ void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
+ {
+ if(cols==-1)
+ cols = m_rhs.cols();
+
+ Gemm::run(rows, cols, m_lhs.cols(),
+ /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+ /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+ (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
+ m_actualAlpha, m_blocking, info);
+ }
+
+ typedef typename Gemm::Traits Traits;
+
+ protected:
+ const Lhs& m_lhs;
+ const Rhs& m_rhs;
+ Dest& m_dest;
+ Scalar m_actualAlpha;
+ BlockingType& m_blocking;
+};
+
+template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1,
+bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space;
+
+template<typename _LhsScalar, typename _RhsScalar>
+class level3_blocking
+{
+ typedef _LhsScalar LhsScalar;
+ typedef _RhsScalar RhsScalar;
+
+ protected:
+ LhsScalar* m_blockA;
+ RhsScalar* m_blockB;
+
+ DenseIndex m_mc;
+ DenseIndex m_nc;
+ DenseIndex m_kc;
+
+ public:
+
+ level3_blocking()
+ : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
+ {}
+
+ inline DenseIndex mc() const { return m_mc; }
+ inline DenseIndex nc() const { return m_nc; }
+ inline DenseIndex kc() const { return m_kc; }
+
+ inline LhsScalar* blockA() { return m_blockA; }
+ inline RhsScalar* blockB() { return m_blockB; }
+};
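+
+// Two specializations of gemm_blocking_space follow: when all maximal dimensions are known at
+// compile time the blocks live in fixed-size, suitably aligned member arrays; otherwise the
+// block sizes are computed at runtime (computeProductBlockingSizes) and the buffers are
+// heap-allocated on demand by allocateA()/allocateB().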
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+ : public level3_blocking<
+ typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+ typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+ enum {
+ Transpose = StorageOrder==RowMajor,
+ ActualRows = Transpose ? MaxCols : MaxRows,
+ ActualCols = Transpose ? MaxRows : MaxCols
+ };
+ typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+ typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ enum {
+ SizeA = ActualRows * MaxDepth,
+ SizeB = ActualCols * MaxDepth
+ };
+
+ EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
+ EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];
+
+ public:
+
+ gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
+ {
+ this->m_mc = ActualRows;
+ this->m_nc = ActualCols;
+ this->m_kc = MaxDepth;
+ this->m_blockA = m_staticA;
+ this->m_blockB = m_staticB;
+ }
+
+ inline void allocateA() {}
+ inline void allocateB() {}
+ inline void allocateAll() {}
+};
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false>
+ : public level3_blocking<
+ typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+ typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+ enum {
+ Transpose = StorageOrder==RowMajor
+ };
+ typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+ typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+ DenseIndex m_sizeA;
+ DenseIndex m_sizeB;
+
+ public:
+
+ gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, DenseIndex num_threads, bool l3_blocking)
+ {
+ this->m_mc = Transpose ? cols : rows;
+ this->m_nc = Transpose ? rows : cols;
+ this->m_kc = depth;
+
+ if(l3_blocking)
+ {
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+ }
+ else // no l3 blocking
+ {
+ DenseIndex m = this->m_mc;
+ DenseIndex n = this->m_nc;
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
+ }
+
+ m_sizeA = this->m_mc * this->m_kc;
+ m_sizeB = this->m_kc * this->m_nc;
+ }
+
+ void allocateA()
+ {
+ if(this->m_blockA==0)
+ this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
+ }
+
+ void allocateB()
+ {
+ if(this->m_blockB==0)
+ this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
+ }
+
+ void allocateAll()
+ {
+ allocateA();
+ allocateB();
+ }
+
+ ~gemm_blocking_space()
+ {
+ aligned_delete(this->m_blockA, m_sizeA);
+ aligned_delete(this->m_blockB, m_sizeB);
+ }
+};
+
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, GemmProduct>
+ : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
+{
+ enum {
+ MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+ };
+ public:
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+ typedef Scalar ResScalar;
+
+ GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+ {
+ typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
+ EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
+ }
+
+ template<typename Dest>
+ inline void evalTo(Dest& dst) const
+ {
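+      // For very small products (sum of the inner and destination dimensions below 20), the
+      // coefficient-based lazy product is used instead of the blocked GEMM path, whose setup
+      // cost would dominate.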
+ if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+ dst.noalias() = m_lhs .lazyProduct( m_rhs );
+ else
+ {
+ dst.setZero();
+ scaleAndAddTo(dst,Scalar(1));
+ }
+ }
+
+ template<typename Dest>
+ inline void addTo(Dest& dst) const
+ {
+ if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+ dst.noalias() += m_lhs .lazyProduct( m_rhs );
+ else
+ scaleAndAddTo(dst,Scalar(1));
+ }
+
+ template<typename Dest>
+ inline void subTo(Dest& dst) const
+ {
+ if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+ dst.noalias() -= m_lhs .lazyProduct( m_rhs );
+ else
+ scaleAndAddTo(dst,Scalar(-1));
+ }
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+ * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+ typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+ Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+
+ typedef internal::gemm_functor<
+ Scalar, Index,
+ internal::general_matrix_matrix_product<
+ Index,
+ LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+ RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+ (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
+ _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
+
+ BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+
+ internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
new file mode 100644
index 0000000000..e4c10e88d1
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -0,0 +1,285 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update;
+
+namespace internal {
+
+/**********************************************************************
+* This file implements a general A * B product while
+* evaluating only one triangular part of the product.
+* This is a more general version of the self-adjoint product (C += A A^T),
+* cf. the level 3 SYRK BLAS routine.
+**********************************************************************/
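+
+// Typical user-level expressions that route into this machinery (illustrative only):
+//   MatrixXd A(n,k), B(k,n), C(n,n);
+//   C.triangularView<Lower>() += alpha * A * B;        // only the lower triangle is updated
+//   C.selfadjointView<Lower>().rankUpdate(A, alpha);   // SYRK-like: C += alpha * A * A^T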
+
+// forward declarations (defined at the end of this file)
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+struct tribb_kernel;
+
+/* Optimized matrix-matrix product evaluating only one triangular half */
+template <typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder, int UpLo, int Version = Specialized>
+struct general_matrix_matrix_triangular_product;
+
+// as usual if the result is row major => we transpose the product
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+ {
+ general_matrix_matrix_triangular_product<Index,
+ RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+ LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+ ColMajor, UpLo==Lower?Upper:Lower>
+ ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+ }
+};
+
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
+ {
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = depth; // cache block size along the K direction
+ Index mc = size; // cache block size along the M direction
+ Index nc = size; // cache block size along the N direction
+ computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, Index(1));
+ // !!! mc must be a multiple of nr:
+ if(mc > Traits::nr)
+ mc = (mc/Traits::nr)*Traits::nr;
+
+ ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
+
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+ tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
+
+ for(Index k2=0; k2<depth; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+ // note that the actual rhs is the transpose/adjoint of mat
+ pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
+
+ for(Index i2=0; i2<size; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,size)-i2;
+
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        // the selected actual_mc * size panel of res is split into three different parts:
+ // 1 - before the diagonal => processed with gebp or skipped
+ // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
+ // 3 - after the diagonal => processed with gebp or skipped
+ if (UpLo==Lower)
+ gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
+ (std::min)(size,i2), alpha, -1, -1, 0, 0);
+
+
+ sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+
+ if (UpLo==Upper)
+ {
+ Index j2 = i2+actual_mc;
+ gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
+ actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+};
+
+// Optimized packed Block * packed Block product kernel evaluating only one given triangular part
+// This kernel is built on top of the gebp kernel:
+// - the current destination block is processed per panel of actual_mc x BlockSize
+// where BlockSize is set to the minimal value allowing gebp to be as fast as possible
+// - then, as usual, each panel is split into three parts along the diagonal,
+// the sub blocks above and below the diagonal are processed as usual,
+// while the triangular block overlapping the diagonal is evaluated into a
+// small temporary buffer which is then accumulated into the result using a
+// triangular traversal.
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+struct tribb_kernel
+{
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+
+ enum {
+ BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
+ };
+ void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+ {
+ typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
+ ResMapper res(_res, resStride);
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+
+ Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
+
+ // let's process the block per panel of actual_mc x BlockSize,
+ // again, each is split into three parts, etc.
+ for (Index j=0; j<size; j+=BlockSize)
+ {
+ Index actualBlockSize = std::min<Index>(BlockSize,size - j);
+ const RhsScalar* actual_b = blockB+j*depth;
+
+ if(UpLo==Upper)
+ gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+ -1, -1, 0, 0);
+
+ // selfadjoint micro block
+ {
+ Index i = j;
+ buffer.setZero();
+ // 1 - apply the kernel on the temporary buffer
+ gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+ -1, -1, 0, 0);
+ // 2 - triangular accumulation
+ for(Index j1=0; j1<actualBlockSize; ++j1)
+ {
+ ResScalar* r = &res(i, j + j1);
+ for(Index i1=UpLo==Lower ? j1 : 0;
+ UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
+ r[i1] += buffer(i1,j1);
+ }
+ }
+
+ if(UpLo==Lower)
+ {
+ Index i = j+actualBlockSize;
+ gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
+ depth, actualBlockSize, alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+};
+
+} // end namespace internal
+
+// high level API
+
+template<typename MatrixType, typename ProductType, int UpLo, bool IsOuterProduct>
+struct general_product_to_triangular_selector;
+
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
+{
+ static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+
+ typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+ typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+ typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+
+ typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+ typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+ typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+ enum {
+ StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+ UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
+ UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1
+ };
+
+ internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs;
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(),
+ (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
+ if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
+
+ internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs;
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(),
+ (UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
+ if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+
+
+ selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+ LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>
+ ::run(actualLhs.size(), mat.data(), mat.outerStride(), actualLhsPtr, actualRhsPtr, actualAlpha);
+ }
+};
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
+{
+ static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Index Index;
+
+ typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+ typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+ typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+
+ typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+ typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+ typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+ internal::general_matrix_matrix_triangular_product<Index,
+ typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+ typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+ MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
+ ::run(mat.cols(), actualLhs.cols(),
+ &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
+ mat.data(), mat.outerStride(), actualAlpha);
+ }
+};
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename ProductDerived, typename _Lhs, typename _Rhs>
+TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
+{
+ eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
+
+ general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
+
+ return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
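
The assignProduct path above is the entry point for rank updates that only touch one triangular half of the destination. A minimal usage sketch follows (assuming the standard public Eigen API; the matrix sizes and variable names are illustrative only):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd C = Eigen::MatrixXd::Zero(4, 4);   // self-adjoint result, lower half used
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 6);

      // Rank-k update C(lower) += 2 * A * A^T. Expressions of this kind are the
      // ones served by general_matrix_matrix_triangular_product: only the
      // requested (here: lower) triangular part of the product is evaluated.
      C.selfadjointView<Eigen::Lower>().rankUpdate(A, 2.0);

      std::cout << C << std::endl;
      return 0;
    }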
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
new file mode 100644
index 0000000000..3deed068e3
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
@@ -0,0 +1,146 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Level 3 BLAS SYRK/HERK implementation.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+struct general_matrix_matrix_rankupdate :
+ general_matrix_matrix_triangular_product<
+ Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
+struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
+ Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
+ const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
+ { \
+ if (lhs==rhs) { \
+ general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
+ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+ } else { \
+ general_matrix_matrix_triangular_product<Index, \
+ Scalar, LhsStorageOrder, ConjugateLhs, \
+ Scalar, RhsStorageOrder, ConjugateRhs, \
+ ColMajor, UpLo, BuiltIn> \
+ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+ } \
+ } \
+};
+
+EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
+//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
+EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
+//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
+
+// SYRK for float/double
+#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+ enum { \
+ IsLower = (UpLo&Lower) == Lower, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
+ }; \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+ { \
+ /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
+\
+ MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
+ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
+ MKLTYPE alpha_, beta_; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+ MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
+ } \
+};
+
+// HERK for complex data
+#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+ enum { \
+ IsLower = (UpLo&Lower) == Lower, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
+ }; \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+ { \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
+\
+ MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
+ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
+ RTYPE alpha_, beta_; \
+ const EIGTYPE* a_ptr; \
+\
+/* Set alpha_ & beta_ */ \
+/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
+/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
+ alpha_ = alpha.real(); \
+ beta_ = 1.0; \
+/* Copy with conjugation in some cases*/ \
+ MatrixType a; \
+ if (conjA) { \
+ Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
+ a = mapA.conjugate(); \
+ lda = a.outerStride(); \
+ a_ptr = a.data(); \
+ } else a_ptr=lhs; \
+ MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
+ } \
+};
+
+
+EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk)
+
+//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
+//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
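
When the two operands alias (lhs==rhs), the specialization above forwards to ?syrk with beta fixed to 1 so the update accumulates into res. A plain-loop sketch of the semantics of the dsyrk call emitted for uplo='L', trans='N' (a reference for clarity, not the code path actually taken):

    // C := alpha*A*A^T + beta*C, lower triangle only; column-major storage with
    // leading dimensions lda and ldc.
    void syrk_lower_notrans_ref(int n, int k, double alpha, const double* A, int lda,
                                double beta, double* C, int ldc) {
      for (int j = 0; j < n; ++j)
        for (int i = j; i < n; ++i) {          // only i >= j is written
          double acc = 0.0;
          for (int p = 0; p < k; ++p)
            acc += A[i + p * lda] * A[j + p * lda];
          C[i + j * ldc] = beta * C[i + j * ldc] + alpha * acc;
        }
    }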
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
new file mode 100644
index 0000000000..060af328eb
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
@@ -0,0 +1,118 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * General matrix-matrix product functionality based on ?GEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-matrix multiplication using BLAS
+* gemm function via partial specialization of
+* general_matrix_matrix_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemm specialization
+
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
+template< \
+ typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+static void run(Index rows, Index cols, Index depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha, \
+ level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
+ GemmParallelInfo<Index>* /*info = 0*/) \
+{ \
+ using std::conj; \
+\
+ char transa, transb; \
+ MKL_INT m, n, k, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX a_tmp, b_tmp; \
+ EIGTYPE myone(1);\
+\
+/* Set transpose options */ \
+ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+ transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+ k = (MKL_INT)depth; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)lhsStride; \
+ ldb = (MKL_INT)rhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
+ a_tmp = lhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else a = _lhs; \
+\
+ if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.conjugate(); \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } else b = _rhs; \
+\
+ MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+}};
+
+GEMM_SPECIALIZATION(double, d, double, d)
+GEMM_SPECIALIZATION(float, f, float, s)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
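
These specializations only come into play when the MKL bindings are compiled in. A minimal sketch (assuming MKL headers and libraries are available at build time and the documented EIGEN_USE_MKL_ALL switch is used):

    // Define before including Eigen so the GEMM_SPECIALIZATION above replaces the
    // built-in kernels with calls to ?gemm.
    #define EIGEN_USE_MKL_ALL
    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 256);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(256, 384);
      Eigen::MatrixXd C = A * B;   // dispatched to dgemm through the binding above
      return C.size() == 512 * 384 ? 0 : 1;
    }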
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h
new file mode 100644
index 0000000000..cb67d5d0a9
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -0,0 +1,618 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes 4 columns at once, which allows us both to reduce
+ * the number of loads/stores of the result by a factor of 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic: C += alpha * A * B
+ * | A | B |alpha| comments
+ * |real |cplx |cplx | no vectorization
+ * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
+ * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
+ * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * Accesses to the matrix coefficients follow the following logic:
+ *
+ * - if all columns have the same alignment then
+ * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
+ * - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise
+ * - if even columns have the same alignment then
+ * // odd columns are guaranteed to have the same alignment too
+ * - if even or odd columns have the same alignment as the result, then
+ *      // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
+ * - perform half aligned and half unaligned loads (-> EvenAligned case)
+ * - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
+ *    - one out of 4 consecutive columns is guaranteed to be aligned with the result vector,
+ *      perform simple aligned loads for this column and aligned loads plus re-alignment for the others. (-> FirstAligned case)
+ * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
+ * - otherwise,
+ * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
+ * // we currently fall back to the NoneAligned case
+ *
+ * The same reasoning applies to the transposed case.
+ *
+ * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
+ * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
+ * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8-byte boundary are not too slow
+ * compared to unaligned loads on a 4-byte boundary.
+ *
+ */
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+enum {
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
+ && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+};
+
+typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+EIGEN_DONT_INLINE static void run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ RhsScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ RhsScalar alpha)
+{
+ EIGEN_UNUSED_VARIABLE(resIncr);
+ eigen_internal_assert(resIncr==1);
+ #ifdef _EIGEN_ACCUMULATE_PACKETS
+ #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+ #endif
+ #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
+ pstore(&res[j], \
+ padd(pload<ResPacket>(&res[j]), \
+ padd( \
+ padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
+ pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
+ padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
+ pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
+
+ typedef typename LhsMapper::VectorMapper LhsScalars;
+
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+ if(ConjugateRhs)
+ alpha = numext::conj(alpha);
+
+ enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
+ const Index columnsAtOnce = 4;
+ const Index peels = 2;
+ const Index LhsPacketAlignedMask = LhsPacketSize-1;
+ const Index ResPacketAlignedMask = ResPacketSize-1;
+// const Index PeelAlignedMask = ResPacketSize*peels-1;
+ const Index size = rows;
+
+ const Index lhsStride = lhs.stride();
+
+ // How many coeffs of the result do we have to skip to be aligned.
+ // Here we assume data are at least aligned on the base scalar type.
+ Index alignedStart = internal::first_aligned(res,size);
+ Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
+ const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+
+ const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
+ Index alignmentPattern = alignmentStep==0 ? AllAligned
+ : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+ : FirstAligned;
+
+ // we cannot assume the first element is aligned because of sub-matrices
+ const Index lhsAlignmentOffset = lhs.firstAligned(size);
+
+  // find how many columns we have to skip to be aligned with the result (if possible)
+ Index skipColumns = 0;
+ // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+ if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
+ {
+ alignedSize = 0;
+ alignedStart = 0;
+ alignmentPattern = NoneAligned;
+ }
+ else if(LhsPacketSize > 4)
+ {
+ // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+ // Currently, it seems to be better to perform unaligned loads anyway
+ alignmentPattern = NoneAligned;
+ }
+ else if (LhsPacketSize>1)
+ {
+ // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
+
+ while (skipColumns<LhsPacketSize &&
+ alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
+ ++skipColumns;
+ if (skipColumns==LhsPacketSize)
+ {
+ // nothing can be aligned, no need to skip any column
+ alignmentPattern = NoneAligned;
+ skipColumns = 0;
+ }
+ else
+ {
+ skipColumns = (std::min)(skipColumns,cols);
+      // note that the skipped columns are processed later.
+ }
+
+ /* eigen_internal_assert( (alignmentPattern==NoneAligned)
+ || (skipColumns + columnsAtOnce >= cols)
+ || LhsPacketSize > size
+ || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
+ }
+ else if(Vectorizable)
+ {
+ alignedStart = 0;
+ alignedSize = size;
+ alignmentPattern = AllAligned;
+ }
+
+ const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+ const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+
+ Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
+ for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
+ {
+ RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
+ ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
+ ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
+ ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
+
+    // this helps the compiler a lot in generating better binary code
+ const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
+ lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
+
+ if (Vectorizable)
+ {
+ /* explicit vectorization */
+ // process initial unaligned coeffs
+ for (Index j=0; j<alignedStart; ++j)
+ {
+ res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+ res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+ res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+ res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+ }
+
+ if (alignedSize>alignedStart)
+ {
+ switch(alignmentPattern)
+ {
+ case AllAligned:
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
+ break;
+ case EvenAligned:
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
+ break;
+ case FirstAligned:
+ {
+ Index j = alignedStart;
+ if(peels>1)
+ {
+ LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
+ ResPacket T0, T1;
+
+ A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+ A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+ A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
+
+ for (; j<peeledSize; j+=peels*ResPacketSize)
+ {
+ A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
+ A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
+ A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
+
+ A00 = lhs0.template load<LhsPacket, Aligned>(j);
+ A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
+ T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
+ T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
+
+ T0 = pcj.pmadd(A01, ptmp1, T0);
+ A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
+ T0 = pcj.pmadd(A02, ptmp2, T0);
+ A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
+ T0 = pcj.pmadd(A03, ptmp3, T0);
+ pstore(&res[j],T0);
+ A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
+ T1 = pcj.pmadd(A11, ptmp1, T1);
+ T1 = pcj.pmadd(A12, ptmp2, T1);
+ T1 = pcj.pmadd(A13, ptmp3, T1);
+ pstore(&res[j+ResPacketSize],T1);
+ }
+ }
+ for (; j<alignedSize; j+=ResPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
+ break;
+ }
+ default:
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
+ break;
+ }
+ }
+ } // end explicit vectorization
+
+ /* process remaining coeffs (or all if there is no explicit vectorization) */
+ for (Index j=alignedSize; j<size; ++j)
+ {
+ res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+ res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+ res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+ res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+ }
+ }
+
+ // process remaining first and last columns (at most columnsAtOnce-1)
+ Index end = cols;
+ Index start = columnBound;
+ do
+ {
+ for (Index k=start; k<end; ++k)
+ {
+ RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
+ const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
+
+ if (Vectorizable)
+ {
+ /* explicit vectorization */
+ // process first unaligned result's coeffs
+ for (Index j=0; j<alignedStart; ++j)
+ res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
+ // process aligned result's coeffs
+ if (lhs0.template aligned<LhsPacket>(alignedStart))
+ for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+ pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+ else
+ for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+ pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+ }
+
+ // process remaining scalars (or all if no explicit vectorization)
+ for (Index i=alignedSize; i<size; ++i)
+ res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
+ }
+ if (skipColumns)
+ {
+ start = 0;
+ end = skipColumns;
+ skipColumns = 0;
+ }
+ else
+ break;
+ } while(Vectorizable);
+ #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
+/* Optimized row-major matrix * vector product:
+ * This algorithm processes 4 rows at once, which allows us both to reduce
+ * the number of loads/stores of the result by a factor of 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic:
+ * - alpha is always a complex (or converted to a complex)
+ * - no vectorization
+ */
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
+{
+typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+enum {
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
+ && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+};
+
+typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+EIGEN_DONT_INLINE static void run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ ResScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ ResScalar alpha)
+{
+ eigen_internal_assert(rhs.stride()==1);
+
+ #ifdef _EIGEN_ACCUMULATE_PACKETS
+ #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+ #endif
+
+ #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
+ RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
+ ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
+ ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
+ ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
+
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+
+ typedef typename LhsMapper::VectorMapper LhsScalars;
+
+ enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
+ const Index rowsAtOnce = 4;
+ const Index peels = 2;
+ const Index RhsPacketAlignedMask = RhsPacketSize-1;
+ const Index LhsPacketAlignedMask = LhsPacketSize-1;
+ const Index depth = cols;
+ const Index lhsStride = lhs.stride();
+
+ // How many coeffs of the result do we have to skip to be aligned.
+ // Here we assume data are at least aligned on the base scalar type
+ // if that's not the case then vectorization is discarded, see below.
+ Index alignedStart = rhs.firstAligned(depth);
+ Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
+ const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+
+ const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
+ Index alignmentPattern = alignmentStep==0 ? AllAligned
+ : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+ : FirstAligned;
+
+ // we cannot assume the first element is aligned because of sub-matrices
+ const Index lhsAlignmentOffset = lhs.firstAligned(depth);
+ const Index rhsAlignmentOffset = rhs.firstAligned(rows);
+
+  // find how many rows we have to skip to be aligned with rhs (if possible)
+ Index skipRows = 0;
+ // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+ if( (sizeof(LhsScalar)!=sizeof(RhsScalar))
+ || (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth)
+ || (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows))
+ {
+ alignedSize = 0;
+ alignedStart = 0;
+ alignmentPattern = NoneAligned;
+ }
+ else if(LhsPacketSize > 4)
+ {
+ // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+ alignmentPattern = NoneAligned;
+ }
+ else if (LhsPacketSize>1)
+ {
+ // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
+
+ while (skipRows<LhsPacketSize &&
+ alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
+ ++skipRows;
+ if (skipRows==LhsPacketSize)
+ {
+        // nothing can be aligned, no need to skip any row
+ alignmentPattern = NoneAligned;
+ skipRows = 0;
+ }
+ else
+ {
+ skipRows = (std::min)(skipRows,Index(rows));
+        // note that the skipped rows are processed later.
+ }
+ /* eigen_internal_assert( alignmentPattern==NoneAligned
+ || LhsPacketSize==1
+ || (skipRows + rowsAtOnce >= rows)
+ || LhsPacketSize > depth
+ || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
+ }
+ else if(Vectorizable)
+ {
+ alignedStart = 0;
+ alignedSize = depth;
+ alignmentPattern = AllAligned;
+ }
+
+ const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+ const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+
+ Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+ for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
+ {
+ EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
+ ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
+
+    // this helps the compiler generate good binary code
+ const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
+ lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
+
+ if (Vectorizable)
+ {
+ /* explicit vectorization */
+ ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
+ ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
+
+ // process initial unaligned coeffs
+      // FIXME this loop gets vectorized by the compiler !
+ for (Index j=0; j<alignedStart; ++j)
+ {
+ RhsScalar b = rhs(j, 0);
+ tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+ tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
+ }
+
+ if (alignedSize>alignedStart)
+ {
+ switch(alignmentPattern)
+ {
+ case AllAligned:
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
+ break;
+ case EvenAligned:
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
+ break;
+ case FirstAligned:
+ {
+ Index j = alignedStart;
+ if (peels>1)
+ {
+            /* Here we process 4 rows with two peeled iterations to hide
+             * the overhead of unaligned loads. Moreover, unaligned loads are handled
+             * using special shift/move operations between the two aligned packets
+             * overlapping the desired unaligned packet. This is *much* more efficient
+ * than basic unaligned loads.
+ */
+ LhsPacket A01, A02, A03, A11, A12, A13;
+ A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+ A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+ A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
+
+ for (; j<peeledSize; j+=peels*RhsPacketSize)
+ {
+ RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
+ A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
+ A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
+ A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
+
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
+ ptmp1 = pcj.pmadd(A01, b, ptmp1);
+ A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
+ ptmp2 = pcj.pmadd(A02, b, ptmp2);
+ A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
+ ptmp3 = pcj.pmadd(A03, b, ptmp3);
+ A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
+
+ b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
+ ptmp1 = pcj.pmadd(A11, b, ptmp1);
+ ptmp2 = pcj.pmadd(A12, b, ptmp2);
+ ptmp3 = pcj.pmadd(A13, b, ptmp3);
+ }
+ }
+ for (; j<alignedSize; j+=RhsPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
+ break;
+ }
+ default:
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
+ break;
+ }
+ tmp0 += predux(ptmp0);
+ tmp1 += predux(ptmp1);
+ tmp2 += predux(ptmp2);
+ tmp3 += predux(ptmp3);
+ }
+ } // end explicit vectorization
+
+ // process remaining coeffs (or all if no explicit vectorization)
+    // FIXME this loop gets vectorized by the compiler !
+ for (Index j=alignedSize; j<depth; ++j)
+ {
+ RhsScalar b = rhs(j, 0);
+ tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+ tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
+ }
+ res[i*resIncr] += alpha*tmp0;
+ res[(i+offset1)*resIncr] += alpha*tmp1;
+ res[(i+2)*resIncr] += alpha*tmp2;
+ res[(i+offset3)*resIncr] += alpha*tmp3;
+ }
+
+  // process remaining first and last rows (at most rowsAtOnce-1)
+ Index end = rows;
+ Index start = rowBound;
+ do
+ {
+ for (Index i=start; i<end; ++i)
+ {
+ EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
+ ResPacket ptmp0 = pset1<ResPacket>(tmp0);
+ const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
+ // process first unaligned result's coeffs
+      // FIXME this loop gets vectorized by the compiler !
+ for (Index j=0; j<alignedStart; ++j)
+ tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
+
+ if (alignedSize>alignedStart)
+ {
+ // process aligned rhs coeffs
+ if (lhs0.template aligned<LhsPacket>(alignedStart))
+ for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
+ else
+ for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
+ tmp0 += predux(ptmp0);
+ }
+
+ // process remaining scalars
+      // FIXME this loop gets vectorized by the compiler !
+ for (Index j=alignedSize; j<depth; ++j)
+ tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
+ res[i*resIncr] += alpha*tmp0;
+ }
+ if (skipRows)
+ {
+ start = 0;
+ end = skipRows;
+ skipRows = 0;
+ }
+ else
+ break;
+ } while(Vectorizable);
+
+ #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
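
The column-major kernel's "4 columns at once" strategy boils down to accumulating four scaled columns into the result per sweep, so res is loaded and stored only once per group of columns. A scalar-only sketch of that idea (the real kernel adds packet loads, the alignment cases and peeling on top of this):

    // res += alpha * A * x, with A stored column-major (lda >= rows).
    void gemv_colmajor_sketch(int rows, int cols, double alpha,
                              const double* A, int lda,
                              const double* x, double* res) {
      int i = 0;
      for (; i + 4 <= cols; i += 4) {           // 4 columns per sweep: one pass over res
        const double c0 = alpha * x[i + 0], c1 = alpha * x[i + 1],
                     c2 = alpha * x[i + 2], c3 = alpha * x[i + 3];
        for (int j = 0; j < rows; ++j)
          res[j] += c0 * A[j + (i + 0) * lda] + c1 * A[j + (i + 1) * lda]
                  + c2 * A[j + (i + 2) * lda] + c3 * A[j + (i + 3) * lda];
      }
      for (; i < cols; ++i) {                   // leftover columns, one at a time
        const double c = alpha * x[i];
        for (int j = 0; j < rows; ++j)
          res[j] += c * A[j + i * lda];
      }
    }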
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
new file mode 100644
index 0000000000..1cb9fe6b5a
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
@@ -0,0 +1,131 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * General matrix-vector product functionality based on ?GEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-vector multiplication using BLAS
+* gemv function via partial specialization of
+* general_matrix_vector_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemv specialization
+
+template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv :
+ general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+static void run( \
+ Index rows, Index cols, \
+ const Scalar* lhs, Index lhsStride, \
+ const Scalar* rhs, Index rhsIncr, \
+ Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+ if (ConjugateLhs) { \
+ general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
+ rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+ } else { \
+ general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+ rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+ } \
+} \
+}; \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+static void run( \
+ Index rows, Index cols, \
+ const Scalar* lhs, Index lhsStride, \
+ const Scalar* rhs, Index rhsIncr, \
+ Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+ general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+ rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+} \
+}; \
+
+EIGEN_MKL_GEMV_SPECIALIZE(double)
+EIGEN_MKL_GEMV_SPECIALIZE(float)
+EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_GEMV_SPECIALIZE(scomplex)
+
+#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
+template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
+\
+static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* rhs, Index rhsIncr, \
+ EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
+{ \
+ MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
+ MKLTYPE alpha_, beta_; \
+ const EIGTYPE *x_ptr, myone(1); \
+ char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
+ if (LhsStorageOrder==RowMajor) { \
+ m=cols; \
+ n=rows; \
+ }\
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+ GEMVVector x_tmp; \
+ if (ConjugateRhs) { \
+ Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
+ x_tmp=map_x.conjugate(); \
+ x_ptr=x_tmp.data(); \
+ incx=1; \
+ } else x_ptr=rhs; \
+ MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+}\
+};
+
+EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d)
+EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s)
+EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
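
For the RowMajor case, the macro above reuses the column-major ?gemv by viewing the row-major buffer as a transposed column-major matrix: m and n are swapped and trans is set to 'T' (or 'C' when the lhs must be conjugated). A plain-loop sketch of why this yields A*x (reference loops only, not an MKL call):

    #include <cstdio>

    // Reference column-major gemv with trans='T': y := alpha*A^T*x + beta*y,
    // where A is m x n, column-major with leading dimension lda.
    static void gemv_trans_ref(int m, int n, double alpha, const double* A, int lda,
                               const double* x, double beta, double* y) {
      for (int j = 0; j < n; ++j) {
        double acc = 0.0;
        for (int i = 0; i < m; ++i)
          acc += A[i + j * lda] * x[i];
        y[j] = beta * y[j] + alpha * acc;
      }
    }

    int main() {
      // Row-major 2 x 3 matrix R and vector x; we want y = R*x.
      const int rows = 2, cols = 3;
      const double R[rows * cols] = {1, 2, 3,
                                     4, 5, 6};
      const double x[cols] = {1, 10, 100};

      // Reinterpret R's buffer as a column-major cols x rows matrix
      // (m=cols, n=rows, lda=cols) and apply trans='T': this is exactly R*x.
      double y[rows] = {0, 0};
      gemv_trans_ref(cols, rows, 1.0, R, cols, x, 0.0, y);

      std::printf("%g %g\n", y[0], y[1]);   // expect 321 654
      return 0;
    }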
diff --git a/third_party/eigen3/Eigen/src/Core/products/Parallelizer.h b/third_party/eigen3/Eigen/src/Core/products/Parallelizer.h
new file mode 100644
index 0000000000..837e69415b
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/Parallelizer.h
@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARALLELIZER_H
+#define EIGEN_PARALLELIZER_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal */
+inline void manage_multi_threading(Action action, int* v)
+{
+ static EIGEN_UNUSED int m_maxThreads = -1;
+
+ if(action==SetAction)
+ {
+ eigen_internal_assert(v!=0);
+ m_maxThreads = *v;
+ }
+ else if(action==GetAction)
+ {
+ eigen_internal_assert(v!=0);
+ #ifdef EIGEN_HAS_OPENMP
+ if(m_maxThreads>0)
+ *v = m_maxThreads;
+ else
+ *v = omp_get_max_threads();
+ #else
+ *v = 1;
+ #endif
+ }
+ else
+ {
+ eigen_internal_assert(false);
+ }
+}
+
+}
+
+/** Must be called first when calling Eigen from multiple threads */
+inline void initParallel()
+{
+ int nbt;
+ internal::manage_multi_threading(GetAction, &nbt);
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+}
+
+/** \returns the max number of threads reserved for Eigen
+ * \sa setNbThreads */
+inline int nbThreads()
+{
+ int ret;
+ internal::manage_multi_threading(GetAction, &ret);
+ return ret;
+}
+
+/** Sets the max number of threads reserved for Eigen
+ * \sa nbThreads */
+inline void setNbThreads(int v)
+{
+ internal::manage_multi_threading(SetAction, &v);
+}
+
+namespace internal {
+
+template<typename Index> struct GemmParallelInfo
+{
+ GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+
+ int volatile sync;
+ int volatile users;
+
+ Index lhs_start;
+ Index lhs_length;
+};
+
+template<bool Condition, typename Functor, typename Index>
+void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
+{
+ // TODO when EIGEN_USE_BLAS is defined,
+ // we should still enable OMP for other scalar types
+#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
+ // FIXME the transpose variable is only needed to properly split
+ // the matrix product when multithreading is enabled. This is a temporary
+ // fix to support row-major destination matrices. This whole
+  // parallelizer mechanism has to be redesigned anyway.
+ EIGEN_UNUSED_VARIABLE(transpose);
+ func(0,rows, 0,cols);
+#else
+
+ // Dynamically check whether we should enable or disable OpenMP.
+ // The conditions are:
+ // - the max number of threads we can create is greater than 1
+ // - we are not already in a parallel code
+ // - the sizes are large enough
+
+ // 1- are we already in a parallel session?
+  // FIXME omp_get_num_threads()>1 only works with OpenMP; what if the user does not use OpenMP?
+ if((!Condition) || (omp_get_num_threads()>1))
+ return func(0,rows, 0,cols);
+
+ Index size = transpose ? rows : cols;
+
+ // 2- compute the maximal number of threads from the size of the product:
+ // FIXME this has to be fine tuned
+ Index max_threads = std::max<Index>(1,size / 32);
+
+ // 3 - compute the number of threads we are going to use
+ Index threads = std::min<Index>(nbThreads(), max_threads);
+
+ if(threads==1)
+ return func(0,rows, 0,cols);
+
+ Eigen::initParallel();
+ func.initParallelSession();
+
+ if(transpose)
+ std::swap(rows,cols);
+
+ Index blockCols = (cols / threads) & ~Index(0x3);
+ Index blockRows = (rows / threads);
+ blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+
+ GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
+
+ #pragma omp parallel num_threads(threads)
+ {
+ Index i = omp_get_thread_num();
+ Index r0 = i*blockRows;
+ Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+
+ Index c0 = i*blockCols;
+ Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+
+ info[i].lhs_start = r0;
+ info[i].lhs_length = actualBlockRows;
+
+ if(transpose) func(c0, actualBlockCols, 0, rows, info);
+ else func(0, rows, c0, actualBlockCols, info);
+ }
+
+ delete[] info;
+#endif
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARALLELIZER_H
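
The public entry points defined here are how user code controls the GEMM parallelizer. A small usage sketch (threading only takes effect when Eigen is compiled with OpenMP, e.g. -fopenmp, and EIGEN_USE_BLAS is not defined):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::initParallel();                 // call once before using Eigen from several threads
      Eigen::setNbThreads(4);                // cap the GEMM parallelizer at 4 threads
      std::cout << "threads: " << Eigen::nbThreads() << "\n";

      Eigen::MatrixXd A = Eigen::MatrixXd::Random(1024, 1024);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(1024, 1024);
      Eigen::MatrixXd C = A * B;             // large enough for parallelize_gemm to split it
      std::cout << C.norm() << "\n";
      return 0;
    }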
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
new file mode 100644
index 0000000000..4a60ef7dc5
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -0,0 +1,523 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// pack a selfadjoint block diagonal for use with the gebp_kernel
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs
+{
+ template<int BlockRows> inline
+ void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count)
+ {
+ // normal copy
+ for(Index k=0; k<i; k++)
+ for(Index w=0; w<BlockRows; w++)
+ blockA[count++] = lhs(i+w,k); // normal
+ // symmetric copy
+ Index h = 0;
+ for(Index k=i; k<i+BlockRows; k++)
+ {
+ for(Index w=0; w<h; w++)
+ blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+
+ blockA[count++] = numext::real(lhs(k,k)); // real (diagonal)
+
+ for(Index w=h+1; w<BlockRows; w++)
+ blockA[count++] = lhs(i+w, k); // normal
+ ++h;
+ }
+ // transposed copy
+ for(Index k=i+BlockRows; k<cols; k++)
+ for(Index w=0; w<BlockRows; w++)
+ blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+ }
+ void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+ {
+ enum { PacketSize = packet_traits<Scalar>::size };
+ const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
+ Index count = 0;
+ //Index peeled_mc3 = (rows/Pack1)*Pack1;
+
+ const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+ const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
+ if(Pack1>=3*PacketSize)
+ for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+ pack<3*PacketSize>(blockA, lhs, cols, i, count);
+
+ if(Pack1>=2*PacketSize)
+ for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+ pack<2*PacketSize>(blockA, lhs, cols, i, count);
+
+ if(Pack1>=1*PacketSize)
+ for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+ pack<1*PacketSize>(blockA, lhs, cols, i, count);
+
+ // do the same with mr==1
+ for(Index i=peeled_mc1; i<rows; i++)
+ {
+ for(Index k=0; k<i; k++)
+ blockA[count++] = lhs(i, k); // normal
+
+ blockA[count++] = numext::real(lhs(i, i)); // real (diagonal)
+
+ for(Index k=i+1; k<cols; k++)
+ blockA[count++] = numext::conj(lhs(k, i)); // transposed
+ }
+ }
+};
+
+template<typename Scalar, typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs
+{
+ enum { PacketSize = packet_traits<Scalar>::size };
+ void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+ {
+ Index end_k = k2 + rows;
+ Index count = 0;
+ const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+
+ // first part: normal case
+ for(Index j2=0; j2<k2; j2+=nr)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ if (nr>=4)
+ {
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ }
+ if (nr>=8)
+ {
+ blockB[count+4] = rhs(k,j2+4);
+ blockB[count+5] = rhs(k,j2+5);
+ blockB[count+6] = rhs(k,j2+6);
+ blockB[count+7] = rhs(k,j2+7);
+ }
+ count += nr;
+ }
+ }
+
+ // second part: diagonal block
+ Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+ if(nr>=8)
+ {
+ for(Index j2=k2; j2<end8; j2+=8)
+ {
+ // again we can split vertically into three different parts (transpose, symmetric, normal)
+ // transpose
+ for(Index k=k2; k<j2; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ blockB[count+4] = numext::conj(rhs(j2+4,k));
+ blockB[count+5] = numext::conj(rhs(j2+5,k));
+ blockB[count+6] = numext::conj(rhs(j2+6,k));
+ blockB[count+7] = numext::conj(rhs(j2+7,k));
+ count += 8;
+ }
+ // symmetric
+ Index h = 0;
+ for(Index k=j2; k<j2+8; k++)
+ {
+ // normal
+ for (Index w=0 ; w<h; ++w)
+ blockB[count+w] = rhs(k,j2+w);
+
+ blockB[count+h] = numext::real(rhs(k,k));
+
+ // transpose
+ for (Index w=h+1 ; w<8; ++w)
+ blockB[count+w] = numext::conj(rhs(j2+w,k));
+ count += 8;
+ ++h;
+ }
+ // normal
+ for(Index k=j2+8; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ blockB[count+4] = rhs(k,j2+4);
+ blockB[count+5] = rhs(k,j2+5);
+ blockB[count+6] = rhs(k,j2+6);
+ blockB[count+7] = rhs(k,j2+7);
+ count += 8;
+ }
+ }
+ }
+ if(nr>=4)
+ {
+ for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
+ {
+ // again we can split vertically into three different parts (transpose, symmetric, normal)
+ // transpose
+ for(Index k=k2; k<j2; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ count += 4;
+ }
+ // symmetric
+ Index h = 0;
+ for(Index k=j2; k<j2+4; k++)
+ {
+ // normal
+ for (Index w=0 ; w<h; ++w)
+ blockB[count+w] = rhs(k,j2+w);
+
+ blockB[count+h] = numext::real(rhs(k,k));
+
+ // transpose
+ for (Index w=h+1 ; w<4; ++w)
+ blockB[count+w] = numext::conj(rhs(j2+w,k));
+ count += 4;
+ ++h;
+ }
+ // normal
+ for(Index k=j2+4; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ count += 4;
+ }
+ }
+ }
+
+ // third part: transposed
+ if(nr>=8)
+ {
+ for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ blockB[count+4] = numext::conj(rhs(j2+4,k));
+ blockB[count+5] = numext::conj(rhs(j2+5,k));
+ blockB[count+6] = numext::conj(rhs(j2+6,k));
+ blockB[count+7] = numext::conj(rhs(j2+7,k));
+ count += 8;
+ }
+ }
+ }
+ if(nr>=4)
+ {
+ for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ count += 4;
+ }
+ }
+ }
+
+ // copy the remaining columns one at a time (=> the same with nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ // transpose
+ Index half = (std::min)(end_k,j2);
+ for(Index k=k2; k<half; k++)
+ {
+ blockB[count] = numext::conj(rhs(j2,k));
+ count += 1;
+ }
+
+ if(half==j2 && half<k2+rows)
+ {
+ blockB[count] = numext::real(rhs(j2,j2));
+ count += 1;
+ }
+ else
+ half--;
+
+ // normal
+ for(Index k=half+1; k<k2+rows; k++)
+ {
+ blockB[count] = rhs(k,j2);
+ count += 1;
+ }
+ }
+ }
+};
+
+/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+ int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+ int ResStorageOrder>
+struct product_selfadjoint_matrix;
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+ int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
+{
+
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha)
+ {
+ product_selfadjoint_matrix<Scalar, Index,
+ EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+ RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
+ EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+ LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
+ ColMajor>
+ ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha);
+ }
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
+{
+
+ static EIGEN_DONT_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha);
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resStride,
+ const Scalar& alpha)
+ {
+ Index size = rows;
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = size; // cache block size along the K direction
+ Index mc = rows; // cache block size along the M direction
+ Index nc = cols; // cache block size along the N direction
+ computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, Index(1));
+ // kc must be smaller than mc
+ kc = (std::min)(kc,mc);
+
+ std::size_t sizeB = kc*cols;
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
+ ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
+ Scalar* blockB = allocatedBlockB;
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+ gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+
+ for(Index k2=0; k2<size; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+ // we have selected one row panel of rhs and one column panel of lhs
+ // pack rhs's panel into a sequential chunk of memory
+ // and expand each coeff to a constant packet for further reuse
+ pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
+
+ // the selected lhs's panel has to be split into three different parts:
+ // 1 - the transposed panel above the diagonal block => transposed packed copy
+ // 2 - the diagonal block => special packed copy
+ // 3 - the panel below the diagonal block => generic packed copy
+ for(Index i2=0; i2<k2; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,k2)-i2;
+ // transposed packed copy
+ pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ // the block diagonal
+ {
+ const Index actual_mc = (std::min)(k2+kc,size)-k2;
+ // symmetric packed copy
+ pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+
+ for(Index i2=k2+kc; i2<size; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,size)-i2;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+ (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ }
+ }
+
+// matrix * selfadjoint product
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
+{
+
+ static EIGEN_DONT_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha);
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resStride,
+ const Scalar& alpha)
+ {
+ Index size = cols;
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ ResMapper res(_res,resStride);
+
+ Index kc = size; // cache block size along the K direction
+ Index mc = rows; // cache block size along the M direction
+ Index nc = cols; // cache block size along the N direction
+ computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, Index(1));
+ std::size_t sizeB = kc*cols;
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
+ ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
+ Scalar* blockB = allocatedBlockB;
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+
+ for(Index k2=0; k2<size; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+ pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
+
+ // => GEPP
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ }
+ }
+
+} // end namespace internal
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_matrix
+***************************************************************************/
+
+namespace internal {
+template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
+ : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
+ : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+ SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ enum {
+ LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
+ LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
+ RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
+ RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
+ };
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+ * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+ internal::product_selfadjoint_matrix<Scalar, Index,
+ EIGEN_LOGICAL_XOR(LhsIsUpper,
+ internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
+ NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
+ EIGEN_LOGICAL_XOR(RhsIsUpper,
+ internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
+ NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
+ internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
+ ::run(
+ lhs.rows(), rhs.cols(), // sizes
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
+ &dst.coeffRef(0,0), dst.outerStride(), // result info
+ actualAlpha // alpha
+ );
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
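
At the API level, the product_selfadjoint_matrix kernel defined above is what evaluates a dense product with a selfadjoint view on one side. A minimal usage sketch (standard Eigen API; sizes are arbitrary):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 32);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(64, 32);

  // Only the lower triangle of A is referenced; the expression is evaluated
  // by the SYMM-like kernel above rather than by the general GEMM path.
  C.noalias() += A.selfadjointView<Eigen::Lower>() * B;
  return 0;
}
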
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
new file mode 100644
index 0000000000..dfa687fefe
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
@@ -0,0 +1,295 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
+{\
+\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha) \
+ { \
+ char side='L', uplo='L'; \
+ MKL_INT m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX b_tmp; \
+ EIGTYPE myone(1);\
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)lhsStride; \
+ ldb = (MKL_INT)rhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if (LhsStorageOrder==RowMajor) uplo='U'; \
+ a = _lhs; \
+\
+ if (RhsStorageOrder==RowMajor) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.adjoint(); \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } else b = _rhs; \
+\
+ MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+ } \
+};
+
+
+#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
+{\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha) \
+ { \
+ char side='L', uplo='L'; \
+ MKL_INT m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX b_tmp; \
+ Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
+ EIGTYPE myone(1); \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)lhsStride; \
+ ldb = (MKL_INT)rhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
+ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
+ a_tmp = lhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else a = _lhs; \
+ if (LhsStorageOrder==RowMajor) uplo='U'; \
+\
+ if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
+ b = _rhs; } \
+ else { \
+ if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.conjugate(); \
+ } else \
+ if (ConjugateRhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.adjoint(); \
+ } else { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.transpose(); \
+ } \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } \
+\
+ MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+ } \
+};
+
+EIGEN_MKL_SYMM_L(double, double, d, d)
+EIGEN_MKL_SYMM_L(float, float, f, s)
+EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
+
+
+/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
+{\
+\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha) \
+ { \
+ char side='R', uplo='L'; \
+ MKL_INT m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX b_tmp; \
+ EIGTYPE myone(1);\
+\
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)rhsStride; \
+ ldb = (MKL_INT)lhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if (RhsStorageOrder==RowMajor) uplo='U'; \
+ a = _rhs; \
+\
+ if (LhsStorageOrder==RowMajor) { \
+      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.adjoint(); \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } else b = _lhs; \
+\
+ MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+ } \
+};
+
+
+#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
+{\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha) \
+ { \
+ char side='R', uplo='L'; \
+ MKL_INT m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX b_tmp; \
+ Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
+ EIGTYPE myone(1); \
+\
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)rhsStride; \
+ ldb = (MKL_INT)lhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
+ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
+ a_tmp = rhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else a = _rhs; \
+ if (RhsStorageOrder==RowMajor) uplo='U'; \
+\
+ if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
+ b = _lhs; } \
+ else { \
+ if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.conjugate(); \
+ } else \
+ if (ConjugateLhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.adjoint(); \
+ } else { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.transpose(); \
+ } \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } \
+\
+ MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+ } \
+};
+
+EIGEN_MKL_SYMM_R(double, double, d, d)
+EIGEN_MKL_SYMM_R(float, float, f, s)
+EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
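
These ?SYMM/?HEMM bindings only replace the generic kernel for float, double, scomplex and dcomplex with a column-major destination; other cases keep the implementation from SelfadjointMatrixMatrix.h. A sketch of how they are enabled, assuming a build that compiles and links against MKL:

// Defining EIGEN_USE_MKL_ALL before including Eigen activates the MKL
// specializations above, so this product is forwarded to ?HEMM (zhemm here).
#define EIGEN_USE_MKL_ALL
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(128, 128);
  Eigen::MatrixXcd B = Eigen::MatrixXcd::Random(128, 64);
  Eigen::MatrixXcd C = Eigen::MatrixXcd::Zero(128, 64);
  C.noalias() += A.selfadjointView<Eigen::Lower>() * B;
  return 0;
}
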
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h
new file mode 100644
index 0000000000..fdc81205ab
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -0,0 +1,281 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized selfadjoint matrix * vector product:
+ * This algorithm processes 2 columns at once, which allows us both to reduce
+ * the number of loads/stores of the result by a factor of 2 and to reduce
+ * the instruction dependency.
+ */
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized>
+struct selfadjoint_matrix_vector_product;
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+struct selfadjoint_matrix_vector_product
+
+{
+static EIGEN_DONT_INLINE void run(
+ Index size,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsIncr,
+ Scalar* res,
+ Scalar alpha);
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+ Index size,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsIncr,
+ Scalar* res,
+ Scalar alpha)
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+ enum {
+ IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
+ IsLower = UpLo == Lower ? 1 : 0,
+ FirstTriangular = IsRowMajor == IsLower
+ };
+
+ conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0;
+ conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+ conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
+
+ conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0;
+ conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
+
+ Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
+
+ // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
+ // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
+ // this is because we need to extract packets
+ ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);
+ if (rhsIncr!=1)
+ {
+ const Scalar* it = _rhs;
+ for (Index i=0; i<size; ++i, it+=rhsIncr)
+ rhs[i] = *it;
+ }
+
+ Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+ if (FirstTriangular)
+ bound = size - bound;
+
+ for (Index j=FirstTriangular ? bound : 0;
+ j<(FirstTriangular ? size : bound);j+=2)
+ {
+ const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+ const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+
+ Scalar t0 = cjAlpha * rhs[j];
+ Packet ptmp0 = pset1<Packet>(t0);
+ Scalar t1 = cjAlpha * rhs[j+1];
+ Packet ptmp1 = pset1<Packet>(t1);
+
+ Scalar t2(0);
+ Packet ptmp2 = pset1<Packet>(t2);
+ Scalar t3(0);
+ Packet ptmp3 = pset1<Packet>(t3);
+
+ size_t starti = FirstTriangular ? 0 : j+2;
+ size_t endi = FirstTriangular ? j : size;
+ size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
+ size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+
+ // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
+ res[j] += cjd.pmul(numext::real(A0[j]), t0);
+ res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
+ if(FirstTriangular)
+ {
+ res[j] += cj0.pmul(A1[j], t1);
+ t3 += cj1.pmul(A1[j], rhs[j]);
+ }
+ else
+ {
+ res[j+1] += cj0.pmul(A0[j+1],t0);
+ t2 += cj1.pmul(A0[j+1], rhs[j+1]);
+ }
+
+ for (size_t i=starti; i<alignedStart; ++i)
+ {
+ res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ t3 += cj1.pmul(A1[i], rhs[i]);
+ }
+ // Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed up)
+ // gcc 4.2 does this optimization automatically.
+ const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+ const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
+ const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+ Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+ for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+ {
+ Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize;
+ Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize;
+ Packet Bi = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
+ Packet Xi = pload <Packet>(resIt);
+
+ Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
+ ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
+ ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
+ pstore(resIt,Xi); resIt += PacketSize;
+ }
+ for (size_t i=alignedEnd; i<endi; i++)
+ {
+ res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ t3 += cj1.pmul(A1[i], rhs[i]);
+ }
+
+ res[j] += alpha * (t2 + predux(ptmp2));
+ res[j+1] += alpha * (t3 + predux(ptmp3));
+ }
+ for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
+ {
+ const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+
+ Scalar t1 = cjAlpha * rhs[j];
+ Scalar t2(0);
+ // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
+ res[j] += cjd.pmul(numext::real(A0[j]), t1);
+ for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
+ {
+ res[i] += cj0.pmul(A0[i], t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ }
+ res[j] += alpha * t2;
+ }
+}
+
+} // end namespace internal
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_vector
+***************************************************************************/
+
+namespace internal {
+template<typename Lhs, int LhsMode, typename Rhs>
+struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
+ : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, int LhsMode, typename Rhs>
+struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
+ : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+ enum {
+ LhsUpLo = LhsMode&(Upper|Lower)
+ };
+
+ SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+ {
+ typedef typename Dest::Scalar ResScalar;
+ typedef typename Base::RhsScalar RhsScalar;
+ typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+
+ eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+ * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+ enum {
+ EvalToDest = (Dest::InnerStrideAtCompileTime==1),
+ UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
+ };
+
+ internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
+ internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+
+ ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+ EvalToDest ? dest.data() : static_dest.data());
+
+ ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(),
+ UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
+
+ if(!EvalToDest)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ int size = dest.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ MappedDest(actualDestPtr, dest.size()) = dest;
+ }
+
+ if(!UseRhs)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ int size = rhs.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+ }
+
+
+ internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
+ (
+ lhs.rows(), // size
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ actualRhsPtr, 1, // rhs info
+ actualDestPtr, // result info
+ actualAlpha // scale factor
+ );
+
+ if(!EvalToDest)
+ dest = MappedDest(actualDestPtr, dest.size());
+ }
+};
+
+namespace internal {
+template<typename Lhs, typename Rhs, int RhsMode>
+struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
+ : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, typename Rhs, int RhsMode>
+struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
+ : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+ enum {
+ RhsUpLo = RhsMode&(Upper|Lower)
+ };
+
+ SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+ {
+ // let's simply transpose the product
+ Transpose<Dest> destT(dest);
+ SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+ Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
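
The selfadjoint matrix * vector kernel above backs expressions of the following form; a minimal usage sketch (standard Eigen API):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 100);
  Eigen::VectorXd x = Eigen::VectorXd::Random(100);
  Eigen::VectorXd y = Eigen::VectorXd::Zero(100);

  // Only the upper triangle of A is referenced; this runs the
  // two-columns-at-a-time SYMV/HEMV-like routine defined above.
  y.noalias() += A.selfadjointView<Eigen::Upper>() * x;
  return 0;
}
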
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
new file mode 100644
index 0000000000..86684b66d9
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
@@ -0,0 +1,114 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Selfadjoint matrix-vector product functionality based on ?SYMV/?HEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements selfadjoint matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// symv/hemv specialization
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
+struct selfadjoint_matrix_vector_product_symv :
+ selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
+static void run( \
+ Index size, const Scalar* lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \
+ enum {\
+ IsColMajor = StorageOrder==ColMajor \
+ }; \
+ if (IsColMajor == ConjugateLhs) {\
+ selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
+ size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \
+ } else {\
+ selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
+ size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \
+ }\
+ } \
+}; \
+
+EIGEN_MKL_SYMV_SPECIALIZE(double)
+EIGEN_MKL_SYMV_SPECIALIZE(float)
+EIGEN_MKL_SYMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_SYMV_SPECIALIZE(scomplex)
+
+#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
+\
+static void run( \
+Index size, const EIGTYPE* lhs, Index lhsStride, \
+const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \
+{ \
+ enum {\
+ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
+ IsLower = UpLo == Lower ? 1 : 0 \
+ }; \
+ MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \
+ MKLTYPE alpha_, beta_; \
+ const EIGTYPE *x_ptr, myone(1); \
+ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+ SYMVVector x_tmp; \
+ if (ConjugateRhs) { \
+ Map<const SYMVVector, 0, InnerStride<> > map_x(_rhs,size,1,InnerStride<>(incx)); \
+ x_tmp=map_x.conjugate(); \
+ x_ptr=x_tmp.data(); \
+ incx=1; \
+ } else x_ptr=_rhs; \
+ MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+}\
+};
+
+EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv)
+EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv)
+EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointProduct.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointProduct.h
new file mode 100644
index 0000000000..6ca4ae6c0f
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointProduct.h
@@ -0,0 +1,123 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_PRODUCT_H
+#define EIGEN_SELFADJOINT_PRODUCT_H
+
+/**********************************************************************
+* This file implements a selfadjoint product: C += A A^T, updating only
+* half of the selfadjoint matrix C.
+* It corresponds to the level 3 SYRK and level 2 SYR BLAS routines.
+**********************************************************************/
+
+namespace Eigen {
+
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs>
+{
+ static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+ {
+ internal::conj_if<ConjRhs> cj;
+ typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+ typedef typename internal::conditional<ConjLhs,typename OtherMap::ConjugateReturnType,const OtherMap&>::type ConjLhsType;
+ for (Index i=0; i<size; ++i)
+ {
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1)))
+ += (alpha * cj(vecY[i])) * ConjLhsType(OtherMap(vecX+(UpLo==Lower ? i : 0),UpLo==Lower ? size-i : (i+1)));
+ }
+ }
+};
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs>
+{
+ static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+ {
+ selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,stride,vecY,vecX,alpha);
+ }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime>
+struct selfadjoint_product_selector;
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
+{
+ static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+ typedef internal::blas_traits<OtherType> OtherBlasTraits;
+ typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+ typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+ typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+ Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+ enum {
+ StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+ UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1
+ };
+ internal::gemv_static_vector_if<Scalar,OtherType::SizeAtCompileTime,OtherType::MaxSizeAtCompileTime,!UseOtherDirectly> static_other;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(),
+ (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
+
+ if(!UseOtherDirectly)
+ Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
+
+ selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+ OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>
+ ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualOtherPtr, actualAlpha);
+ }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
+{
+ static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+ typedef internal::blas_traits<OtherType> OtherBlasTraits;
+ typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+ typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+ typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+ Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+ enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+
+ internal::general_matrix_matrix_triangular_product<Index,
+ Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+ MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
+ ::run(mat.cols(), actualOther.cols(),
+ &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
+ mat.data(), mat.outerStride(), actualAlpha);
+ }
+};
+
+// high level API
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU>
+SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
+{
+ selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
+
+ return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_PRODUCT_H
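
The rank update entry point defined above is reached through SelfAdjointView::rankUpdate; a minimal usage sketch (standard Eigen API; alpha values arbitrary):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(50, 50);
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(50, 20);
  Eigen::VectorXd u = Eigen::VectorXd::Random(50);

  // Rank-k update C += 0.5 * A * A^T, writing only the lower half (SYRK-like).
  C.selfadjointView<Eigen::Lower>().rankUpdate(A, 0.5);

  // Rank-1 update C += u * u^T on the same half (SYR-like).
  C.selfadjointView<Eigen::Lower>().rankUpdate(u, 1.0);
  return 0;
}
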
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h
new file mode 100644
index 0000000000..8594a97cea
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -0,0 +1,93 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINTRANK2UPDATE_H
+#define EIGEN_SELFADJOINTRANK2UPDATE_H
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu'
+ * It corresponds to the Level2 syr2 BLAS routine
+ */
+
+template<typename Scalar, typename Index, typename UType, typename VType, int UpLo>
+struct selfadjoint_rank2_update_selector;
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
+{
+ static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+ {
+ const Index size = u.size();
+ for (Index i=0; i<size; ++i)
+ {
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+i, size-i) +=
+ (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size-i)
+ + (alpha * numext::conj(v.coeff(i))) * u.tail(size-i);
+ }
+ }
+};
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Upper>
+{
+ static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+ {
+ const Index size = u.size();
+ for (Index i=0; i<size; ++i)
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i, i+1) +=
+ (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i+1)
+ + (alpha * numext::conj(v.coeff(i))) * u.head(i+1);
+ }
+};
+
+template<bool Cond, typename T> struct conj_expr_if
+ : conditional<!Cond, const T&,
+ CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>,T> > {};
+
+} // end namespace internal
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU, typename DerivedV>
+SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
+{
+ typedef internal::blas_traits<DerivedU> UBlasTraits;
+ typedef typename UBlasTraits::DirectLinearAccessType ActualUType;
+ typedef typename internal::remove_all<ActualUType>::type _ActualUType;
+ typename internal::add_const_on_value_type<ActualUType>::type actualU = UBlasTraits::extract(u.derived());
+
+ typedef internal::blas_traits<DerivedV> VBlasTraits;
+ typedef typename VBlasTraits::DirectLinearAccessType ActualVType;
+ typedef typename internal::remove_all<ActualVType>::type _ActualVType;
+ typename internal::add_const_on_value_type<ActualVType>::type actualV = VBlasTraits::extract(v.derived());
+
+ // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
+ // vice versa, and take the complex conjugate of all coefficients and vector entries.
+
+ enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+ Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived())
+ * numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
+ if (IsRowMajor)
+ actualAlpha = numext::conj(actualAlpha);
+
+ internal::selfadjoint_rank2_update_selector<Scalar, Index,
+ typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type,
+ typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type,
+ (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
+ ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha);
+
+ return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINTRANK2UPDATE_H
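
The rank-2 update above is exposed through the three-argument rankUpdate overload; a minimal usage sketch (standard Eigen API):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(30, 30);
  Eigen::VectorXd u = Eigen::VectorXd::Random(30);
  Eigen::VectorXd v = Eigen::VectorXd::Random(30);

  // C += 0.25 * u * v^T + conj(0.25) * v * u^T, writing only the upper half
  // (the SYR2-like path implemented above).
  C.selfadjointView<Eigen::Upper>().rankUpdate(u, v, 0.25);
  return 0;
}
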
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h
new file mode 100644
index 0000000000..4cbb79da0c
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -0,0 +1,434 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
+// struct gemm_pack_lhs_triangular
+// {
+// Matrix<Scalar,mr,mr,
+// void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
+// {
+// conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+// const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
+// int count = 0;
+// const int peeled_mc = (rows/mr)*mr;
+// for(int i=0; i<peeled_mc; i+=mr)
+// {
+// for(int k=0; k<depth; k++)
+// for(int w=0; w<mr; w++)
+// blockA[count++] = cj(lhs(i+w, k));
+// }
+// for(int i=peeled_mc; i<rows; i++)
+// {
+// for(int k=0; k<depth; k++)
+// blockA[count++] = cj(lhs(i, k));
+// }
+// }
+// };
+
+/* Optimized triangular matrix * matrix (_TRMM++) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder, int Version = Specialized>
+struct product_triangular_matrix_matrix;
+
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,RowMajor,Version>
+{
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols, Index depth,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ product_triangular_matrix_matrix<Scalar, Index,
+ (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
+ (!LhsIsTriangular),
+ RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+ ConjugateRhs,
+ LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+ ConjugateLhs,
+ ColMajor>
+ ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
+ }
+};
+
+// implements col-major += alpha * op(triangular) * op(general)
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+{
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ SmallPanelWidth = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower,
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+ };
+
+ static EIGEN_DONT_INLINE void run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ // strip zeros
+ Index diagSize = (std::min)(_rows,_depth);
+ Index rows = IsLower ? _rows : diagSize;
+ Index depth = IsLower ? diagSize : _depth;
+ Index cols = _cols;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
+ triangularBuffer.setZero();
+ if((Mode&ZeroDiag)==ZeroDiag)
+ triangularBuffer.diagonal().setZero();
+ else
+ triangularBuffer.diagonal().setOnes();
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+
+ for(Index k2=IsLower ? depth : 0;
+ IsLower ? k2>0 : k2<depth;
+ IsLower ? k2-=kc : k2+=kc)
+ {
+ Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
+ Index actual_k2 = IsLower ? k2-actual_kc : k2;
+
+ // align blocks with the end of the triangular part for trapezoidal lhs
+ if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
+ {
+ actual_kc = rows-k2;
+ k2 = k2+actual_kc-kc;
+ }
+
+ pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
+
+    // the selected lhs panel has to be split into three different parts:
+    //  1 - the part which is zero => skip it
+    //  2 - the diagonal block => special kernel
+    //  3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
+    // (a hedged, unoptimized reference of this decomposition follows this kernel)
+
+ // the block diagonal, if any:
+ if(IsLower || actual_k2<rows)
+ {
+      // for each small vertical panel of lhs
+ for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+ Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
+ Index startBlock = actual_k2+k1;
+ Index blockBOffset = k1;
+
+ // => GEBP with the micro triangular block
+ // The trick is to pack this micro block while filling the opposite triangular part with zeros.
+ // To this end we do an extra triangular copy to a small temporary buffer
+ for (Index k=0;k<actualPanelWidth;++k)
+ {
+ if (SetDiag)
+ triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
+ for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
+ triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
+ }
+ pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
+
+ gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
+ actualPanelWidth, actualPanelWidth, cols, alpha,
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+
+ // GEBP with remaining micro panel
+ if (lengthTarget>0)
+ {
+ Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
+
+ pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
+
+ gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
+ lengthTarget, actualPanelWidth, cols, alpha,
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+ }
+ }
+ }
+ // the part below (lower case) or above (upper case) the diagonal => GEPP
+ {
+ Index start = IsLower ? k2 : 0;
+ Index end = IsLower ? rows : (std::min)(actual_k2,rows);
+ for(Index i2=start; i2<end; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,end)-i2;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+ (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
+ actual_kc, cols, alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+
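// Editor's sketch (not part of the patch): a minimal, unoptimized reference of
// the three-way panel split above, assuming a square lower-triangular lhs and
// using only Eigen's public API; the function name and the "panel" parameter
// are hypothetical.
#include <Eigen/Dense>
#include <algorithm>

// res += alpha * tril(A) * B, decomposed per column block of A the way the
// kernel splits each lhs panel.
void trmm_lower_lhs_reference(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B,
                              Eigen::MatrixXd& res, double alpha, int panel)
{
  const int n = static_cast<int>(A.rows());
  for (int k = 0; k < n; k += panel) {
    const int w = std::min(panel, n - k);
    // 2) diagonal micro-block: only its lower-triangular part contributes
    //    (the kernel packs it into the zero-padded triangularBuffer)
    Eigen::MatrixXd diag =
        A.block(k, k, w, w).triangularView<Eigen::Lower>() * B.middleRows(k, w);
    res.middleRows(k, w) += alpha * diag;
    // 3) dense panel below the diagonal block => plain GEPP-like update
    if (k + w < n)
      res.middleRows(k + w, n - k - w).noalias() +=
          alpha * A.block(k + w, k, n - k - w, w) * B.middleRows(k, w);
    // 1) the structurally zero part above the diagonal block is simply skipped
  }
}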
+// implements col-major += alpha * op(general) * op(triangular)
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+{
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower,
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+ };
+
+ static EIGEN_DONT_INLINE void run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ // strip zeros
+ Index diagSize = (std::min)(_cols,_depth);
+ Index rows = _rows;
+ Index depth = IsLower ? _depth : diagSize;
+ Index cols = IsLower ? diagSize : _cols;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
+ triangularBuffer.setZero();
+ if((Mode&ZeroDiag)==ZeroDiag)
+ triangularBuffer.diagonal().setZero();
+ else
+ triangularBuffer.diagonal().setOnes();
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+
+ for(Index k2=IsLower ? 0 : depth;
+ IsLower ? k2<depth : k2>0;
+ IsLower ? k2+=kc : k2-=kc)
+ {
+ Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
+ Index actual_k2 = IsLower ? k2 : k2-actual_kc;
+
+ // align blocks with the end of the triangular part for trapezoidal rhs
+ if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
+ {
+ actual_kc = cols-k2;
+ k2 = actual_k2 + actual_kc - kc;
+ }
+
+ // remaining size
+ Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
+ // size of the triangular part
+ Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
+
+ Scalar* geb = blockB+ts*ts;
+ geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
+
+ pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
+
+      // pack the triangular part of the rhs, padding the unrolled blocks with zeros
+ if(ts>0)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index actual_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+ // general part
+ pack_rhs_panel(blockB+j2*actual_kc,
+ rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
+ panelLength, actualPanelWidth,
+ actual_kc, panelOffset);
+
+ // append the triangular part via a temporary buffer
+ for (Index j=0;j<actualPanelWidth;++j)
+ {
+ if (SetDiag)
+ triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
+ for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
+ triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
+ }
+
+ pack_rhs_panel(blockB+j2*actual_kc,
+ RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
+ actualPanelWidth, actualPanelWidth,
+ actual_kc, j2);
+ }
+ }
+
+ for (Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,rows-i2);
+ pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+ // triangular kernel
+ if(ts>0)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
+ Index blockOffset = IsLower ? j2 : 0;
+
+ gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
+ blockA, blockB+j2*actual_kc,
+ actual_mc, panelLength, actualPanelWidth,
+ alpha,
+ actual_kc, actual_kc, // strides
+ blockOffset, blockOffset);// offsets
+ }
+ }
+ gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
+ blockA, geb, actual_mc, actual_kc, rs,
+ alpha,
+ -1, -1, 0, 0);
+ }
+ }
+ }
+
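// Editor's sketch (not part of the patch): the matching unoptimized reference
// for the rhs-triangular path above, lower case, square triangular A assumed;
// the dense slice loosely corresponds to the packed "geb" panel and the
// triangular slice to the zero-padded packed panels.
#include <Eigen/Dense>
#include <algorithm>

// res += alpha * B * tril(A), decomposed per row block of A.
void trmm_lower_rhs_reference(const Eigen::MatrixXd& B, const Eigen::MatrixXd& A,
                              Eigen::MatrixXd& res, double alpha, int panel)
{
  const int n = static_cast<int>(A.rows());
  for (int k = 0; k < n; k += panel) {
    const int w = std::min(panel, n - k);
    // dense columns to the left of the diagonal block (the "geb" part)
    if (k > 0)
      res.leftCols(k).noalias() += alpha * B.middleCols(k, w) * A.block(k, 0, w, k);
    // triangular diagonal block (packed with zero padding in the kernel)
    Eigen::MatrixXd diag =
        B.middleCols(k, w) * A.block(k, k, w, w).triangularView<Eigen::Lower>();
    res.middleCols(k, w) += alpha * diag;
  }
}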
+/***************************************************************************
+* Wrapper to product_triangular_matrix_matrix
+***************************************************************************/
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
+ : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
+{};
+
+} // end namespace internal
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
+ : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+ TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+ * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+ typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+ Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
+
+ enum { IsLower = (Mode&Lower) == Lower };
+ Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols());
+ Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows());
+ Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
+ : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
+
+ BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
+
+ internal::product_triangular_matrix_matrix<Scalar, Index,
+ Mode, LhsIsTriangular,
+ (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+ (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+ (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
+ ::run(
+ stripedRows, stripedCols, stripedDepth, // sizes
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
+ &dst.coeffRef(0,0), dst.outerStride(), // result info
+ actualAlpha, blocking
+ );
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
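As a hedged usage sketch (editorial, not part of the patch), the wrapper above is what a user-level triangular product expression ultimately dispatches to; the sizes below are arbitrary.

#include <Eigen/Dense>
using namespace Eigen;

void triangular_product_example()
{
  MatrixXd A = MatrixXd::Random(64, 64);
  MatrixXd B = MatrixXd::Random(64, 32);
  MatrixXd C = MatrixXd::Zero(64, 32);
  // Calls TriangularProduct::scaleAndAddTo, which extracts the scalar factors
  // into alpha and forwards to internal::product_triangular_matrix_matrix::run.
  C.noalias() += A.triangularView<Lower>() * B;
}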
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
new file mode 100644
index 0000000000..ba41a1c99f
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
@@ -0,0 +1,309 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder>
+struct product_triangular_matrix_matrix_trmm :
+ product_triangular_matrix_matrix<Scalar,Index,Mode,
+ LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \
+ static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
+ const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
+ product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
+ LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
+ RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+ } \
+};
+
+EIGEN_MKL_TRMM_SPECIALIZE(double, true)
+EIGEN_MKL_TRMM_SPECIALIZE(double, false)
+EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_MKL_TRMM_SPECIALIZE(float, true)
+EIGEN_MKL_TRMM_SPECIALIZE(float, false)
+EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false)
+
+// implements col-major += alpha * op(triangular) * op(general)
+#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
+ LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \
+ }; \
+\
+ static void run( \
+ Index _rows, Index _cols, Index _depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+ { \
+ Index diagSize = (std::min)(_rows,_depth); \
+ Index rows = IsLower ? _rows : diagSize; \
+ Index depth = IsLower ? diagSize : _depth; \
+ Index cols = _cols; \
+\
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit MKL ?TRMM. Fall back to the default triangular product or call MKL ?GEMM */ \
+ if (rows != depth) { \
+\
+ int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+\
+ if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
+      /* Most likely no benefit in calling TRMM or GEMM from MKL */ \
+ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+ /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+ } else { \
+      /* It makes sense to call GEMM */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
+ MKL_INT aStride = aa_tmp.outerStride(); \
+ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
+ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
+\
+ /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+ } \
+ return; \
+ } \
+ char side = 'L', transa, uplo, diag = 'N'; \
+ EIGTYPE *b; \
+ const EIGTYPE *a; \
+ MKL_INT m, n, lda, ldb; \
+ MKLTYPE alpha_; \
+\
+/* Set alpha_*/ \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+\
+/* Set m, n */ \
+ m = (MKL_INT)diagSize; \
+ n = (MKL_INT)cols; \
+\
+/* Set trans */ \
+ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \
+ MatrixX##EIGPREFIX b_tmp; \
+\
+ if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+\
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixLhs a_tmp; \
+\
+ if ((conjA!=0) || (SetDiag==0)) { \
+ if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \
+ if (IsZeroDiag) \
+ a_tmp.diagonal().setZero(); \
+ else if (IsUnitDiag) \
+ a_tmp.diagonal().setOnes();\
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else { \
+ a = _lhs; \
+ lda = lhsStride; \
+ } \
+ /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+ MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+ res_tmp=res_tmp+b_tmp; \
+ } \
+};
+
+EIGEN_MKL_TRMM_L(double, double, d, d)
+EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMM_L(float, float, f, s)
+EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c)
+
+// implements col-major += alpha * op(general) * op(triangular)
+#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
+ LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \
+ }; \
+\
+ static void run( \
+ Index _rows, Index _cols, Index _depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+ { \
+ Index diagSize = (std::min)(_cols,_depth); \
+ Index rows = _rows; \
+ Index depth = IsLower ? _depth : diagSize; \
+ Index cols = IsLower ? diagSize : _cols; \
+\
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit MKL ?TRMM. Fall back to the default triangular product or call MKL ?GEMM */ \
+ if (cols != depth) { \
+\
+ int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+\
+ if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
+      /* Most likely no benefit in calling TRMM or GEMM from MKL */ \
+ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+ /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+ } else { \
+      /* It makes sense to call GEMM */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+ MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
+ MKL_INT aStride = aa_tmp.outerStride(); \
+ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
+ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
+\
+ /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+ } \
+ return; \
+ } \
+ char side = 'R', transa, uplo, diag = 'N'; \
+ EIGTYPE *b; \
+ const EIGTYPE *a; \
+ MKL_INT m, n, lda, ldb; \
+ MKLTYPE alpha_; \
+\
+/* Set alpha_*/ \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+\
+/* Set m, n */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)diagSize; \
+\
+/* Set trans */ \
+ transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixX##EIGPREFIX b_tmp; \
+\
+ if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+\
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+ MatrixRhs a_tmp; \
+\
+ if ((conjA!=0) || (SetDiag==0)) { \
+ if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \
+ if (IsZeroDiag) \
+ a_tmp.diagonal().setZero(); \
+ else if (IsUnitDiag) \
+ a_tmp.diagonal().setOnes();\
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else { \
+ a = _rhs; \
+ lda = rhsStride; \
+ } \
+ /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+ MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+ res_tmp=res_tmp+b_tmp; \
+ } \
+};
+
+EIGEN_MKL_TRMM_R(double, double, d, d)
+EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMM_R(float, float, f, s)
+EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
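Hedged build note (editorial): these ?TRMM bindings are compiled in only when Eigen's MKL support is enabled before any Eigen header is included, and they assume MKL headers and libraries are available to the build.

// Enables the Specialized product_triangular_matrix_matrix templates above,
// which forward square triangular*general products to the corresponding ?trmm.
#define EIGEN_USE_MKL_ALL
#include <Eigen/Dense>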
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h
new file mode 100644
index 0000000000..9863076958
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -0,0 +1,354 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
+#define EIGEN_TRIANGULARMATRIXVECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder, int Version=Specialized>
+struct triangular_matrix_vector_product;
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ enum {
+ IsLower = ((Mode&Lower)==Lower),
+ HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+ HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+ };
+ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+ ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+ {
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ Index size = (std::min)(_rows,_cols);
+ Index rows = IsLower ? _rows : (std::min)(_rows,_cols);
+ Index cols = IsLower ? (std::min)(_rows,_cols) : _cols;
+
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+ typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+ typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
+ const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
+ typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+ typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
+ ResMap res(_res,rows);
+
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
+ for (Index pi=0; pi<size; pi+=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = pi + k;
+ Index s = IsLower ? ((HasUnitDiag||HasZeroDiag) ? i+1 : i ) : pi;
+ Index r = IsLower ? actualPanelWidth-k : k+1;
+ if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+ res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r);
+ if (HasUnitDiag)
+ res.coeffRef(i) += alpha * cjRhs.coeff(i);
+ }
+ Index r = IsLower ? rows - pi - actualPanelWidth : pi;
+ if (r>0)
+ {
+ Index s = IsLower ? pi+actualPanelWidth : 0;
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
+ r, actualPanelWidth,
+ LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
+ RhsMapper(&rhs.coeffRef(pi), rhsIncr),
+ &res.coeffRef(s), resIncr, alpha);
+ }
+ }
+ if((!IsLower) && cols>size)
+ {
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
+ rows, cols-size,
+ LhsMapper(&lhs.coeffRef(0,size), lhsStride),
+ RhsMapper(&rhs.coeffRef(size), rhsIncr),
+ _res, resIncr, alpha);
+ }
+ }
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ enum {
+ IsLower = ((Mode&Lower)==Lower),
+ HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+ HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+ };
+ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+ ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+ {
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ Index diagSize = (std::min)(_rows,_cols);
+ Index rows = IsLower ? _rows : diagSize;
+ Index cols = IsLower ? diagSize : _cols;
+
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+ typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+ typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
+ const RhsMap rhs(_rhs,cols);
+ typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+ typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
+ ResMap res(_res,rows,InnerStride<>(resIncr));
+
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
+ for (Index pi=0; pi<diagSize; pi+=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = pi + k;
+ Index s = IsLower ? pi : ((HasUnitDiag||HasZeroDiag) ? i+1 : i);
+ Index r = IsLower ? k+1 : actualPanelWidth-k;
+ if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+ res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum();
+ if (HasUnitDiag)
+ res.coeffRef(i) += alpha * cjRhs.coeff(i);
+ }
+ Index r = IsLower ? pi : cols - pi - actualPanelWidth;
+ if (r>0)
+ {
+ Index s = IsLower ? 0 : pi + actualPanelWidth;
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
+ actualPanelWidth, r,
+ LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
+ RhsMapper(&rhs.coeffRef(s), rhsIncr),
+ &res.coeffRef(pi), resIncr, alpha);
+ }
+ }
+ if(IsLower && rows>diagSize)
+ {
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
+ rows-diagSize, cols,
+ LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
+ RhsMapper(&rhs.coeffRef(0), rhsIncr),
+ &res.coeffRef(diagSize), resIncr, alpha);
+ }
+ }
+
+/***************************************************************************
+* Wrapper to product_triangular_vector
+***************************************************************************/
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> >
+ : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> >
+{};
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> >
+ : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> >
+{};
+
+
+template<int StorageOrder>
+struct trmv_selector;
+
+} // end namespace internal
+
+template<int Mode, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,true,Lhs,false,Rhs,true>
+ : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+ TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+ internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha);
+ }
+};
+
+template<int Mode, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,false,Lhs,true,Rhs,false>
+ : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+ TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+ typedef TriangularProduct<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose;
+ Transpose<Dest> dstT(dst);
+ internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run(
+ TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha);
+ }
+};
+
+namespace internal {
+
+// TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
+
+template<> struct trmv_selector<ColMajor>
+{
+ template<int Mode, typename Lhs, typename Rhs, typename Dest>
+ static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+ {
+ typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
+ typedef typename ProductType::Index Index;
+ typedef typename ProductType::LhsScalar LhsScalar;
+ typedef typename ProductType::RhsScalar RhsScalar;
+ typedef typename ProductType::Scalar ResScalar;
+ typedef typename ProductType::RealScalar RealScalar;
+ typedef typename ProductType::ActualLhsType ActualLhsType;
+ typedef typename ProductType::ActualRhsType ActualRhsType;
+ typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+ typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+ typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+ typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+ * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+ enum {
+      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1;
+      // on the other hand, it is good for the cache to pack the vector anyway...
+ EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+ ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
+ MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+ };
+
+ gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+ bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+ bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+ RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
+ ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+ evalToDest ? dest.data() : static_dest.data());
+
+ if(!evalToDest)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ Index size = dest.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ if(!alphaIsCompatible)
+ {
+ MappedDest(actualDestPtr, dest.size()).setZero();
+ compatibleAlpha = RhsScalar(1);
+ }
+ else
+ MappedDest(actualDestPtr, dest.size()) = dest;
+ }
+
+ internal::triangular_matrix_vector_product
+ <Index,Mode,
+ LhsScalar, LhsBlasTraits::NeedToConjugate,
+ RhsScalar, RhsBlasTraits::NeedToConjugate,
+ ColMajor>
+ ::run(actualLhs.rows(),actualLhs.cols(),
+ actualLhs.data(),actualLhs.outerStride(),
+ actualRhs.data(),actualRhs.innerStride(),
+ actualDestPtr,1,compatibleAlpha);
+
+ if (!evalToDest)
+ {
+ if(!alphaIsCompatible)
+ dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
+ else
+ dest = MappedDest(actualDestPtr, dest.size());
+ }
+ }
+};
+
+template<> struct trmv_selector<RowMajor>
+{
+ template<int Mode, typename Lhs, typename Rhs, typename Dest>
+ static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+ {
+ typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
+ typedef typename ProductType::LhsScalar LhsScalar;
+ typedef typename ProductType::RhsScalar RhsScalar;
+ typedef typename ProductType::Scalar ResScalar;
+ typedef typename ProductType::Index Index;
+ typedef typename ProductType::ActualLhsType ActualLhsType;
+ typedef typename ProductType::ActualRhsType ActualRhsType;
+ typedef typename ProductType::_ActualRhsType _ActualRhsType;
+ typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+ typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+
+ typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+ typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+ * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+ enum {
+ DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+ };
+
+ gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+
+ ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
+ DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
+
+ if(!DirectlyUseRhs)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ int size = actualRhs.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+ }
+
+ internal::triangular_matrix_vector_product
+ <Index,Mode,
+ LhsScalar, LhsBlasTraits::NeedToConjugate,
+ RhsScalar, RhsBlasTraits::NeedToConjugate,
+ RowMajor>
+ ::run(actualLhs.rows(),actualLhs.cols(),
+ actualLhs.data(),actualLhs.outerStride(),
+ actualRhsPtr,1,
+ dest.data(),dest.innerStride(),
+ actualAlpha);
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULARMATRIXVECTOR_H
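A hedged usage sketch (editorial, not part of the patch): both wrapper specializations above are reached through expressions like the following; the right-triangular case is rewritten via the transpose trick in its scaleAndAddTo.

#include <Eigen/Dense>
using namespace Eigen;

void triangular_matvec_example()
{
  MatrixXd A = MatrixXd::Random(128, 128);
  VectorXd x = VectorXd::Random(128);
  VectorXd y = VectorXd::Zero(128);
  // Triangular matrix * vector: routed through trmv_selector into
  // internal::triangular_matrix_vector_product.
  y.noalias() += A.triangularView<Upper>() * x;
  // Row vector * triangular matrix: handled by the second specialization,
  // which transposes the product and reuses the column-major path.
  RowVectorXd z = x.transpose() * A.triangularView<Upper>();
}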
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
new file mode 100644
index 0000000000..09f110da71
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
@@ -0,0 +1,247 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Triangular matrix-vector product functionality based on ?TRMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements triangular matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// trmv/hemv specialization
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder>
+struct triangular_matrix_vector_product_trmv :
+ triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
+
+#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+ triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ } \
+}; \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+ triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ } \
+};
+
+EIGEN_MKL_TRMV_SPECIALIZE(double)
+EIGEN_MKL_TRMV_SPECIALIZE(float)
+EIGEN_MKL_TRMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_TRMV_SPECIALIZE(scomplex)
+
+// implements col-major: res += alpha * op(triangular) * vector
+#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper \
+ }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+ if (ConjLhs || IsZeroDiag) { \
+ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ return; \
+ }\
+ Index size = (std::min)(_rows,_cols); \
+ Index rows = IsLower ? _rows : size; \
+ Index cols = IsLower ? size : _cols; \
+\
+ typedef VectorX##EIGPREFIX VectorRhs; \
+ EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+ VectorRhs x_tmp; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+ char trans, uplo, diag; \
+ MKL_INT m, n, lda, incx, incy; \
+ EIGTYPE const *a; \
+ MKLTYPE alpha_, beta_; \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+\
+/* Set m, n */ \
+ n = (MKL_INT)size; \
+ lda = lhsStride; \
+ incx = 1; \
+ incy = resIncr; \
+\
+/* Set uplo, trans and diag*/ \
+ trans = 'N'; \
+ uplo = IsLower ? 'L' : 'U'; \
+ diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+ MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+ MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit MKL ?TRMV. Fall back to the default triangular product */ \
+ if (size<(std::max)(rows,cols)) { \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+ if (size<rows) { \
+ y = _res + size*resIncr; \
+ a = _lhs + size; \
+ m = rows-size; \
+ n = size; \
+ } \
+ else { \
+ x += size; \
+ y = _res; \
+ a = _lhs + size*lda; \
+ m = size; \
+ n = cols-size; \
+ } \
+ MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+ } \
+ } \
+};
+
+EIGEN_MKL_TRMV_CM(double, double, d, d)
+EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMV_CM(float, float, f, s)
+EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c)
+
+// implements row-major: res += alpha * op(triangular) * vector
+#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper \
+ }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+ if (IsZeroDiag) { \
+ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ return; \
+ }\
+ Index size = (std::min)(_rows,_cols); \
+ Index rows = IsLower ? _rows : size; \
+ Index cols = IsLower ? size : _cols; \
+\
+ typedef VectorX##EIGPREFIX VectorRhs; \
+ EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+ VectorRhs x_tmp; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+ char trans, uplo, diag; \
+ MKL_INT m, n, lda, incx, incy; \
+ EIGTYPE const *a; \
+ MKLTYPE alpha_, beta_; \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+\
+/* Set m, n */ \
+ n = (MKL_INT)size; \
+ lda = lhsStride; \
+ incx = 1; \
+ incy = resIncr; \
+\
+/* Set uplo, trans and diag*/ \
+ trans = ConjLhs ? 'C' : 'T'; \
+ uplo = IsLower ? 'U' : 'L'; \
+ diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+ MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+ MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit MKL ?TRMV. Fall back to the default triangular product */ \
+ if (size<(std::max)(rows,cols)) { \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+ if (size<rows) { \
+ y = _res + size*resIncr; \
+ a = _lhs + size*lda; \
+ m = rows-size; \
+ n = size; \
+ } \
+ else { \
+ x += size; \
+ y = _res; \
+ a = _lhs + size; \
+ m = size; \
+ n = cols-size; \
+ } \
+ MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+ } \
+ } \
+};
+
+EIGEN_MKL_TRMV_RM(double, double, d, d)
+EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMV_RM(float, float, f, s)
+EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
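A hedged reference (editorial) for the non-square handling in the column-major binding above, lower case: a tall trapezoidal tril(A) is applied as a square ?TRMV on the top block plus a ?GEMV on the remaining rows; the helper name is hypothetical.

#include <Eigen/Dense>
#include <algorithm>

// y = tril(A) * x for a tall trapezoidal A (rows >= cols).
Eigen::VectorXd trapezoidal_lower_trmv(const Eigen::MatrixXd& A,
                                       const Eigen::VectorXd& x)
{
  const auto size = std::min(A.rows(), A.cols());
  Eigen::VectorXd y = Eigen::VectorXd::Zero(A.rows());
  // square triangular part -> handled by ?trmv in the macro above
  y.head(size) = A.topLeftCorner(size, size).triangularView<Eigen::Lower>()
               * x.head(size);
  // remaining rectangular rows -> handled by ?gemv in the macro above
  if (A.rows() > size)
    y.tail(A.rows() - size).noalias() =
        A.bottomLeftCorner(A.rows() - size, size) * x.head(size);
  return y;
}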
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h
new file mode 100644
index 0000000000..f5de67c59f
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -0,0 +1,331 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// if the rhs is row major, let's transpose the product
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
+{
+ static void run(
+ Index size, Index cols,
+ const Scalar* tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ triangular_solve_matrix<
+ Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
+ (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
+ NumTraits<Scalar>::IsComplex && Conjugate,
+ TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
+ ::run(size, cols, tri, triStride, _other, otherStride, blocking);
+ }
+};
+
+/* Optimized triangular solver with multiple right-hand sides and the triangular matrix on the left
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
+{
+ static EIGEN_DONT_INLINE void run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ Index cols = otherSize;
+
+ typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+ typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
+ TriMapper tri(_tri, triStride);
+ OtherMapper other(_other, otherStride);
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ enum {
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower
+ };
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(size,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ conj_if<Conjugate> conj;
+ gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
+
+    // the goal here is to subdivide the Rhs panels such that we keep some cache
+ // coherence when accessing the rhs elements
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
+ subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
+
+ for(Index k2=IsLower ? 0 : size;
+ IsLower ? k2<size : k2>0;
+ IsLower ? k2+=kc : k2-=kc)
+ {
+ const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc);
+
+ // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
+ // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
+      // A11 (the triangular part) and A21 (the remaining rectangular part).
+ // Then the high level algorithm is:
+ // - B = R1 => general block copy (done during the next step)
+ // - R1 = A11^-1 B => tricky part
+ // - update B from the new R1 => actually this has to be performed continuously during the above step
+ // - R2 -= A21 * B => GEPP
+
+ // The tricky part: compute R1 = A11^-1 B while updating B from R1
+ // The idea is to split A11 into multiple small vertical panels.
+ // Each panel can be split into a small triangular part T1k which is processed without optimization,
+      // and the remaining small part T2k which is processed using gebp with appropriate block strides.
+      // (a hedged, unoptimized reference of this blocked substitution follows this kernel)
+ for(Index j2=0; j2<cols; j2+=subcols)
+ {
+ Index actual_cols = (std::min)(cols-j2,subcols);
+      // for each small vertical panel [T1k^T, T2k^T]^T of lhs
+ for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+ // tr solve
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ // TODO write a small kernel handling this (can be shared with trsv)
+ Index i = IsLower ? k2+k1+k : k2-k1-k-1;
+ Index s = IsLower ? k2+k1 : i+1;
+ Index rs = actualPanelWidth - k - 1; // remaining size
+
+ Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
+ for (Index j=j2; j<j2+actual_cols; ++j)
+ {
+ if (TriStorageOrder==RowMajor)
+ {
+ Scalar b(0);
+ const Scalar* l = &tri(i,s);
+ Scalar* r = &other(s,j);
+ for (Index i3=0; i3<k; ++i3)
+ b += conj(l[i3]) * r[i3];
+
+ other(i,j) = (other(i,j) - b)*a;
+ }
+ else
+ {
+ Index s = IsLower ? i+1 : i-rs;
+ Scalar b = (other(i,j) *= a);
+ Scalar* r = &other(s,j);
+ const Scalar* l = &tri(s,i);
+ for (Index i3=0;i3<rs;++i3)
+ r[i3] -= b * conj(l[i3]);
+ }
+ }
+ }
+
+ Index lengthTarget = actual_kc-k1-actualPanelWidth;
+ Index startBlock = IsLower ? k2+k1 : k2-k1-actualPanelWidth;
+ Index blockBOffset = IsLower ? k1 : lengthTarget;
+
+ // update the respective rows of B from other
+ pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+
+ // GEBP
+ if (lengthTarget>0)
+ {
+ Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
+
+ pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
+
+ gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+ }
+ }
+ }
+
+ // R2 -= A21 * B => GEPP
+ {
+ Index start = IsLower ? k2+kc : 0;
+ Index end = IsLower ? size : k2-kc;
+ for(Index i2=start; i2<end; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,end-i2);
+ if (actual_mc>0)
+ {
+ pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
+
+ gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+ }
+
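// Editor's sketch (not part of the patch): an unoptimized reference of the
// blocked scheme described in the comments above, lower-triangular on the
// left with a non-unit diagonal; the block size kc is arbitrary.
#include <Eigen/Dense>
#include <algorithm>

// In-place solve of tril(L) * X = B: "R1 = A11^-1 B" on the diagonal block,
// then "R2 -= A21 * B" on the remaining rows.
void solve_lower_blocked_reference(const Eigen::MatrixXd& L, Eigen::MatrixXd& B, int kc)
{
  const int n = static_cast<int>(L.rows());
  for (int k2 = 0; k2 < n; k2 += kc) {
    const int w = std::min(kc, n - k2);
    // the tricky part: small triangular solve on the diagonal block
    L.block(k2, k2, w, w).triangularView<Eigen::Lower>()
        .solveInPlace(B.middleRows(k2, w));
    // the GEPP update of the rows below
    if (k2 + w < n)
      B.bottomRows(n - k2 - w).noalias() -=
          L.block(k2 + w, k2, n - k2 - w, w) * B.middleRows(k2, w);
  }
}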
+/* Optimized triangular solver with multiple left-hand sides and the triangular matrix on the right
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
+{
+ static EIGEN_DONT_INLINE void run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ Index rows = otherSize;
+
+ typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+ LhsMapper lhs(_other, otherStride);
+ RhsMapper rhs(_tri, triStride);
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ RhsStorageOrder = TriStorageOrder,
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower
+ };
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*size;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ conj_if<Conjugate> conj;
+ gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+
+ for(Index k2=IsLower ? size : 0;
+ IsLower ? k2>0 : k2<size;
+ IsLower ? k2-=kc : k2+=kc)
+ {
+ const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc);
+ Index actual_k2 = IsLower ? k2-actual_kc : k2 ;
+
+ Index startPanel = IsLower ? 0 : k2+actual_kc;
+ Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
+ Scalar* geb = blockB+actual_kc*actual_kc;
+
+ if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
+
+ // triangular packing (we only pack the panels off the diagonal,
+      // neglecting the blocks overlapping the diagonal)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index actual_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+
+ if (panelLength>0)
+ pack_rhs_panel(blockB+j2*actual_kc,
+ rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
+ panelLength, actualPanelWidth,
+ actual_kc, panelOffset);
+ }
+ }
+
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,rows-i2);
+
+ // triangular solver kernel
+ {
+ // for each small block of the diagonal (=> vertical panels of rhs)
+ for (Index j2 = IsLower
+ ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth)
+ : Index(SmallPanelWidth)))
+ : 0;
+ IsLower ? j2>=0 : j2<actual_kc;
+ IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index absolute_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
+
+ // GEBP
+ if(panelLength>0)
+ {
+ gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
+ blockA, blockB+j2*actual_kc,
+ actual_mc, panelLength, actualPanelWidth,
+ Scalar(-1),
+ actual_kc, actual_kc, // strides
+ panelOffset, panelOffset); // offsets
+ }
+
+ // unblocked triangular solve
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
+
+ Scalar* r = &lhs(i2,j);
+ for (Index k3=0; k3<k; ++k3)
+ {
+ Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
+ Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
+ for (Index i=0; i<actual_mc; ++i)
+ r[i] -= a[i] * b;
+ }
+ Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j));
+ for (Index i=0; i<actual_mc; ++i)
+ r[i] *= b;
+ }
+
+ // pack the just computed part of lhs to A
+ pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
+ actualPanelWidth, actual_mc,
+ actual_kc, j2);
+ }
+ }
+
+ if (rs>0)
+ gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
+ actual_mc, actual_kc, rs, Scalar(-1),
+ -1, -1, 0, 0);
+ }
+ }
+ }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
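These specializations are not called directly; they sit behind Eigen's triangular-view API. A small usage sketch (sizes are arbitrary) showing the kind of expression that, for dynamically sized operands, ends up in the triangular_solve_matrix code above:

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 8);

  // Solve L * X = B in place, where L is the lower-triangular part of A.
  // With a matrix right-hand side this is the OnTheLeft, ColMajor path.
  A.triangularView<Eigen::Lower>().solveInPlace(B);

  return 0;
}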
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
new file mode 100644
index 0000000000..6a0bb83393
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
@@ -0,0 +1,155 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ *   Triangular matrix-matrix solve functionality based on ?TRSM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+// implements LeftSide op(triangular)^-1 * general
+#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+ }; \
+ static void run( \
+ Index size, Index otherSize, \
+ const EIGTYPE* _tri, Index triStride, \
+ EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+ { \
+ MKL_INT m = size, n = otherSize, lda, ldb; \
+ char side = 'L', uplo, diag='N', transa; \
+    /* Set alpha */ \
+ MKLTYPE alpha; \
+ EIGTYPE myone(1); \
+ assign_scalar_eig2mkl(alpha, myone); \
+ ldb = otherStride;\
+\
+ const EIGTYPE *a; \
+/* Set trans */ \
+ transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+ Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+ MatrixTri a_tmp; \
+\
+ if (conjA) { \
+ a_tmp = tri.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else { \
+ a = _tri; \
+ lda = triStride; \
+ } \
+ if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+ MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+ } \
+};
+
+EIGEN_MKL_TRSM_L(double, double, d)
+EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_TRSM_L(float, float, s)
+EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c)
+
+
+// implements RightSide general * op(triangular)^-1
+#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+ }; \
+ static void run( \
+ Index size, Index otherSize, \
+ const EIGTYPE* _tri, Index triStride, \
+ EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+ { \
+ MKL_INT m = otherSize, n = size, lda, ldb; \
+ char side = 'R', uplo, diag='N', transa; \
+    /* Set alpha */ \
+ MKLTYPE alpha; \
+ EIGTYPE myone(1); \
+ assign_scalar_eig2mkl(alpha, myone); \
+ ldb = otherStride;\
+\
+ const EIGTYPE *a; \
+/* Set trans */ \
+ transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+ Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+ MatrixTri a_tmp; \
+\
+ if (conjA) { \
+ a_tmp = tri.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else { \
+ a = _tri; \
+ lda = triStride; \
+ } \
+ if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+ MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+ } \
+};
+
+EIGEN_MKL_TRSM_R(double, double, d)
+EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_TRSM_R(float, float, s)
+EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
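The two macros above simply marshal Eigen's template parameters into a single ?trsm call through MKL's Fortran-style interface. For reference, the equivalent call through the standard CBLAS interface for the left-side, lower-triangular, non-unit-diagonal, column-major case looks roughly like this (a sketch; the header name assumes an MKL build, and the helper function is illustrative):

#include <mkl_cblas.h>   // with a generic BLAS, <cblas.h> provides the same prototype

// Overwrites B (n x nrhs, leading dimension ldb) with L^-1 * B, mirroring
// EIGEN_MKL_TRSM_L with side='L', uplo='L', transa='N', diag='N' and alpha=1.
void solve_with_trsm(int n, int nrhs, const double* L, int ldl, double* B, int ldb)
{
  cblas_dtrsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit,
              n, nrhs, 1.0, L, ldl, B, ldb);
}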
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularSolverVector.h b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverVector.h
new file mode 100644
index 0000000000..b994759b26
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverVector.h
@@ -0,0 +1,145 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder>
+{
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft,
+ ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
+ Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor
+ >::run(size, _lhs, lhsStride, rhs);
+ }
+};
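The forwarding above relies on the identity x * T = b  <=>  T^T * x^T = b^T: transposing the triangular matrix swaps Upper and Lower as well as RowMajor and ColMajor, which is exactly the remapping of the template parameters (the same data pointer is reused). A quick check of the identity at the API level (a sketch with arbitrary sizes):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd M = Eigen::MatrixXd::Random(5, 5);
  Eigen::MatrixXd U = M.triangularView<Eigen::Upper>();   // dense copy of the upper-triangular part
  Eigen::RowVectorXd b = Eigen::RowVectorXd::Random(5);

  Eigen::RowVectorXd x1 = b * U.inverse();                 // right-side solve: x * U = b

  Eigen::VectorXd y = U.transpose().triangularView<Eigen::Lower>().solve(b.transpose());
  Eigen::RowVectorXd x2 = y.transpose();                   // same solve, recast as a left, lower one

  std::cout << (x1 - x2).norm() << "\n";                   // ~0 up to rounding error
  return 0;
}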
+
+// forward and backward substitution, row-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
+{
+ enum {
+ IsLower = ((Mode&Lower)==Lower)
+ };
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+
+ typename internal::conditional<
+ Conjugate,
+ const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+ const LhsMap&>
+ ::type cjLhs(lhs);
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ for(Index pi=IsLower ? 0 : size;
+ IsLower ? pi<size : pi>0;
+ IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+
+ Index r = IsLower ? pi : size - pi; // remaining size
+ if (r > 0)
+ {
+        // let's directly call the low-level product function because:
+        // 1 - it is faster to compile
+        // 2 - it is slightly faster at runtime
+ Index startRow = IsLower ? pi : pi-actualPanelWidth;
+ Index startCol = IsLower ? 0 : pi;
+
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
+ actualPanelWidth, r,
+ LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
+ RhsMapper(rhs + startCol, 1),
+ rhs + startRow, 1,
+ RhsScalar(-1));
+ }
+
+ for(Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = IsLower ? pi+k : pi-k-1;
+ Index s = IsLower ? pi : i+1;
+ if (k>0)
+ rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
+
+ if(!(Mode & UnitDiag))
+ rhs[i] /= cjLhs(i,i);
+ }
+ }
+ }
+};
+
+// forward and backward substitution, column-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor>
+{
+ enum {
+ IsLower = ((Mode&Lower)==Lower)
+ };
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+ typename internal::conditional<Conjugate,
+ const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+ const LhsMap&
+ >::type cjLhs(lhs);
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+
+ for(Index pi=IsLower ? 0 : size;
+ IsLower ? pi<size : pi>0;
+ IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+ Index startBlock = IsLower ? pi : pi-actualPanelWidth;
+ Index endBlock = IsLower ? pi + actualPanelWidth : 0;
+
+ for(Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = IsLower ? pi+k : pi-k-1;
+ if(!(Mode & UnitDiag))
+ rhs[i] /= cjLhs.coeff(i,i);
+
+ Index r = actualPanelWidth - k - 1; // remaining size
+ Index s = IsLower ? i+1 : i-r;
+ if (r>0)
+ Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+ }
+ Index r = IsLower ? size - endBlock : startBlock; // remaining size
+ if (r > 0)
+ {
+        // let's directly call the low-level product function because:
+        // 1 - it is faster to compile
+        // 2 - it is slightly faster at runtime
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
+ r, actualPanelWidth,
+ LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
+ RhsMapper(rhs+startBlock, 1),
+ rhs+endBlock, 1, RhsScalar(-1));
+ }
+ }
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H
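As with the matrix variant, these kernels are reached through the triangular-view API whenever the right-hand side is a single vector. A minimal usage sketch (sizes arbitrary):

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(32, 32);
  Eigen::VectorXd b = Eigen::VectorXd::Random(32);

  // Solve U * x = b in place, where U is the upper-triangular part of A;
  // a vector right-hand side selects the triangular_solve_vector path.
  A.triangularView<Eigen::Upper>().solveInPlace(b);

  return 0;
}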