Diffstat (limited to 'third_party/eigen3/Eigen/src/Core/products')
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h                  454
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h           2197
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h                465
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h      285
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h  146
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h            118
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h                618
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h            131
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/Parallelizer.h                       158
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h            523
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h        295
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h            281
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h        114
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointProduct.h                 123
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h              93
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h             434
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h         309
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h             354
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h         247
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h             331
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h         155
-rw-r--r--  third_party/eigen3/Eigen/src/Core/products/TriangularSolverVector.h             145
22 files changed, 7976 insertions, 0 deletions
diff --git a/third_party/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h b/third_party/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h
new file mode 100644
index 0000000000..35a6e36e81
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/CoeffBasedProduct.h
@@ -0,0 +1,454 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COEFFBASED_PRODUCT_H
+#define EIGEN_COEFFBASED_PRODUCT_H
+
+namespace Eigen {
+
+namespace internal {
+
+/*********************************************************************************
+* Coefficient based product implementation.
+* It is designed for the following use cases:
+* - small fixed sizes
+* - lazy products
+*********************************************************************************/
+
+/* Since all the dimensions of the product are small, here we can rely
+ * on the generic Assign mechanism to evaluate the product per coeff (or packet).
+ *
+ * Note that here the inner-loops should always be unrolled.
+ */
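A minimal illustrative sketch of what this coefficient-wise evaluation amounts to at the user level (the matrices A, B, C are hypothetical; the comment shows the fully unrolled 3x3 expansion):

  #include <Eigen/Core>

  void lazy_product_sketch()
  {
    Eigen::Matrix3f A = Eigen::Matrix3f::Random();
    Eigen::Matrix3f B = Eigen::Matrix3f::Random();

    // Small fixed-size lazy product: no temporary and no call to the blocked
    // GEMM kernels; each coefficient of C is evaluated by the Assign mechanism.
    Eigen::Matrix3f C = A.lazyProduct(B);

    // Conceptually, for every (i,j) the assignment computes
    //   C(i,j) = A(i,0)*B(0,j) + A(i,1)*B(1,j) + A(i,2)*B(2,j);
    // with the inner sum unrolled at compile time.
    (void)C;
  }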
+
+template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl;
+
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl;
+
+template<typename LhsNested, typename RhsNested, int NestingFlags>
+struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
+{
+ typedef MatrixXpr XprKind;
+ typedef typename remove_all<LhsNested>::type _LhsNested;
+ typedef typename remove_all<RhsNested>::type _RhsNested;
+ typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
+ typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
+ typename traits<_RhsNested>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<_LhsNested>::Index,
+ typename traits<_RhsNested>::Index>::type Index;
+
+ enum {
+ LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+ RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+ LhsFlags = _LhsNested::Flags,
+ RhsFlags = _RhsNested::Flags,
+
+ RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+ ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+ InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+ MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+ MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+ LhsRowMajor = LhsFlags & RowMajorBit,
+ RhsRowMajor = RhsFlags & RowMajorBit,
+
+ SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
+
+ CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
+ && (ColsAtCompileTime == Dynamic
+ || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
+ && (RhsFlags&AlignedBit)
+ )
+ ),
+
+ CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
+ && (RowsAtCompileTime == Dynamic
+ || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
+ && (LhsFlags&AlignedBit)
+ )
+ ),
+
+ EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+ : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+ : (RhsRowMajor && !CanVectorizeLhs),
+
+ Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
+ | (EvalToRowMajor ? RowMajorBit : 0)
+ | NestingFlags
+ | (CanVectorizeLhs ? (LhsFlags & AlignedBit) : 0)
+ | (CanVectorizeRhs ? (RhsFlags & AlignedBit) : 0)
+ // TODO enable vectorization for mixed types
+ | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
+
+ CoeffReadCost = InnerSize == Dynamic ? Dynamic
+ : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+ + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+
+ /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+ * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+ * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, which doesn't affect the ABI.
+ */
+ CanVectorizeInner = SameType
+ && LhsRowMajor
+ && (!RhsRowMajor)
+ && (LhsFlags & RhsFlags & ActualPacketAccessBit)
+ && (LhsFlags & RhsFlags & AlignedBit)
+ && (InnerSize % packet_traits<Scalar>::size == 0)
+ };
+};
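As a worked example of the CoeffReadCost formula above: for a hypothetical 3x3 float product, InnerSize is 3, each operand coefficient read costs 1, and NumTraits<float>::MulCost and AddCost are both 1, so CoeffReadCost = 3 * (1 + 1 + 1) + (3 - 1) * 1 = 11. That is well below the default EIGEN_UNROLLING_LIMIT of 100, which is what lets the CoeffBasedProduct class below enable full unrolling (its Unroll enum) for such small products.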
+
+} // end namespace internal
+
+template<typename LhsNested, typename RhsNested, int NestingFlags>
+class CoeffBasedProduct
+ : internal::no_assignment_operator,
+ public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
+{
+ public:
+
+ typedef MatrixBase<CoeffBasedProduct> Base;
+ EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
+ typedef typename Base::PlainObject PlainObject;
+
+ private:
+
+ typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
+ typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;
+
+ enum {
+ PacketSize = internal::packet_traits<Scalar>::size,
+ InnerSize = internal::traits<CoeffBasedProduct>::InnerSize,
+ Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+ CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
+ };
+
+ typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
+ Unroll ? InnerSize-1 : Dynamic,
+ _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
+
+ typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
+
+ public:
+
+ EIGEN_DEVICE_FUNC
+ inline CoeffBasedProduct(const CoeffBasedProduct& other)
+ : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
+ {}
+
+ template<typename Lhs, typename Rhs>
+ EIGEN_DEVICE_FUNC
+ inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
+ : m_lhs(lhs), m_rhs(rhs)
+ {
+ // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
+      // We still allow mixing T and complex<T>.
+ EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined),
+ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ eigen_assert(lhs.cols() == rhs.rows()
+ && "invalid matrix product"
+ && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+ {
+ Scalar res;
+ ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+ return res;
+ }
+
+ /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
+ * which is why we don't set the LinearAccessBit.
+ */
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+ {
+ Scalar res;
+ const Index row = RowsAtCompileTime == 1 ? 0 : index;
+ const Index col = RowsAtCompileTime == 1 ? index : 0;
+ ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+ return res;
+ }
+
+ template<int LoadMode>
+ EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
+ {
+ PacketScalar res;
+ internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
+ Unroll ? InnerSize-1 : Dynamic,
+ _LhsNested, _RhsNested, PacketScalar, LoadMode>
+ ::run(row, col, m_lhs, m_rhs, res);
+ return res;
+ }
+
+ // Implicit conversion to the nested type (trigger the evaluation of the product)
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE operator const PlainObject& () const
+ {
+ m_result.lazyAssign(*this);
+ return m_result;
+ }
+
+ EIGEN_DEVICE_FUNC const _LhsNested& lhs() const { return m_lhs; }
+ EIGEN_DEVICE_FUNC const _RhsNested& rhs() const { return m_rhs; }
+
+ EIGEN_DEVICE_FUNC
+ const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
+ { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
+
+ template<int DiagonalIndex>
+ EIGEN_DEVICE_FUNC
+ const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
+ { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
+
+ EIGEN_DEVICE_FUNC
+ const Diagonal<const LazyCoeffBasedProductType, DynamicIndex> diagonal(Index index) const {
+ return Diagonal<const LazyCoeffBasedProductType, DynamicIndex>(
+ reinterpret_cast<const LazyCoeffBasedProductType&>(*this), index);
+ }
+
+ protected:
+ typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
+ typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
+
+ mutable PlainObject m_result;
+};
+
+namespace internal {
+
+// here we need to overload the nested rule for products
+// such that the nested type is a const reference to a plain matrix
+template<typename Lhs, typename Rhs, int N, typename PlainObject>
+struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
+{
+ typedef PlainObject const& type;
+};
+
+/***************************************************************************
+* Normal product .coeff() implementation (with meta-unrolling)
+***************************************************************************/
+
+/**************************************
+*** Scalar path - no vectorization ***
+**************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::Index Index;
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+ {
+ product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
+ res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::Index Index;
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+ {
+ res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::Index Index;
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
+ {
+ eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
+ res = lhs.coeff(row, 0) * rhs.coeff(0, col);
+ for(Index i = 1; i < lhs.cols(); ++i)
+ res += lhs.coeff(row, i) * rhs.coeff(i, col);
+ }
+};
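For reference, a hand-expanded sketch of what the recursion above generates when the unrolled path is taken with InnerSize == 3 (so the outermost UnrollingIndex is 2); the function name and int indices are hypothetical:

  template<typename Lhs, typename Rhs, typename Scalar>
  inline void unrolled_coeff_sketch(int row, int col,
                                    const Lhs& lhs, const Rhs& rhs, Scalar& res)
  {
    res  = lhs.coeff(row, 0) * rhs.coeff(0, col);  // base case (UnrollingIndex == 0)
    res += lhs.coeff(row, 1) * rhs.coeff(1, col);  // UnrollingIndex == 1
    res += lhs.coeff(row, 2) * rhs.coeff(2, col);  // UnrollingIndex == 2 (top-level call)
  }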
+
+/*******************************************
+*** Scalar path with inner vectorization ***
+*******************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
+struct product_coeff_vectorized_unroller
+{
+ typedef typename Lhs::Index Index;
+ enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
+ {
+ product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
+ pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet>
+struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
+ {
+ pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
+ }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::PacketScalar Packet;
+ typedef typename Lhs::Index Index;
+ enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
+ {
+ Packet pres;
+ product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
+ res = predux(pres);
+ }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
+struct product_coeff_vectorized_dyn_selector
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
+ }
+};
+
+// NOTE the following 3 specializations exist because taking .col(0) on a vector is a bit slower
+// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
+template<typename Lhs, typename Rhs, int RhsCols>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
+ }
+};
+
+template<typename Lhs, typename Rhs, int LhsRows>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
+ }
+};
+
+template<typename Lhs, typename Rhs>
+struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ res = lhs.transpose().cwiseProduct(rhs).sum();
+ }
+};
+
+template<typename Lhs, typename Rhs, typename RetScalar>
+struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
+ {
+ product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
+ }
+};
+
+/*******************
+*** Packet path ***
+*******************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+ {
+ product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
+ res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
+ }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+ {
+ product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
+ res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+ {
+ res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
+ {
+ res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
+ {
+ eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
+ res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
+ for(Index i = 1; i < lhs.cols(); ++i)
+ res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+ typedef typename Lhs::Index Index;
+ static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
+ {
+ eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
+ res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+ for(Index i = 1; i < lhs.cols(); ++i)
+ res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COEFFBASED_PRODUCT_H
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/third_party/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
new file mode 100644
index 0000000000..80bd6aa0e6
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -0,0 +1,2197 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
+#define EIGEN_GENERAL_BLOCK_PANEL_H
+
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+class gebp_traits;
+
+
+/** \internal \returns b if a<=0, and returns a otherwise. */
+inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
+{
+ return a<=0 ? b : a;
+}
+
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+#else
+const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+#endif
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
+{
+ static bool m_cache_sizes_initialized = false;
+ static std::ptrdiff_t m_l1CacheSize = 0;
+ static std::ptrdiff_t m_l2CacheSize = 0;
+ static std::ptrdiff_t m_l3CacheSize = 0;
+
+ if(EIGEN_UNLIKELY(!m_cache_sizes_initialized))
+ {
+ int l1CacheSize, l2CacheSize, l3CacheSize;
+ queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+ m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+ m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+ m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
+ m_cache_sizes_initialized = true;
+ }
+
+ if(EIGEN_UNLIKELY(action==SetAction))
+ {
+    // set the CPU cache sizes (in bytes) that will be used to compute the block sizes
+ eigen_internal_assert(l1!=0 && l2!=0);
+ m_l1CacheSize = *l1;
+ m_l2CacheSize = *l2;
+ m_l3CacheSize = *l3;
+ }
+ else if(EIGEN_LIKELY(action==GetAction))
+ {
+ eigen_internal_assert(l1!=0 && l2!=0);
+ *l1 = m_l1CacheSize;
+ *l2 = m_l2CacheSize;
+ *l3 = m_l3CacheSize;
+ }
+ else
+ {
+ eigen_internal_assert(false);
+ }
+}
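A short usage sketch for the helper above, assuming the caller lives in (or qualifies) the Eigen::internal namespace as this file does; the byte values are only examples:

  std::ptrdiff_t l1, l2, l3;
  // Read the cache sizes currently used by the blocking heuristics.
  manage_caching_sizes(GetAction, &l1, &l2, &l3);

  // Override them, e.g. for a machine with a 48KB L1 and 1MB L2.
  std::ptrdiff_t newL1 = 48*1024, newL2 = 1024*1024, newL3 = 8*1024*1024;
  manage_caching_sizes(SetAction, &newL1, &newL2, &newL3);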
+
+#define CEIL(a, b) (((a)+(b)-1)/(b))
+
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depend on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+ // Explanations:
+  // Let's recall that the product algorithms form kc x nc horizontal panels B' on the rhs and
+ // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
+ // per kc x nr vertical small panels where nr is the blocking size along the n dimension
+  // at the register level. For vectorization purposes, these small vertical panels are unpacked,
+  // i.e., each coefficient is replicated to fit a packet. This small vertical panel has to
+ // stay in L1 cache.
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+ enum {
+ kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+ ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+ k_mask = (0xffffffff/8)*8,
+
+ mr = Traits::mr,
+ mr_mask = (0xffffffff/mr)*mr,
+
+ nr = Traits::nr,
+ nr_mask = (0xffffffff/nr)*nr
+ };
+
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+ // Increasing k gives us more time to prefetch the content of the "C"
+ // registers. However once the latency is hidden there is no point in
+ // increasing the value of k, so we'll cap it at 320 (value determined
+ // experimentally).
+ const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
+ if (k_cache < k) {
+ k = k_cache & k_mask;
+ eigen_assert(k > 0);
+ }
+
+ const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+ Index n_per_thread = CEIL(n, num_threads);
+ if (n_cache <= n_per_thread) {
+ // Don't exceed the capacity of the l2 cache.
+ if (n_cache < nr) {
+ n = nr;
+ } else {
+ n = n_cache & nr_mask;
+ eigen_assert(n > 0);
+ }
+ } else {
+ n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
+ }
+
+ if (l3 > l2) {
+ // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+ const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+ const Index m_per_thread = CEIL(m, num_threads);
+ if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+ m = m_cache & mr_mask;
+ eigen_assert(m > 0);
+ } else {
+ m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
+ }
+ }
+}
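To make the k computation above concrete, here is a worked example under stated assumptions (float lhs/rhs/result, 4-wide SSE-style packets, 16 registers, the non-FMA mr, KcFactor = 1, a 32KB L1): mr = (16/2/4)*4 = 8 and nr = 4, so kdiv = 8*4 + 4*4 = 48 bytes per unit of k, ksub = 8*4*4 = 128 bytes, and (32768 - 128)/48 = 680, which the 320 cap reduces to kc = 320 (already a multiple of 8, so the masking keeps it). With wider packets or the FMA path's mr = 3*LhsPacketSize, the numbers change accordingly.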
+
+template <typename Index>
+bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+ if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+ k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+ m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+ n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+ return true;
+ }
+#else
+ EIGEN_UNUSED_VARIABLE(k)
+ EIGEN_UNUSED_VARIABLE(m)
+ EIGEN_UNUSED_VARIABLE(n)
+#endif
+ return false;
+}
+
+/** \brief Computes the blocking parameters for an m x k times k x n matrix product
+ *
+ * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+ * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+ * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
+ *
+ * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms.
+ *
+ * The blocking size parameters may be evaluated:
+ * - either by a heuristic based on cache sizes;
+ * - or using fixed prescribed values (for testing purposes).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+ if (!k || !m || !n) {
+ return;
+ }
+
+ if (!useSpecificBlockingSizes(k, m, n)) {
+ evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
+ }
+
+#if !EIGEN_ARCH_i386_OR_x86_64
+ // The following code rounds k,m,n down to the nearest multiple of register-level blocking sizes.
+ // We should always do that, and in upstream Eigen we always do that.
+ // Unfortunately, we can't do that in Google3 on x86[-64] because this makes tiny differences in results and
+  // we have some unfortunate tests that require very specific relative errors and fail because of that,
+ // at least //learning/laser/algorithms/wals:wals_batch_solver_test.
+ // Note that this wouldn't make any difference if we had been using only correctly rounded values,
+ // but we've not! See how in evaluateProductBlockingSizesHeuristic, we do the rounding down by
+ // bit-masking, e.g. mr_mask = (0xffffffff/mr)*mr, implicitly assuming that mr is always a power of
+ // two, which is not the case with the 3px4 kernel.
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ enum {
+ kr = 8,
+ mr = Traits::mr,
+ nr = Traits::nr
+ };
+ if (k > kr) k -= k % kr;
+ if (m > mr) m -= m % mr;
+ if (n > nr) n -= n % nr;
+#endif
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Index>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads)
+{
+ computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
+}
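Usage sketch for computeProductBlockingSizes (hypothetical problem sizes; on return the same variables hold the blocking sizes instead of the problem sizes):

  std::ptrdiff_t k = 4096, m = 4096, n = 4096;   // depth, rows, cols of the product
  computeProductBlockingSizes<float, float, 1>(k, m, n);   // KcFactor = 1, num_threads defaults to 1
  // k, m, n now contain kc, mc, nc: the panel/block sizes that the packing
  // routines and the gebp kernel will use so the working set fits the caches.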
+
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+ #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
+#else
+
+ // FIXME (a bit overkill maybe ?)
+
+ template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
+ EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
+ {
+ c = cj.pmadd(a,b,c);
+ }
+ };
+
+ template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
+ EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
+ {
+ t = b; t = cj.pmul(a,t); c = padd(c,t);
+ }
+ };
+
+ template<typename CJ, typename A, typename B, typename C, typename T>
+ EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
+ {
+ gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
+ }
+
+ #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
+// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
+#endif
+
+/* Vectorization logic
+ * real*real: unpack rhs to constant packets, ...
+ *
+ * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
+ * storing each res packet into two packets (2x2),
+ * at the end combine them: swap the second and addsub them
+ * cf*cf : same but with 2x4 blocks
+ * cplx*real : unpack rhs to constant packets, ...
+ * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
+ */
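A brief sketch of the algebra behind the cd*cd scheme described above (commentary, not code from the file): writing a = a_r + i*a_i and b = b_r + i*b_i, the product is a*b = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r). The kernel keeps the lhs as an interleaved real packet (a_r, a_i, ...) and broadcasts the rhs into two real packets (b_r, b_r, ...) and (b_i, b_i, ...), accumulating c.first += a*b_r = (a_r*b_r, a_i*b_r, ...) and c.second += a*b_i = (a_r*b_i, a_i*b_i, ...). At the end (see the acc() overload for DoublePacket further down), conjugating c.second and swapping its real/imaginary lanes turns it into (-a_i*b_i, a_r*b_i, ...), so that adding it to c.first yields exactly the real and imaginary parts above.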
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits
+{
+public:
+ typedef _LhsScalar LhsScalar;
+ typedef _RhsScalar RhsScalar;
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+ // register block size along the N direction must be 1 or 4
+ nr = 4,
+
+ // register block size along the M direction (currently, this one cannot be modified)
+ default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+ // we assume 16 registers
+ mr = Vectorizable ? 3*LhsPacketSize : default_mr,
+#else
+ mr = default_mr,
+#endif
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ pbroadcast4(b, b0, b1, b2, b3);
+ }
+
+// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+// {
+// pbroadcast2(b, b0, b1);
+// }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ dest = pset1<RhsPacketType>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = ploadquad<RhsPacket>(b);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = pload<LhsPacketType>(a);
+ }
+
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = ploadu<LhsPacketType>(a);
+ }
+
+ template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+ {
+ // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
+ // let gcc allocate the register in which to store the result of the pmul
+ // (in the case where there is no FMA) gcc fails to figure out how to avoid
+    // spilling registers.
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c = pmadd(a,b,c);
+#else
+ tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = pmadd(c,alpha,r);
+ }
+
+ template<typename ResPacketHalf>
+ EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
+ {
+ r = pmadd(c,alpha,r);
+ }
+
+protected:
+// conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+// conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
+};
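A compressed sketch of how the gebp kernel further down drives this traits API for a single register-level update; blockA/blockB point into packed panels, r0 maps one result column, and all parameters of the sketch are hypothetical:

  template<typename DataMapper>
  void gebp_traits_usage_sketch(const float* blockA, const float* blockB,
                                typename DataMapper::LinearMapper r0, float alpha)
  {
    typedef gebp_traits<float,float> Traits;
    Traits traits;

    Traits::AccPacket C0;       traits.initAcc(C0);           // C0 = 0
    Traits::LhsPacket A0;       traits.loadLhs(blockA, A0);   // packed lhs chunk
    Traits::RhsPacket B_0, T0;  traits.loadRhs(blockB, B_0);  // broadcast rhs coefficient
    traits.madd(A0, B_0, C0, T0);                             // C0 += A0 * B_0

    Traits::ResPacket alphav = pset1<Traits::ResPacket>(alpha);
    Traits::ResPacket R = r0.loadPacket(0);                   // existing result values
    traits.acc(C0, alphav, R);                                // R += alpha * C0
    r0.storePacket(0, R);
  }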
+
+template<typename RealScalar, bool _ConjLhs>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+{
+public:
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef RealScalar RhsScalar;
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = false,
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+ nr = 4,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+ // we assume 16 registers
+ mr = 3*LhsPacketSize,
+#else
+ mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#endif
+
+ LhsProgress = LhsPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = pset1<RhsPacket>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = pset1<RhsPacket>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = pload<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploadu<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ pbroadcast4(b, b0, b1, b2, b3);
+ }
+
+// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+// {
+// pbroadcast2(b, b0, b1);
+// }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+ {
+ madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+ {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c.v = pmadd(a.v,b,c.v);
+#else
+ tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+ {
+ c += a * b;
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = cj.pmadd(c,alpha,r);
+ }
+
+protected:
+ conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
+};
+
+template<typename Packet>
+struct DoublePacket
+{
+ Packet first;
+ Packet second;
+};
+
+template<typename Packet>
+DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+{
+ DoublePacket<Packet> res;
+ res.first = padd(a.first, b.first);
+ res.second = padd(a.second,b.second);
+ return res;
+}
+
+template<typename Packet>
+const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
+{
+ return a;
+}
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
+// template<typename Packet>
+// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+// {
+// DoublePacket<Packet> res;
+// res.first = padd(a.first, b.first);
+// res.second = padd(a.second,b.second);
+// return res;
+// }
+
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+{
+public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef std::complex<RealScalar> LhsScalar;
+ typedef std::complex<RealScalar> RhsScalar;
+ typedef std::complex<RealScalar> ResScalar;
+
+ enum {
+ ConjLhs = _ConjLhs,
+ ConjRhs = _ConjRhs,
+ Vectorizable = packet_traits<RealScalar>::Vectorizable
+ && packet_traits<Scalar>::Vectorizable,
+ RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+
+ // FIXME: should depend on NumberOfRegisters
+ nr = 4,
+ mr = ResPacketSize,
+
+ LhsProgress = ResPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<RealScalar>::type RealPacket;
+ typedef typename packet_traits<Scalar>::type ScalarPacket;
+ typedef DoublePacket<RealPacket> DoublePacketType;
+
+ typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
+
+ EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
+ {
+ p.first = pset1<RealPacket>(RealScalar(0));
+ p.second = pset1<RealPacket>(RealScalar(0));
+ }
+
+ // Scalar path
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
+ {
+ dest = pset1<ResPacket>(*b);
+ }
+
+ // Vectorized path
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
+ {
+ dest.first = pset1<RealPacket>(real(*b));
+ dest.second = pset1<RealPacket>(imag(*b));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+ {
+ eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ // FIXME not sure that's the best way to implement it!
+ loadRhs(b+0, b0);
+ loadRhs(b+1, b1);
+ loadRhs(b+2, b2);
+ loadRhs(b+3, b3);
+ }
+
+ // Vectorized path
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+ {
+ // FIXME not sure that's the best way to implement it!
+ loadRhs(b+0, b0);
+ loadRhs(b+1, b1);
+ }
+
+ // Scalar path
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+ {
+ // FIXME not sure that's the best way to implement it!
+ loadRhs(b+0, b0);
+ loadRhs(b+1, b1);
+ }
+
+ // nothing special here
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+ }
+
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
+ {
+ c.first = padd(pmul(a,b.first), c.first);
+ c.second = padd(pmul(a,b.second),c.second);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+ {
+ c = cj.pmadd(a,b,c);
+ }
+
+ EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
+
+ EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ // assemble c
+ ResPacket tmp;
+ if((!ConjLhs)&&(!ConjRhs))
+ {
+ tmp = pcplxflip(pconj(ResPacket(c.second)));
+ tmp = padd(ResPacket(c.first),tmp);
+ }
+ else if((!ConjLhs)&&(ConjRhs))
+ {
+ tmp = pconj(pcplxflip(ResPacket(c.second)));
+ tmp = padd(ResPacket(c.first),tmp);
+ }
+ else if((ConjLhs)&&(!ConjRhs))
+ {
+ tmp = pcplxflip(ResPacket(c.second));
+ tmp = padd(pconj(ResPacket(c.first)),tmp);
+ }
+ else if((ConjLhs)&&(ConjRhs))
+ {
+ tmp = pcplxflip(ResPacket(c.second));
+ tmp = psub(pconj(ResPacket(c.first)),tmp);
+ }
+
+ r = pmadd(tmp,alpha,r);
+ }
+
+protected:
+ conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+};
+
+template<typename RealScalar, bool _ConjRhs>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+{
+public:
+ typedef std::complex<RealScalar> Scalar;
+ typedef RealScalar LhsScalar;
+ typedef Scalar RhsScalar;
+ typedef Scalar ResScalar;
+
+ enum {
+ ConjLhs = false,
+ ConjRhs = _ConjRhs,
+ Vectorizable = packet_traits<RealScalar>::Vectorizable
+ && packet_traits<Scalar>::Vectorizable,
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+ NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+ // FIXME: should depend on NumberOfRegisters
+ nr = 4,
+ mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
+
+ LhsProgress = ResPacketSize,
+ RhsProgress = 1
+ };
+
+ typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+ typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+ typedef ResPacket AccPacket;
+
+ EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+ {
+ p = pset1<ResPacket>(ResScalar(0));
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = pset1<RhsPacket>(*b);
+ }
+
+ void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ {
+ pbroadcast4(b, b0, b1, b2, b3);
+ }
+
+// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+// {
+// // FIXME not sure that's the best way to implement it!
+// b0 = pload1<RhsPacket>(b+0);
+// b1 = pload1<RhsPacket>(b+1);
+// }
+
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploaddup<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ {
+ dest = ploaddup<LhsPacket>(a);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+ {
+ madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+ {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ EIGEN_UNUSED_VARIABLE(tmp);
+ c.v = pmadd(a,b.v,c.v);
+#else
+ tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+#endif
+
+ }
+
+ EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+ {
+ c += a * b;
+ }
+
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ r = cj.pmadd(alpha,c,r);
+ }
+
+protected:
+ conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+};
+
+// helper for the rotating kernel below
+template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
+struct PossiblyRotatingKernelHelper
+{
+ // default implementation, not rotating
+
+ typedef typename GebpKernel::Traits Traits;
+ typedef typename Traits::RhsScalar RhsScalar;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::AccPacket AccPacket;
+
+ const Traits& traits;
+ EIGEN_ALWAYS_INLINE PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+
+ template <size_t K, size_t Index> EIGEN_ALWAYS_INLINE
+ void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+ {
+ traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
+ }
+
+ EIGEN_ALWAYS_INLINE void unrotateResult(AccPacket&,
+ AccPacket&,
+ AccPacket&,
+ AccPacket&)
+ {
+ }
+};
+
+// rotating implementation
+template <typename GebpKernel>
+struct PossiblyRotatingKernelHelper<GebpKernel, true>
+{
+ typedef typename GebpKernel::Traits Traits;
+ typedef typename Traits::RhsScalar RhsScalar;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::AccPacket AccPacket;
+
+ const Traits& traits;
+ EIGEN_ALWAYS_INLINE PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+ template <size_t K, size_t Index> EIGEN_ALWAYS_INLINE
+ void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+ {
+ if (Index == 0) {
+ to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
+ } else {
+ EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
+ to = protate<1>(to);
+ }
+ }
+
+ EIGEN_ALWAYS_INLINE void unrotateResult(AccPacket& res0,
+ AccPacket& res1,
+ AccPacket& res2,
+ AccPacket& res3)
+ {
+ PacketBlock<AccPacket> resblock;
+ resblock.packet[0] = res0;
+ resblock.packet[1] = res1;
+ resblock.packet[2] = res2;
+ resblock.packet[3] = res3;
+ ptranspose(resblock);
+ resblock.packet[3] = protate<1>(resblock.packet[3]);
+ resblock.packet[2] = protate<2>(resblock.packet[2]);
+ resblock.packet[1] = protate<3>(resblock.packet[1]);
+ ptranspose(resblock);
+ res0 = resblock.packet[0];
+ res1 = resblock.packet[1];
+ res2 = resblock.packet[2];
+ res3 = resblock.packet[3];
+ }
+};
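A short summary of what the rotating variant above buys (a reading of the code, not a statement from the Eigen documentation): for Index 0 the helper loads the full 4-wide rhs packet once, and for Index 1..3 it rotates that register by one lane instead of issuing three more broadcast loads, saving loads and register pressure on ARM. The price is that accumulator j then holds lane-wise products against a rotated rhs (lane i sees b_((i+j) mod 4), up to rotation direction), and unrotateResult undoes that permutation with its transpose / per-row protate / transpose sequence before the results are blended into the output.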
+
+/* optimized GEneral packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ * | A | B | comments
+ *   |real |cplx | no vectorization yet, would require packing A with duplication
+ * |cplx |real | easy vectorization
+ */
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel
+{
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
+ typedef typename Traits::AccPacket AccPacket;
+
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+ typedef typename SwappedTraits::ResScalar SResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+
+ typedef typename DataMapper::LinearMapper LinearMapper;
+
+ enum {
+ Vectorizable = Traits::Vectorizable,
+ LhsProgress = Traits::LhsProgress,
+ RhsProgress = Traits::RhsProgress,
+ ResPacketSize = Traits::ResPacketSize
+ };
+
+ EIGEN_DONT_INLINE
+ void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, ResScalar alpha,
+ Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+
+ static const bool UseRotatingKernel =
+ EIGEN_ARCH_ARM &&
+ internal::is_same<LhsScalar, float>::value &&
+ internal::is_same<RhsScalar, float>::value &&
+ internal::is_same<ResScalar, float>::value &&
+ Traits::LhsPacketSize == 4 &&
+ Traits::RhsPacketSize == 4 &&
+ Traits::ResPacketSize == 4;
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE
+void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
+ ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+ Index rows, Index depth, Index cols, ResScalar alpha,
+ Index strideA, Index strideB, Index offsetA, Index offsetB)
+ {
+ Traits traits;
+ SwappedTraits straits;
+
+ if(strideA==-1) strideA = depth;
+ if(strideB==-1) strideB = depth;
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+ const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
+ const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
+ const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+ enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+ const Index peeled_kc = depth & ~(pk-1);
+ const Index prefetch_res_offset = 0;
+// const Index depth2 = depth & ~1;
+
+ //---------- Process 3 * LhsProgress rows at once ----------
+ // This corresponds to 3*LhsProgress x nr register blocks.
+  // Usually makes sense only with FMA
+ if(mr>=3*Traits::LhsProgress)
+ {
+ PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
+
+ // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
+ for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
+ {
+ // loops on each largest micro vertical panel of rhs (depth * nr)
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // We select a 3*Traits::LhsProgress x nr micro block of res which is entirely
+ // stored into 3 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+ LhsPacket A0, A1;
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3,
+ C4, C5, C6, C7,
+ C8, C9, C10, C11;
+ traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+ traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+ traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(0);
+ r1.prefetch(0);
+ r2.prefetch(0);
+ r3.prefetch(0);
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
+ RhsPacket B_0, T0;
+ LhsPacket A2;
+
+#define EIGEN_GEBP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ internal::prefetch(blA+(3*K+16)*LhsProgress); \
+ if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
+ traits.madd(A0, B_0, C0, T0); \
+ traits.madd(A1, B_0, C4, T0); \
+ traits.madd(A2, B_0, C8, B_0); \
+ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
+ traits.madd(A0, B_0, C1, T0); \
+ traits.madd(A1, B_0, C5, T0); \
+ traits.madd(A2, B_0, C9, B_0); \
+ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
+ traits.madd(A0, B_0, C2, T0); \
+ traits.madd(A1, B_0, C6, T0); \
+ traits.madd(A2, B_0, C10, B_0); \
+ possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
+ traits.madd(A0, B_0, C3 , T0); \
+ traits.madd(A1, B_0, C7, T0); \
+ traits.madd(A2, B_0, C11, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+ } while(false)
+
+ internal::prefetch(blB);
+ EIGEN_GEBP_ONESTEP(0);
+ EIGEN_GEBP_ONESTEP(1);
+ EIGEN_GEBP_ONESTEP(2);
+ EIGEN_GEBP_ONESTEP(3);
+ EIGEN_GEBP_ONESTEP(4);
+ EIGEN_GEBP_ONESTEP(5);
+ EIGEN_GEBP_ONESTEP(6);
+ EIGEN_GEBP_ONESTEP(7);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*3*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
+ }
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, T0;
+ LhsPacket A2;
+ EIGEN_GEBP_ONESTEP(0);
+ blB += 4*RhsProgress;
+ blA += 3*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBP_ONESTEP
+
+ possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
+ possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
+ possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
+
+ ResPacket R0, R1, R2;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C8, alphav, R2);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r1.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r1.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C1, alphav, R0);
+ traits.acc(C5, alphav, R1);
+ traits.acc(C9, alphav, R2);
+ r1.storePacket(0 * Traits::ResPacketSize, R0);
+ r1.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C6, alphav, R1);
+ traits.acc(C10, alphav, R2);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+ R0 = r3.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r3.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C3, alphav, R0);
+ traits.acc(C7, alphav, R1);
+ traits.acc(C11, alphav, R2);
+ r3.storePacket(0 * Traits::ResPacketSize, R0);
+ r3.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(2 * Traits::ResPacketSize, R2);
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ prefetch(&blB[0]);
+ // gets res block as register
+ AccPacket C0, C4, C8;
+ traits.initAcc(C0);
+ traits.initAcc(C4);
+ traits.initAcc(C8);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ r0.prefetch(0);
+ LhsPacket A0, A1, A2;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
+ RhsPacket B_0;
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0); \
+ traits.madd(A1, B_0, C4, B_0); \
+ traits.madd(A2, B_0, C8, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*RhsProgress;
+ blA += pk*3*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += 3*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0, R1, R2;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C8, alphav, R2);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r0.storePacket(2 * Traits::ResPacketSize, R2);
+ }
+ }
+ }
+
+ //---------- Process 2 * LhsProgress rows at once ----------
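+  // Rows in [peeled_mc3, peeled_mc2) are handled here: first against full 4-column panels
+  // of the packed rhs, then (further below) against any leftover rhs columns, one at a time.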
+ if(mr>=2*Traits::LhsProgress)
+ {
+ // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth)
+ for(Index i=peeled_mc3; i<peeled_mc2; i+=2*LhsProgress)
+ {
+ // loops on each largest micro vertical panel of rhs (depth * nr)
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // We select a 2*Traits::LhsProgress x nr micro block of res which is entirely
+ // stored into 2 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3,
+ C4, C5, C6, C7;
+ traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+ traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
+
+ LhsPacket A0, A1;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
+ RhsPacket B_0, B1, B2, B3, T0;
+
+          // The two asm comments in the #define are intended to prevent gcc
+          // from optimizing the code across steps, since it ends up spilling
+          // registers in this case.
+ #define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+ traits.madd(A0, B_0, C0, T0); \
+ traits.madd(A1, B_0, C4, B_0); \
+ traits.madd(A0, B1, C1, T0); \
+ traits.madd(A1, B1, C5, B1); \
+ traits.madd(A0, B2, C2, T0); \
+ traits.madd(A1, B2, C6, B2); \
+ traits.madd(A0, B3, C3, T0); \
+ traits.madd(A1, B3, C7, B3); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+ } while(false)
+
+ prefetch(&blB[pk*4*RhsProgress]);
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*(2*Traits::LhsProgress);
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
+ }
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, B1, B2, B3, T0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += 4*RhsProgress;
+ blA += 2*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+
+ ResPacket R0, R1, R2, R3;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r1.loadPacket(0 * Traits::ResPacketSize);
+ R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ traits.acc(C1, alphav, R2);
+ traits.acc(C5, alphav, R3);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ r1.storePacket(0 * Traits::ResPacketSize, R2);
+ r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+ R2 = r3.loadPacket(0 * Traits::ResPacketSize);
+ R3 = r3.loadPacket(1 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C6, alphav, R1);
+ traits.acc(C3, alphav, R2);
+ traits.acc(C7, alphav, R3);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r2.storePacket(1 * Traits::ResPacketSize, R1);
+ r3.storePacket(0 * Traits::ResPacketSize, R2);
+ r3.storePacket(1 * Traits::ResPacketSize, R3);
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ prefetch(&blB[0]);
+
+ // gets res block as register
+ AccPacket C0, C4;
+ traits.initAcc(C0);
+ traits.initAcc(C4);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ r0.prefetch(prefetch_res_offset);
+ LhsPacket A0, A1;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+ RhsPacket B_0, B1;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B1); \
+ traits.madd(A1, B_0, C4, B_0); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*RhsProgress;
+ blA += pk*2*Traits::LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, B1;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += 2*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0, R1;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C4, alphav, R1);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r0.storePacket(1 * Traits::ResPacketSize, R1);
+ }
+ }
+ }
+ //---------- Process 1 * LhsProgress rows at once ----------
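+  // Rows in [peeled_mc2, peeled_mc1) are handled with a single lhs packet per step,
+  // accumulating a 1 x 4 block of packets (C0..C3) against 4-column rhs panels, then
+  // against any leftover rhs columns, one at a time.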
+ if(mr>=1*Traits::LhsProgress)
+ {
+ // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
+ for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
+ {
+ // loops on each largest micro vertical panel of rhs (depth * nr)
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
+ // stored into 1 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ traits.initAcc(C2);
+ traits.initAcc(C3);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
+ LhsPacket A0;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
+ RhsPacket B_0, B1, B2, B3;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+ traits.madd(A0, B_0, C0, B_0); \
+ traits.madd(A0, B1, C1, B1); \
+ traits.madd(A0, B2, C2, B2); \
+ traits.madd(A0, B3, C3, B3); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*1*LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
+ }
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0, B1, B2, B3;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += 4*RhsProgress;
+ blA += 1*LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+
+ ResPacket R0, R1;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r1.loadPacket(0 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C1, alphav, R1);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ r1.storePacket(0 * Traits::ResPacketSize, R1);
+
+ R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+ R1 = r3.loadPacket(0 * Traits::ResPacketSize);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C3, alphav, R1);
+ r2.storePacket(0 * Traits::ResPacketSize, R0);
+ r3.storePacket(0 * Traits::ResPacketSize, R1);
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ prefetch(&blB[0]);
+
+ // gets res block as register
+ AccPacket C0;
+ traits.initAcc(C0);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+ LhsPacket A0;
+
+ // performs "inner" products
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
+ RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+      EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0); \
+      EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
+ } while(false)
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*RhsProgress;
+ blA += pk*1*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += 1*Traits::LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ r0.storePacket(0 * Traits::ResPacketSize, R0);
+ }
+ }
+ }
+ //---------- Process remaining rows, 1 by 1 ----------
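+  // Leftover rows (fewer than one LhsProgress) are handled one at a time. When the swapped
+  // traits expose a packet size that is a multiple of 4, the 1 x 4 result is computed by
+  // vectorizing along the packed rhs (the roles of lhs and rhs are swapped); otherwise a
+  // plain scalar loop based on CJMADD is used.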
+ for(Index i=peeled_mc1; i<rows; i+=1)
+ {
+ // loop on each panel of the rhs
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+
+ if( (SwappedTraits::LhsProgress % 4)==0 )
+ {
+        // NOTE: The following piece of code won't work for 512-bit registers
+ SAccPacket C0, C1, C2, C3;
+ straits.initAcc(C0);
+ straits.initAcc(C1);
+ straits.initAcc(C2);
+ straits.initAcc(C3);
+
+ const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
+ const Index endk = (depth/spk)*spk;
+ const Index endk4 = (depth/(spk*4))*(spk*4);
+
+ Index k=0;
+ for(; k<endk4; k+=4*spk)
+ {
+ prefetch(&blB[4*SwappedTraits::LhsProgress]);
+
+ SLhsPacket A0,A1,A2,A3;
+ SRhsPacket B_0,B_1,B_2,B_3;
+
+ straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
+ straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
+ straits.loadRhsQuad(blA+0*spk, B_0);
+ straits.loadRhsQuad(blA+1*spk, B_1);
+ straits.madd(A0,B_0,C0,B_0);
+ straits.madd(A1,B_1,C1,B_1);
+
+ straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A2);
+ straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A3);
+ straits.loadRhsQuad(blA+2*spk, B_2);
+ straits.loadRhsQuad(blA+3*spk, B_3);
+ straits.madd(A2,B_2,C2,B_2);
+ straits.madd(A3,B_3,C3,B_3);
+
+ blB += 4*SwappedTraits::LhsProgress;
+ blA += 4*spk;
+ }
+ C0 = padd(padd(C0,C1),padd(C2,C3));
+ for(; k<endk; k+=spk)
+ {
+ SLhsPacket A0;
+ SRhsPacket B_0;
+
+ straits.loadLhsUnaligned(blB, A0);
+ straits.loadRhsQuad(blA, B_0);
+ straits.madd(A0,B_0,C0,B_0);
+
+ blB += SwappedTraits::LhsProgress;
+ blA += spk;
+ }
+ if(SwappedTraits::LhsProgress==8)
+ {
+ // Special case where we have to first reduce the accumulation register C0
+ typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
+
+ SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
+ SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
+
+ if(depth-endk>0)
+ {
+ // We have to handle the last row of the rhs which corresponds to a half-packet
+ SLhsPacketHalf a0;
+ SRhsPacketHalf b0;
+ straits.loadLhsUnaligned(blB, a0);
+ straits.loadRhs(blA, b0);
+ SAccPacketHalf c0 = predux4(C0);
+ straits.madd(a0,b0,c0,b0);
+ straits.acc(c0, alphav, R);
+ }
+ else
+ {
+ straits.acc(predux4(C0), alphav, R);
+ }
+ res.scatterPacket(i, j2, R);
+ }
+ else
+ {
+ SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
+ SResPacket alphav = pset1<SResPacket>(alpha);
+ straits.acc(C0, alphav, R);
+ res.scatterPacket(i, j2, R);
+ }
+ }
+ else // scalar path
+ {
+ // get a 1 x 4 res block as registers
+ ResScalar C0(0), C1(0), C2(0), C3(0);
+
+ for(Index k=0; k<depth; k++)
+ {
+ LhsScalar A0 = blA[k];
+ RhsScalar B_0 = blB[0];
+ RhsScalar B_1 = blB[1];
+ CJMADD(cj,A0,B_0,C0, B_0);
+ CJMADD(cj,A0,B_1,C1, B_1);
+ RhsScalar B_2 = blB[2];
+ RhsScalar B_3 = blB[3];
+ CJMADD(cj,A0,B_2,C2, B_2);
+ CJMADD(cj,A0,B_3,C3, B_3);
+
+ blB += 4;
+ }
+ res(i, j2 + 0) += alpha * C0;
+ res(i, j2 + 1) += alpha * C1;
+ res(i, j2 + 2) += alpha * C2;
+ res(i, j2 + 3) += alpha * C3;
+ }
+ }
+
+ // remaining columns
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ // prefetch(blA);
+ // gets a 1 x 1 res block as registers
+ ResScalar C0(0);
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ for(Index k=0; k<depth; k++)
+ {
+ LhsScalar A0 = blA[k];
+ RhsScalar B_0 = blB[k];
+ CJMADD(cj, A0, B_0, C0, B_0);
+ }
+ res(i, j2) += alpha * C0;
+ }
+ }
+ }
+
+
+#undef CJMADD
+
+// pack a block of the lhs
+// The traversal is as follows (mr==4):
+// 0 4 8 12 ...
+// 1 5 9 13 ...
+// 2 6 10 14 ...
+// 3 7 11 15 ...
+//
+// 16 20 24 28 ...
+// 17 21 25 29 ...
+// 18 22 26 30 ...
+// 19 23 27 31 ...
+//
+// 32 33 34 35 ...
+// 36 37 38 39 ...
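+//
+// Ignoring vectorization, conjugation and the PanelMode offsets, packing one group of
+// rows of width P (P = 3*PacketSize, 2*PacketSize, PacketSize, or Pack2, depending on how
+// many rows remain) boils down to the following scalar sketch:
+//
+//   for(Index k=0; k<depth; k++)
+//     for(Index w=0; w<P; w++)
+//       *blockA++ = lhs(i+w, k);   // P consecutive rows become contiguous for each k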
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+{
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ enum { PacketSize = packet_traits<Scalar>::size };
+
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+
+ const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+ const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
+ : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
+
+ Index i=0;
+
+ // Pack 3 packets
+ if(Pack1>=3*PacketSize)
+ {
+ if(PanelMode)
+ {
+ for(; i<peeled_mc3; i+=3*PacketSize)
+ {
+ blockA += (3*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A, B, C;
+ A = lhs.loadPacket(i+0*PacketSize, k);
+ B = lhs.loadPacket(i+1*PacketSize, k);
+ C = lhs.loadPacket(i+2*PacketSize, k);
+ pstore(blockA+0*PacketSize, cj.pconj(A));
+ pstore(blockA+1*PacketSize, cj.pconj(B));
+ pstore(blockA+2*PacketSize, cj.pconj(C));
+ blockA += 3*PacketSize;
+ }
+ blockA += (3*PacketSize) * (stride-offset-depth);
+ }
+ }
+ else
+ {
+ // Read the data from DRAM as sequentially as possible. We're writing to
+ // SRAM so the order of the writes shouldn't impact performance.
+ for(Index k=0; k<depth; k++)
+ {
+ Scalar* localBlockA = blockA + 3*PacketSize*k;
+ for(Index local_i = i; local_i<peeled_mc3; local_i+=3*PacketSize)
+ {
+ Packet A, B, C;
+ A = lhs.loadPacket(local_i+0*PacketSize, k);
+ B = lhs.loadPacket(local_i+1*PacketSize, k);
+ C = lhs.loadPacket(local_i+2*PacketSize, k);
+ pstore(localBlockA+0*PacketSize, cj.pconj(A));
+ pstore(localBlockA+1*PacketSize, cj.pconj(B));
+ pstore(localBlockA+2*PacketSize, cj.pconj(C));
+ localBlockA += 3*PacketSize*depth;
+ }
+ }
+ blockA += depth*peeled_mc3;
+ i = peeled_mc3;
+ }
+ }
+ // Pack 2 packets
+ if(Pack1>=2*PacketSize)
+ {
+ if(PanelMode)
+ {
+ for(; i<peeled_mc2; i+=2*PacketSize)
+ {
+ blockA += (2*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A, B;
+ A = lhs.loadPacket(i+0*PacketSize, k);
+ B = lhs.loadPacket(i+1*PacketSize, k);
+ pstore(blockA+0*PacketSize, cj.pconj(A));
+ pstore(blockA+1*PacketSize, cj.pconj(B));
+ blockA += 2*PacketSize;
+ }
+ blockA += (2*PacketSize) * (stride-offset-depth);
+ }
+ }
+ else
+ {
+ // Read the data from RAM as sequentially as possible.
+ for(Index k=0; k<depth; k++)
+ {
+ Scalar* localBlockA = blockA + 2*PacketSize*k;
+ for(Index local_i = i; local_i<peeled_mc2; local_i+=2*PacketSize)
+ {
+ Packet A, B;
+ A = lhs.loadPacket(local_i+0*PacketSize, k);
+ B = lhs.loadPacket(local_i+1*PacketSize, k);
+ pstore(localBlockA+0*PacketSize, cj.pconj(A));
+ pstore(localBlockA+1*PacketSize, cj.pconj(B));
+ localBlockA += 2*PacketSize*depth;
+ }
+ }
+ blockA += depth*(peeled_mc2-i);
+ i = peeled_mc2;
+ }
+ }
+  // Pack 1 packet
+ if(Pack1>=1*PacketSize)
+ {
+ if(PanelMode)
+ {
+ for(; i<peeled_mc1; i+=1*PacketSize)
+ {
+ blockA += (1*PacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A;
+ A = lhs.loadPacket(i+0*PacketSize, k);
+ pstore(blockA, cj.pconj(A));
+ blockA+=PacketSize;
+ }
+ blockA += (1*PacketSize) * (stride-offset-depth);
+ }
+ }
+ else
+ {
+ // Read the data from RAM as sequentially as possible.
+ for(Index k=0; k<depth; k++)
+ {
+ Scalar* localBlockA = blockA + PacketSize*k;
+ for(Index local_i = i; local_i<peeled_mc1; local_i+=1*PacketSize)
+ {
+ Packet A;
+ A = lhs.loadPacket(local_i+0*PacketSize, k);
+ pstore(localBlockA, cj.pconj(A));
+ localBlockA += PacketSize*depth;
+ }
+ }
+ blockA += depth*(peeled_mc1-i);
+ i = peeled_mc1;
+ }
+ }
+ // Pack scalars
+ if(Pack2<PacketSize && Pack2>1)
+ {
+ for(; i<peeled_mc0; i+=Pack2)
+ {
+ if (PanelMode) {
+ blockA += Pack2 * offset;
+ }
+
+ for(Index k=0; k<depth; k++) {
+ const LinearMapper dm0 = lhs.getLinearMapper(i, k);
+ for(Index w=0; w<Pack2; w++) {
+ *blockA = cj(dm0(w));
+ blockA += 1;
+ }
+ }
+
+ if(PanelMode) blockA += Pack2 * (stride-offset-depth);
+ }
+ }
+ for(; i<rows; i++)
+ {
+ if(PanelMode) blockA += offset;
+ for(Index k=0; k<depth; k++) {
+ *blockA = cj(lhs(i, k));
+ blockA += 1;
+ }
+ if(PanelMode) blockA += (stride-offset-depth);
+ }
+}
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+{
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ enum { PacketSize = packet_traits<Scalar>::size };
+
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+
+// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
+ int pack = Pack1;
+ Index i = 0;
+ while(pack>0)
+ {
+ Index remaining_rows = rows-i;
+ Index peeled_mc = i+(remaining_rows/pack)*pack;
+ for(; i<peeled_mc; i+=pack)
+ {
+ if(PanelMode) blockA += pack * offset;
+
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
+ Index k=0;
+ if(pack>=PacketSize)
+ {
+ for(; k<peeled_k; k+=PacketSize)
+ {
+ for (Index m = 0; m < pack; m += PacketSize)
+ {
+ PacketBlock<Packet> kernel;
+ for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
+ ptranspose(kernel);
+ for (int p = 0; p < PacketSize; ++p) pstore(blockA+m+(pack)*p, cj.pconj(kernel.packet[p]));
+ }
+ blockA += PacketSize*pack;
+ }
+ }
+ for(; k<depth; k++)
+ {
+ Index w=0;
+ for(; w<pack-3; w+=4)
+ {
+ Scalar a(cj(lhs(i+w+0, k))),
+ b(cj(lhs(i+w+1, k))),
+ c(cj(lhs(i+w+2, k))),
+ d(cj(lhs(i+w+3, k)));
+ blockA[0] = a;
+ blockA[1] = b;
+ blockA[2] = c;
+ blockA[3] = d;
+ blockA += 4;
+ }
+ if(pack%4)
+ for(;w<pack;++w) {
+ *blockA = cj(lhs(i+w, k));
+ blockA += 1;
+ }
+ }
+
+ if(PanelMode) blockA += pack * (stride-offset-depth);
+ }
+
+ pack -= PacketSize;
+ if(pack<Pack2 && (pack+PacketSize)!=Pack2)
+ pack = Pack2;
+ }
+
+ for(; i<rows; i++)
+ {
+ if(PanelMode) blockA += offset;
+ for(Index k=0; k<depth; k++) {
+ *blockA = cj(lhs(i, k));
+ blockA += 1;
+ }
+ if(PanelMode) blockA += (stride-offset-depth);
+ }
+}
+
+// copy a complete panel of the rhs
+// this version is optimized for column major matrices
+// The traversal order is as follows (nr==4):
+// 0 1 2 3 12 13 14 15 24 27
+// 4 5 6 7 16 17 18 19 25 28
+// 8 9 10 11 20 21 22 23 26 29
+// . . . . . . . . . .
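+//
+// Ignoring vectorization, conjugation and the PanelMode offsets, packing one group of
+// nr==4 columns boils down to the following scalar sketch (the last columns of the
+// diagram above are the remainder, handled one column at a time):
+//
+//   for(Index k=0; k<depth; k++)
+//     for(Index j=0; j<4; j++)
+//       *blockB++ = rhs(k, j2+j);   // the 4 column values are interleaved along k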
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ enum { PacketSize = packet_traits<Scalar>::size };
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+ const Index peeled_k = (depth/PacketSize)*PacketSize;
+// if(nr>=8)
+// {
+// for(Index j2=0; j2<packet_cols8; j2+=8)
+// {
+// // skip what we have before
+// if(PanelMode) count += 8 * offset;
+// const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+// const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+// const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+// const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+// const Scalar* b4 = &rhs[(j2+4)*rhsStride];
+// const Scalar* b5 = &rhs[(j2+5)*rhsStride];
+// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
+// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
+// Index k=0;
+//     if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+// {
+// for(; k<peeled_k; k+=PacketSize) {
+// PacketBlock<Packet> kernel;
+// for (int p = 0; p < PacketSize; ++p) {
+// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+// }
+// ptranspose(kernel);
+// for (int p = 0; p < PacketSize; ++p) {
+// pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+// count+=PacketSize;
+// }
+// }
+// }
+// for(; k<depth; k++)
+// {
+// blockB[count+0] = cj(b0[k]);
+// blockB[count+1] = cj(b1[k]);
+// blockB[count+2] = cj(b2[k]);
+// blockB[count+3] = cj(b3[k]);
+// blockB[count+4] = cj(b4[k]);
+// blockB[count+5] = cj(b5[k]);
+// blockB[count+6] = cj(b6[k]);
+// blockB[count+7] = cj(b7[k]);
+// count += 8;
+// }
+// // skip what we have after
+// if(PanelMode) count += 8 * (stride-offset-depth);
+// }
+// }
+
+ if(nr>=4)
+ {
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+ {
+ // skip what we have before
+ if(PanelMode) blockB += 4 * offset;
+
+ // TODO: each of these makes a copy of the stride :(
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+ const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+ const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+ const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+ Index k=0;
+ if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
+ {
+ for(; k<peeled_k; k+=PacketSize) {
+ PacketBlock<Packet, 4> kernel;
+ kernel.packet[0] = dm0.loadPacket(k);
+ kernel.packet[1] = dm1.loadPacket(k);
+ kernel.packet[2] = dm2.loadPacket(k);
+ kernel.packet[3] = dm3.loadPacket(k);
+ ptranspose(kernel);
+ pstoreu(blockB+0*PacketSize, cj.pconj(kernel.packet[0]));
+ pstoreu(blockB+1*PacketSize, cj.pconj(kernel.packet[1]));
+ pstoreu(blockB+2*PacketSize, cj.pconj(kernel.packet[2]));
+ pstoreu(blockB+3*PacketSize, cj.pconj(kernel.packet[3]));
+ blockB+=4*PacketSize;
+ }
+ }
+ for(; k<depth; k++)
+ {
+ blockB[0] = cj(dm0(k));
+ blockB[1] = cj(dm1(k));
+ blockB[2] = cj(dm2(k));
+ blockB[3] = cj(dm3(k));
+ blockB += 4;
+ }
+ // skip what we have after
+ if(PanelMode) blockB += 4 * (stride-offset-depth);
+ }
+ }
+
+ // copy the remaining columns one at a time (nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
+ if(PanelMode) blockB += offset;
+ for(Index k=0; k<depth; k++)
+ {
+ *blockB = cj(dm0(k));
+ blockB += 1;
+ }
+ if(PanelMode) blockB += (stride-offset-depth);
+ }
+}
+
+// this version is optimized for row major matrices
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+ typedef typename DataMapper::LinearMapper LinearMapper;
+ enum {
+ PacketSize = packet_traits<Scalar>::size,
+ HalfPacketSize = packet_traits<Scalar>::HasHalfPacket ? unpacket_traits<typename packet_traits<Scalar>::half>::size : 0
+ };
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
+{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+ EIGEN_UNUSED_VARIABLE(stride);
+ EIGEN_UNUSED_VARIABLE(offset);
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+
+// if(nr>=8)
+// {
+// for(Index j2=0; j2<packet_cols8; j2+=8)
+// {
+// // skip what we have before
+// if(PanelMode) count += 8 * offset;
+// for(Index k=0; k<depth; k++)
+// {
+// if (PacketSize==8) {
+// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+// pstoreu(blockB+count, cj.pconj(A));
+// } else if (PacketSize==4) {
+// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+// pstoreu(blockB+count, cj.pconj(A));
+// pstoreu(blockB+count+PacketSize, cj.pconj(B));
+// } else {
+// const Scalar* b0 = &rhs[k*rhsStride + j2];
+// blockB[count+0] = cj(b0[0]);
+// blockB[count+1] = cj(b0[1]);
+// blockB[count+2] = cj(b0[2]);
+// blockB[count+3] = cj(b0[3]);
+// blockB[count+4] = cj(b0[4]);
+// blockB[count+5] = cj(b0[5]);
+// blockB[count+6] = cj(b0[6]);
+// blockB[count+7] = cj(b0[7]);
+// }
+// count += 8;
+// }
+// // skip what we have after
+// if(PanelMode) count += 8 * (stride-offset-depth);
+// }
+// }
+ if(nr>=4)
+ {
+ for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
+ {
+ // skip what we have before
+ if(PanelMode) blockB += 4 * offset;
+ for(Index k=0; k<depth; k++)
+ {
+ if (PacketSize==4) {
+ Packet A = rhs.loadPacket(k, j2);
+ pstore(blockB, cj.pconj(A));
+ blockB += PacketSize;
+ }
+ else if (HalfPacketSize==4) {
+ HalfPacket A = rhs.loadHalfPacket(k, j2);
+ pstore<Scalar, HalfPacket>(blockB, cj.pconj(A));
+ blockB += HalfPacketSize;
+ }
+ else {
+ const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+ blockB[0] = cj(dm0(0));
+ blockB[1] = cj(dm0(1));
+ blockB[2] = cj(dm0(2));
+ blockB[3] = cj(dm0(3));
+ blockB += 4;
+ }
+ }
+ // skip what we have after
+ if(PanelMode) blockB += 4 * (stride-offset-depth);
+ }
+ }
+ // copy the remaining columns one at a time (nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ if(PanelMode) blockB += offset;
+ for(Index k=0; k<depth; k++)
+ {
+ *blockB = cj(rhs(k, j2));
+ blockB += 1;
+ }
+ if(PanelMode) blockB += stride-offset-depth;
+ }
+}
+
+} // end namespace internal
+
+/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l1CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l1;
+}
+
+/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l2CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l2;
+}
+
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l3CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l3;
+}
+
+/** Set the cpu L1, L2, and L3 cache sizes (in bytes).
+ * These values are used to adjust the size of the blocks
+ * for the blocked algorithms.
+ *
+ * \sa computeProductBlockingSizes */
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
+{
+ internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
+}
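+
+// Usage sketch (the cache sizes below are only an example for a hypothetical CPU):
+//   std::ptrdiff_t l1 = Eigen::l1CacheSize();                 // query the current settings
+//   std::ptrdiff_t l2 = Eigen::l2CacheSize();
+//   std::ptrdiff_t l3 = Eigen::l3CacheSize();
+//   Eigen::setCpuCacheSizes(32*1024, 256*1024, 8*1024*1024);  // override L1/L2/L3 (in bytes)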
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_BLOCK_PANEL_H
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h
new file mode 100644
index 0000000000..c3715b1a39
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -0,0 +1,465 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
+
+/* Specialization for a row-major destination matrix => simple transposition of the product */
+template<
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
+{
+ typedef gebp_traits<RhsScalar,LhsScalar> Traits;
+
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols, Index depth,
+ const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsStride,
+ ResScalar* res, Index resStride,
+ ResScalar alpha,
+ level3_blocking<RhsScalar,LhsScalar>& blocking,
+ GemmParallelInfo<Index>* info = 0)
+ {
+ // transpose the product such that the result is column major
+ general_matrix_matrix_product<Index,
+ RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+ LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+ ColMajor>
+ ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
+ }
+};
+
+/* Specialization for a col-major destination matrix
+ * => Blocking algorithm following Goto's paper */
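+/* Schematically, the sequential path below performs:
+ *   for each mc x kc block A' of the lhs (i2 and k2 loops):
+ *     pack A' into blockA                       (contiguous, cache-friendly storage)
+ *     for each kc x nc block B' of the rhs (j2 loop):
+ *       pack B' into blockB                     (re-packed only when needed)
+ *       res(i2..,j2..) += alpha * A' * B'       (gebp kernel)
+ */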
+template<
+ typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
+{
+
+typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+static void run(Index rows, Index cols, Index depth,
+ const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride,
+ ResScalar* _res, Index resStride,
+ ResScalar alpha,
+ level3_blocking<LhsScalar,RhsScalar>& blocking,
+ GemmParallelInfo<Index>* info = 0)
+{
+ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
+
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+
+#ifdef EIGEN_HAS_OPENMP
+ if(info)
+ {
+ // this is the parallel version!
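+    // Each thread packs its own horizontal slice A'_i of the lhs into the shared blockA and
+    // works with a private copy of B'; the info[] array carries the per-thread synchronization
+    // state (the 'users' counter and the 'sync' index tested below).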
+ Index tid = omp_get_thread_num();
+ Index threads = omp_get_num_threads();
+
+ LhsScalar* blockA = blocking.blockA();
+ eigen_internal_assert(blockA!=0);
+
+ std::size_t sizeB = kc*nc;
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
+
+ // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
+ for(Index k=0; k<depth; k+=kc)
+ {
+      const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B' and cols of A'
+
+ // In order to reduce the chance that a thread has to wait for the other,
+ // let's start by packing B'.
+ pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
+
+ // Pack A_k to A' in a parallel fashion:
+ // each thread packs the sub block A_k,i to A'_i where i is the thread id.
+
+ // However, before copying to A'_i, we have to make sure that no other thread is still using it,
+ // i.e., we test that info[tid].users equals 0.
+ // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
+ while(info[tid].users!=0) {}
+ info[tid].users += threads;
+
+ pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
+
+ // Notify the other threads that the part A'_i is ready to go.
+ info[tid].sync = k;
+
+ // Computes C_i += A' * B' per A'_i
+ for(Index shift=0; shift<threads; ++shift)
+ {
+ Index i = (tid+shift)%threads;
+
+        // At this point we have to make sure that A'_i has been updated by thread i;
+        // the busy wait on info[i].sync mimics a volatile access to that update.
+ // However, no need to wait for the B' part which has been updated by the current thread!
+ if (shift>0) {
+ while(info[i].sync!=k) {
+ }
+ }
+
+ gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+ }
+
+ // Then keep going as usual with the remaining B'
+ for(Index j=nc; j<cols; j+=nc)
+ {
+ const Index actual_nc = (std::min)(j+nc,cols)-j;
+
+ // pack B_k,j to B'
+ pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
+
+ // C_j += A' * B'
+ gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
+ }
+
+ // Release all the sub blocks A'_i of A' for the current thread,
+ // i.e., we simply decrement the number of users by 1
+ #pragma omp critical
+ {
+ for(Index i=0; i<threads; ++i)
+ #pragma omp atomic
+ --(info[i].users);
+ }
+ }
+ }
+ else
+#endif // EIGEN_HAS_OPENMP
+ {
+ EIGEN_UNUSED_VARIABLE(info);
+
+ // this is the sequential version!
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*nc;
+
+ ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+ const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
+
+ // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+
+ for(Index k2=0; k2<depth; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+ // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+ // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+ // Note that this panel will be read as many times as the number of blocks in the rhs's
+ // horizontal panel which is, in practice, a very low number.
+ pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+ // For each kc x nc block of the rhs's horizontal panel...
+ for(Index j2=0; j2<cols; j2+=nc)
+ {
+ const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+ // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panels of the packed lhs panel (e.g., rows/12 times).
+ if((!pack_rhs_once) || i2==0)
+ pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+
+ // Everything is packed, we can now call the panel * block kernel:
+ gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+ }
+ }
+ }
+ }
+}
+
+};
+
+/*********************************************************************************
+* Specialization of GeneralProduct<> for "large" GEMM, i.e.,
+* implementation of the high level wrapper to general_matrix_matrix_product
+**********************************************************************************/
+
+template<typename Lhs, typename Rhs>
+struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
+ : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
+{};
+
+template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
+struct gemm_functor
+{
+ gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
+ : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
+ {}
+
+ void initParallelSession() const
+ {
+ m_blocking.allocateA();
+ }
+
+ void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
+ {
+ if(cols==-1)
+ cols = m_rhs.cols();
+
+ Gemm::run(rows, cols, m_lhs.cols(),
+ /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+ /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+ (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
+ m_actualAlpha, m_blocking, info);
+ }
+
+ typedef typename Gemm::Traits Traits;
+
+ protected:
+ const Lhs& m_lhs;
+ const Rhs& m_rhs;
+ Dest& m_dest;
+ Scalar m_actualAlpha;
+ BlockingType& m_blocking;
+};
+
+template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1,
+bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space;
+
+template<typename _LhsScalar, typename _RhsScalar>
+class level3_blocking
+{
+ typedef _LhsScalar LhsScalar;
+ typedef _RhsScalar RhsScalar;
+
+ protected:
+ LhsScalar* m_blockA;
+ RhsScalar* m_blockB;
+
+ DenseIndex m_mc;
+ DenseIndex m_nc;
+ DenseIndex m_kc;
+
+ public:
+
+ level3_blocking()
+ : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
+ {}
+
+ inline DenseIndex mc() const { return m_mc; }
+ inline DenseIndex nc() const { return m_nc; }
+ inline DenseIndex kc() const { return m_kc; }
+
+ inline LhsScalar* blockA() { return m_blockA; }
+ inline RhsScalar* blockB() { return m_blockB; }
+};
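+
+// Two specializations of gemm_blocking_space follow: when all maximal dimensions are known at
+// compile time the blocks live in fixed-size, suitably aligned member arrays; otherwise the
+// block sizes are computed at runtime (computeProductBlockingSizes) and the buffers are
+// heap-allocated on demand by allocateA()/allocateB().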
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+ : public level3_blocking<
+ typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+ typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+ enum {
+ Transpose = StorageOrder==RowMajor,
+ ActualRows = Transpose ? MaxCols : MaxRows,
+ ActualCols = Transpose ? MaxRows : MaxCols
+ };
+ typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+ typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+ enum {
+ SizeA = ActualRows * MaxDepth,
+ SizeB = ActualCols * MaxDepth
+ };
+
+ EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
+ EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];
+
+ public:
+
+ gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
+ {
+ this->m_mc = ActualRows;
+ this->m_nc = ActualCols;
+ this->m_kc = MaxDepth;
+ this->m_blockA = m_staticA;
+ this->m_blockB = m_staticB;
+ }
+
+ inline void allocateA() {}
+ inline void allocateB() {}
+ inline void allocateAll() {}
+};
+
+template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false>
+ : public level3_blocking<
+ typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
+ typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
+{
+ enum {
+ Transpose = StorageOrder==RowMajor
+ };
+ typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
+ typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+ DenseIndex m_sizeA;
+ DenseIndex m_sizeB;
+
+ public:
+
+ gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, DenseIndex num_threads, bool l3_blocking)
+ {
+ this->m_mc = Transpose ? cols : rows;
+ this->m_nc = Transpose ? rows : cols;
+ this->m_kc = depth;
+
+ if(l3_blocking)
+ {
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+ }
+ else // no l3 blocking
+ {
+ DenseIndex m = this->m_mc;
+ DenseIndex n = this->m_nc;
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
+ }
+
+ m_sizeA = this->m_mc * this->m_kc;
+ m_sizeB = this->m_kc * this->m_nc;
+ }
+
+ void allocateA()
+ {
+ if(this->m_blockA==0)
+ this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
+ }
+
+ void allocateB()
+ {
+ if(this->m_blockB==0)
+ this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
+ }
+
+ void allocateAll()
+ {
+ allocateA();
+ allocateB();
+ }
+
+ ~gemm_blocking_space()
+ {
+ aligned_delete(this->m_blockA, m_sizeA);
+ aligned_delete(this->m_blockB, m_sizeB);
+ }
+};
+
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class GeneralProduct<Lhs, Rhs, GemmProduct>
+ : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
+{
+ enum {
+ MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+ };
+ public:
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
+
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+ typedef Scalar ResScalar;
+
+ GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+ {
+ typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
+ EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
+ }
+
+ template<typename Dest>
+ inline void evalTo(Dest& dst) const
+ {
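+      // For very small products (sum of the inner and destination dimensions below 20), the
+      // coefficient-based lazy product is used instead of the blocked GEMM path, whose setup
+      // cost would dominate.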
+ if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+ dst.noalias() = m_lhs .lazyProduct( m_rhs );
+ else
+ {
+ dst.setZero();
+ scaleAndAddTo(dst,Scalar(1));
+ }
+ }
+
+ template<typename Dest>
+ inline void addTo(Dest& dst) const
+ {
+ if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+ dst.noalias() += m_lhs .lazyProduct( m_rhs );
+ else
+ scaleAndAddTo(dst,Scalar(1));
+ }
+
+ template<typename Dest>
+ inline void subTo(Dest& dst) const
+ {
+ if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+ dst.noalias() -= m_lhs .lazyProduct( m_rhs );
+ else
+ scaleAndAddTo(dst,Scalar(-1));
+ }
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+ * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+ typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+ Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+
+ typedef internal::gemm_functor<
+ Scalar, Index,
+ internal::general_matrix_matrix_product<
+ Index,
+ LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+ RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+ (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
+ _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
+
+ BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+
+ internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
new file mode 100644
index 0000000000..e4c10e88d1
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -0,0 +1,285 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update;
+
+namespace internal {
+
+/**********************************************************************
+* This file implements a general A * B product while
+* evaluating only one triangular part of the product.
+* This is a more general version of the self-adjoint product (C += A A^T),
+* cf. the level 3 SYRK BLAS routine.
+**********************************************************************/
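+
+// Typical user-level expressions that route into this machinery (illustrative only):
+//   MatrixXd A(n,k), B(k,n), C(n,n);
+//   C.triangularView<Lower>() += alpha * A * B;        // only the lower triangle is updated
+//   C.selfadjointView<Lower>().rankUpdate(A, alpha);   // SYRK-like: C += alpha * A * A^T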
+
+// forward declarations (defined at the end of this file)
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+struct tribb_kernel;
+
+/* Optimized matrix-matrix product evaluating only one triangular half */
+template <typename Index,
+ typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder, int UpLo, int Version = Specialized>
+struct general_matrix_matrix_triangular_product;
+
+// as usual if the result is row major => we transpose the product
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
+ const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+ {
+ general_matrix_matrix_triangular_product<Index,
+ RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+ LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+ ColMajor, UpLo==Lower?Upper:Lower>
+ ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+ }
+};
+
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+ typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
+ {
+ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+ typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = depth; // cache block size along the K direction
+ Index mc = size; // cache block size along the M direction
+ Index nc = size; // cache block size along the N direction
+ computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, Index(1));
+ // !!! mc must be a multiple of nr:
+ if(mc > Traits::nr)
+ mc = (mc/Traits::nr)*Traits::nr;
+
+ ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
+
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+ tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
+
+ for(Index k2=0; k2<depth; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+ // note that the actual rhs is the transpose/adjoint of mat
+ pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
+
+ for(Index i2=0; i2<size; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,size)-i2;
+
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        // the selected actual_mc * size panel of res is split into three different parts:
+ // 1 - before the diagonal => processed with gebp or skipped
+ // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
+ // 3 - after the diagonal => processed with gebp or skipped
+ if (UpLo==Lower)
+ gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
+ (std::min)(size,i2), alpha, -1, -1, 0, 0);
+
+
+ sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+
+ if (UpLo==Upper)
+ {
+ Index j2 = i2+actual_mc;
+ gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
+ actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+};
+
+// Optimized packed Block * packed Block product kernel evaluating only one given triangular part
+// This kernel is built on top of the gebp kernel:
+// - the current destination block is processed per panel of actual_mc x BlockSize
+// where BlockSize is set to the minimal value allowing gebp to be as fast as possible
+// - then, as usual, each panel is split into three parts along the diagonal,
+// the sub blocks above and below the diagonal are processed as usual,
+// while the triangular block overlapping the diagonal is evaluated into a
+// small temporary buffer which is then accumulated into the result using a
+// triangular traversal.
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+struct tribb_kernel
+{
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
+ typedef typename Traits::ResScalar ResScalar;
+
+ enum {
+ BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
+ };
+ void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+ {
+ typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
+ ResMapper res(_res, resStride);
+ gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+
+ Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
+
+ // let's process the block per panel of actual_mc x BlockSize,
+ // again, each is split into three parts, etc.
+ for (Index j=0; j<size; j+=BlockSize)
+ {
+ Index actualBlockSize = std::min<Index>(BlockSize,size - j);
+ const RhsScalar* actual_b = blockB+j*depth;
+
+ if(UpLo==Upper)
+ gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+ -1, -1, 0, 0);
+
+ // selfadjoint micro block
+ {
+ Index i = j;
+ buffer.setZero();
+ // 1 - apply the kernel on the temporary buffer
+ gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+ -1, -1, 0, 0);
+ // 2 - triangular accumulation
+ for(Index j1=0; j1<actualBlockSize; ++j1)
+ {
+ ResScalar* r = &res(i, j + j1);
+ for(Index i1=UpLo==Lower ? j1 : 0;
+ UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
+ r[i1] += buffer(i1,j1);
+ }
+ }
+
+ if(UpLo==Lower)
+ {
+ Index i = j+actualBlockSize;
+ gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
+ depth, actualBlockSize, alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+};
+
+} // end namespace internal
+
+// high level API
+
+template<typename MatrixType, typename ProductType, int UpLo, bool IsOuterProduct>
+struct general_product_to_triangular_selector;
+
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
+{
+ static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+
+ typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+ typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+ typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+
+ typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+ typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+ typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+ enum {
+ StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+ UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
+ UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1
+ };
+
+ internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs;
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(),
+ (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
+ if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
+
+ internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs;
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(),
+ (UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
+ if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+
+
+ selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+ LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>
+ ::run(actualLhs.size(), mat.data(), mat.outerStride(), actualLhsPtr, actualRhsPtr, actualAlpha);
+ }
+};
+
+template<typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
+{
+ static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Index Index;
+
+ typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+ typedef internal::blas_traits<Lhs> LhsBlasTraits;
+ typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
+ typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
+ typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+
+ typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+ typedef internal::blas_traits<Rhs> RhsBlasTraits;
+ typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
+ typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
+ typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+ internal::general_matrix_matrix_triangular_product<Index,
+ typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+ typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+ MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
+ ::run(mat.cols(), actualLhs.cols(),
+ &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
+ mat.data(), mat.outerStride(), actualAlpha);
+ }
+};
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename ProductDerived, typename _Lhs, typename _Rhs>
+TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
+{
+ eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
+
+ general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
+
+ return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
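
The assignProduct path above is the entry point for rank updates that only touch one triangular half of the destination. A minimal usage sketch follows (assuming the standard public Eigen API; the matrix sizes and variable names are illustrative only):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd C = Eigen::MatrixXd::Zero(4, 4);   // self-adjoint result, lower half used
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 6);

      // Rank-k update C(lower) += 2 * A * A^T. Expressions of this kind are the
      // ones served by general_matrix_matrix_triangular_product: only the
      // requested (here: lower) triangular part of the product is evaluated.
      C.selfadjointView<Eigen::Lower>().rankUpdate(A, 2.0);

      std::cout << C << std::endl;
      return 0;
    }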
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
new file mode 100644
index 0000000000..3deed068e3
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
@@ -0,0 +1,146 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Level 3 BLAS SYRK/HERK implementation.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+struct general_matrix_matrix_rankupdate :
+ general_matrix_matrix_triangular_product<
+ Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
+struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
+ Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
+ const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
+ { \
+ if (lhs==rhs) { \
+ general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
+ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+ } else { \
+ general_matrix_matrix_triangular_product<Index, \
+ Scalar, LhsStorageOrder, ConjugateLhs, \
+ Scalar, RhsStorageOrder, ConjugateRhs, \
+ ColMajor, UpLo, BuiltIn> \
+ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+ } \
+ } \
+};
+
+EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
+//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
+EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
+//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
+
+// SYRK for float/double
+#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+ enum { \
+ IsLower = (UpLo&Lower) == Lower, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
+ }; \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+ { \
+ /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
+\
+ MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
+ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
+ MKLTYPE alpha_, beta_; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+ MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
+ } \
+};
+
+// HERK for complex data
+#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
+template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
+struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
+ enum { \
+ IsLower = (UpLo&Lower) == Lower, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
+ }; \
+ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+ { \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
+\
+ MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
+ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
+ RTYPE alpha_, beta_; \
+ const EIGTYPE* a_ptr; \
+\
+/* Set alpha_ & beta_ */ \
+/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
+/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
+ alpha_ = alpha.real(); \
+ beta_ = 1.0; \
+/* Copy with conjugation in some cases*/ \
+ MatrixType a; \
+ if (conjA) { \
+ Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
+ a = mapA.conjugate(); \
+ lda = a.outerStride(); \
+ a_ptr = a.data(); \
+ } else a_ptr=lhs; \
+ MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
+ } \
+};
+
+
+EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk)
+
+//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
+//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
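
When the two operands alias (lhs==rhs), the specialization above forwards to ?syrk with beta fixed to 1 so the update accumulates into res. A plain-loop sketch of the semantics of the dsyrk call emitted for uplo='L', trans='N' (a reference for clarity, not the code path actually taken):

    // C := alpha*A*A^T + beta*C, lower triangle only; column-major storage with
    // leading dimensions lda and ldc.
    void syrk_lower_notrans_ref(int n, int k, double alpha, const double* A, int lda,
                                double beta, double* C, int ldc) {
      for (int j = 0; j < n; ++j)
        for (int i = j; i < n; ++i) {          // only i >= j is written
          double acc = 0.0;
          for (int p = 0; p < k; ++p)
            acc += A[i + p * lda] * A[j + p * lda];
          C[i + j * ldc] = beta * C[i + j * ldc] + alpha * acc;
        }
    }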
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
new file mode 100644
index 0000000000..060af328eb
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
@@ -0,0 +1,118 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * General matrix-matrix product functionality based on ?GEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-matrix multiplication using BLAS
+* gemm function via partial specialization of
+* general_matrix_matrix_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemm specialization
+
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
+template< \
+ typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+static void run(Index rows, Index cols, Index depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha, \
+ level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
+ GemmParallelInfo<Index>* /*info = 0*/) \
+{ \
+ using std::conj; \
+\
+ char transa, transb; \
+ MKL_INT m, n, k, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX a_tmp, b_tmp; \
+ EIGTYPE myone(1);\
+\
+/* Set transpose options */ \
+ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+ transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+ k = (MKL_INT)depth; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)lhsStride; \
+ ldb = (MKL_INT)rhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
+ a_tmp = lhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else a = _lhs; \
+\
+ if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.conjugate(); \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } else b = _rhs; \
+\
+ MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+}};
+
+GEMM_SPECIALIZATION(double, d, double, d)
+GEMM_SPECIALIZATION(float, f, float, s)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
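
These specializations only come into play when the MKL bindings are compiled in. A minimal sketch (assuming MKL headers and libraries are available at build time and the documented EIGEN_USE_MKL_ALL switch is used):

    // Define before including Eigen so the GEMM_SPECIALIZATION above replaces the
    // built-in kernels with calls to ?gemm.
    #define EIGEN_USE_MKL_ALL
    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 256);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(256, 384);
      Eigen::MatrixXd C = A * B;   // dispatched to dgemm through the binding above
      return C.size() == 512 * 384 ? 0 : 1;
    }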
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h
new file mode 100644
index 0000000000..cb67d5d0a9
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -0,0 +1,618 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes 4 columns at once, which allows us both to reduce
+ * the number of loads/stores of the result by a factor of 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic: C += alpha * A * B
+ * | A | B |alpha| comments
+ * |real |cplx |cplx | no vectorization
+ * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
+ * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
+ * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * Accesses to the matrix coefficients follow the following logic:
+ *
+ * - if all columns have the same alignment then
+ * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
+ * - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise
+ * - if even columns have the same alignment then
+ * // odd columns are guaranteed to have the same alignment too
+ * - if even or odd columns have the same alignment as the result, then
+ *      // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
+ * - perform half aligned and half unaligned loads (-> EvenAligned case)
+ * - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
+ *    - one out of 4 consecutive columns is guaranteed to be aligned with the result vector,
+ *      perform simple aligned loads for this column and aligned loads plus re-alignment for the others. (-> FirstAligned case)
+ * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
+ * - otherwise,
+ * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
+ * // we currently fall back to the NoneAligned case
+ *
+ * The same reasoning applies to the transposed case.
+ *
+ * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
+ * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
+ * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8-byte boundary are not too slow
+ * compared to unaligned loads on a 4-byte boundary.
+ *
+ */
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+enum {
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
+ && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+};
+
+typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+EIGEN_DONT_INLINE static void run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ RhsScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ RhsScalar alpha)
+{
+ EIGEN_UNUSED_VARIABLE(resIncr);
+ eigen_internal_assert(resIncr==1);
+ #ifdef _EIGEN_ACCUMULATE_PACKETS
+ #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+ #endif
+ #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
+ pstore(&res[j], \
+ padd(pload<ResPacket>(&res[j]), \
+ padd( \
+ padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
+ pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
+ padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
+ pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
+
+ typedef typename LhsMapper::VectorMapper LhsScalars;
+
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+ if(ConjugateRhs)
+ alpha = numext::conj(alpha);
+
+ enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
+ const Index columnsAtOnce = 4;
+ const Index peels = 2;
+ const Index LhsPacketAlignedMask = LhsPacketSize-1;
+ const Index ResPacketAlignedMask = ResPacketSize-1;
+// const Index PeelAlignedMask = ResPacketSize*peels-1;
+ const Index size = rows;
+
+ const Index lhsStride = lhs.stride();
+
+ // How many coeffs of the result do we have to skip to be aligned.
+ // Here we assume data are at least aligned on the base scalar type.
+ Index alignedStart = internal::first_aligned(res,size);
+ Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
+ const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+
+ const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
+ Index alignmentPattern = alignmentStep==0 ? AllAligned
+ : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+ : FirstAligned;
+
+ // we cannot assume the first element is aligned because of sub-matrices
+ const Index lhsAlignmentOffset = lhs.firstAligned(size);
+
+  // find how many columns we have to skip to be aligned with the result (if possible)
+ Index skipColumns = 0;
+ // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+ if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
+ {
+ alignedSize = 0;
+ alignedStart = 0;
+ alignmentPattern = NoneAligned;
+ }
+ else if(LhsPacketSize > 4)
+ {
+ // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+ // Currently, it seems to be better to perform unaligned loads anyway
+ alignmentPattern = NoneAligned;
+ }
+ else if (LhsPacketSize>1)
+ {
+ // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
+
+ while (skipColumns<LhsPacketSize &&
+ alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
+ ++skipColumns;
+ if (skipColumns==LhsPacketSize)
+ {
+ // nothing can be aligned, no need to skip any column
+ alignmentPattern = NoneAligned;
+ skipColumns = 0;
+ }
+ else
+ {
+ skipColumns = (std::min)(skipColumns,cols);
+      // note that the skipped columns are processed later.
+ }
+
+ /* eigen_internal_assert( (alignmentPattern==NoneAligned)
+ || (skipColumns + columnsAtOnce >= cols)
+ || LhsPacketSize > size
+ || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
+ }
+ else if(Vectorizable)
+ {
+ alignedStart = 0;
+ alignedSize = size;
+ alignmentPattern = AllAligned;
+ }
+
+ const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+ const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+
+ Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
+ for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
+ {
+ RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
+ ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
+ ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
+ ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
+
+    // this helps the compiler a lot in generating better binary code
+ const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
+ lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
+
+ if (Vectorizable)
+ {
+ /* explicit vectorization */
+ // process initial unaligned coeffs
+ for (Index j=0; j<alignedStart; ++j)
+ {
+ res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+ res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+ res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+ res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+ }
+
+ if (alignedSize>alignedStart)
+ {
+ switch(alignmentPattern)
+ {
+ case AllAligned:
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
+ break;
+ case EvenAligned:
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
+ break;
+ case FirstAligned:
+ {
+ Index j = alignedStart;
+ if(peels>1)
+ {
+ LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
+ ResPacket T0, T1;
+
+ A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+ A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+ A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
+
+ for (; j<peeledSize; j+=peels*ResPacketSize)
+ {
+ A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
+ A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
+ A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
+
+ A00 = lhs0.template load<LhsPacket, Aligned>(j);
+ A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
+ T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
+ T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
+
+ T0 = pcj.pmadd(A01, ptmp1, T0);
+ A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
+ T0 = pcj.pmadd(A02, ptmp2, T0);
+ A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
+ T0 = pcj.pmadd(A03, ptmp3, T0);
+ pstore(&res[j],T0);
+ A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
+ T1 = pcj.pmadd(A11, ptmp1, T1);
+ T1 = pcj.pmadd(A12, ptmp2, T1);
+ T1 = pcj.pmadd(A13, ptmp3, T1);
+ pstore(&res[j+ResPacketSize],T1);
+ }
+ }
+ for (; j<alignedSize; j+=ResPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
+ break;
+ }
+ default:
+ for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
+ break;
+ }
+ }
+ } // end explicit vectorization
+
+ /* process remaining coeffs (or all if there is no explicit vectorization) */
+ for (Index j=alignedSize; j<size; ++j)
+ {
+ res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+ res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+ res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+ res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+ }
+ }
+
+ // process remaining first and last columns (at most columnsAtOnce-1)
+ Index end = cols;
+ Index start = columnBound;
+ do
+ {
+ for (Index k=start; k<end; ++k)
+ {
+ RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
+ const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
+
+ if (Vectorizable)
+ {
+ /* explicit vectorization */
+ // process first unaligned result's coeffs
+ for (Index j=0; j<alignedStart; ++j)
+ res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
+ // process aligned result's coeffs
+ if (lhs0.template aligned<LhsPacket>(alignedStart))
+ for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+ pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+ else
+ for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+ pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+ }
+
+ // process remaining scalars (or all if no explicit vectorization)
+ for (Index i=alignedSize; i<size; ++i)
+ res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
+ }
+ if (skipColumns)
+ {
+ start = 0;
+ end = skipColumns;
+ skipColumns = 0;
+ }
+ else
+ break;
+ } while(Vectorizable);
+ #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
+/* Optimized row-major matrix * vector product:
+ * This algorithm processes 4 rows at once, which allows us both to reduce
+ * the number of loads/stores of the result by a factor of 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ *
+ * Mixing type logic:
+ * - alpha is always a complex (or converted to a complex)
+ * - no vectorization
+ */
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
+{
+typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+enum {
+ Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
+ && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
+ LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+ RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+};
+
+typedef typename packet_traits<LhsScalar>::type _LhsPacket;
+typedef typename packet_traits<RhsScalar>::type _RhsPacket;
+typedef typename packet_traits<ResScalar>::type _ResPacket;
+
+typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+EIGEN_DONT_INLINE static void run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ ResScalar alpha);
+};
+
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+ Index rows, Index cols,
+ const LhsMapper& lhs,
+ const RhsMapper& rhs,
+ ResScalar* res, Index resIncr,
+ ResScalar alpha)
+{
+ eigen_internal_assert(rhs.stride()==1);
+
+ #ifdef _EIGEN_ACCUMULATE_PACKETS
+ #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+ #endif
+
+ #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
+ RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
+ ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
+ ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
+ ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
+
+ conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+ conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+
+ typedef typename LhsMapper::VectorMapper LhsScalars;
+
+ enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
+ const Index rowsAtOnce = 4;
+ const Index peels = 2;
+ const Index RhsPacketAlignedMask = RhsPacketSize-1;
+ const Index LhsPacketAlignedMask = LhsPacketSize-1;
+ const Index depth = cols;
+ const Index lhsStride = lhs.stride();
+
+ // How many coeffs of the result do we have to skip to be aligned.
+ // Here we assume data are at least aligned on the base scalar type
+ // if that's not the case then vectorization is discarded, see below.
+ Index alignedStart = rhs.firstAligned(depth);
+ Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
+ const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+
+ const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
+ Index alignmentPattern = alignmentStep==0 ? AllAligned
+ : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+ : FirstAligned;
+
+ // we cannot assume the first element is aligned because of sub-matrices
+ const Index lhsAlignmentOffset = lhs.firstAligned(depth);
+ const Index rhsAlignmentOffset = rhs.firstAligned(rows);
+
+  // find how many rows we have to skip to be aligned with rhs (if possible)
+ Index skipRows = 0;
+ // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+ if( (sizeof(LhsScalar)!=sizeof(RhsScalar))
+ || (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth)
+ || (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows))
+ {
+ alignedSize = 0;
+ alignedStart = 0;
+ alignmentPattern = NoneAligned;
+ }
+ else if(LhsPacketSize > 4)
+ {
+ // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+ alignmentPattern = NoneAligned;
+ }
+ else if (LhsPacketSize>1)
+ {
+ // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
+
+ while (skipRows<LhsPacketSize &&
+ alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
+ ++skipRows;
+ if (skipRows==LhsPacketSize)
+ {
+        // nothing can be aligned, no need to skip any row
+ alignmentPattern = NoneAligned;
+ skipRows = 0;
+ }
+ else
+ {
+ skipRows = (std::min)(skipRows,Index(rows));
+        // note that the skipped rows are processed later.
+ }
+ /* eigen_internal_assert( alignmentPattern==NoneAligned
+ || LhsPacketSize==1
+ || (skipRows + rowsAtOnce >= rows)
+ || LhsPacketSize > depth
+ || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
+ }
+ else if(Vectorizable)
+ {
+ alignedStart = 0;
+ alignedSize = depth;
+ alignmentPattern = AllAligned;
+ }
+
+ const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+ const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+
+ Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+ for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
+ {
+ EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
+ ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
+
+    // this helps the compiler generate good binary code
+ const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
+ lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
+
+ if (Vectorizable)
+ {
+ /* explicit vectorization */
+ ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
+ ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
+
+ // process initial unaligned coeffs
+      // FIXME this loop gets vectorized by the compiler !
+ for (Index j=0; j<alignedStart; ++j)
+ {
+ RhsScalar b = rhs(j, 0);
+ tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+ tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
+ }
+
+ if (alignedSize>alignedStart)
+ {
+ switch(alignmentPattern)
+ {
+ case AllAligned:
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
+ break;
+ case EvenAligned:
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
+ break;
+ case FirstAligned:
+ {
+ Index j = alignedStart;
+ if (peels>1)
+ {
+            /* Here we process 4 rows with two peeled iterations to hide
+             * the overhead of unaligned loads. Moreover, unaligned loads are handled
+             * using special shift/move operations between the two aligned packets
+             * overlapping the desired unaligned packet. This is *much* more efficient
+ * than basic unaligned loads.
+ */
+ LhsPacket A01, A02, A03, A11, A12, A13;
+ A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+ A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+ A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
+
+ for (; j<peeledSize; j+=peels*RhsPacketSize)
+ {
+ RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
+ A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
+ A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
+ A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
+
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
+ ptmp1 = pcj.pmadd(A01, b, ptmp1);
+ A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
+ ptmp2 = pcj.pmadd(A02, b, ptmp2);
+ A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
+ ptmp3 = pcj.pmadd(A03, b, ptmp3);
+ A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
+
+ b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
+ ptmp1 = pcj.pmadd(A11, b, ptmp1);
+ ptmp2 = pcj.pmadd(A12, b, ptmp2);
+ ptmp3 = pcj.pmadd(A13, b, ptmp3);
+ }
+ }
+ for (; j<alignedSize; j+=RhsPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
+ break;
+ }
+ default:
+ for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
+ break;
+ }
+ tmp0 += predux(ptmp0);
+ tmp1 += predux(ptmp1);
+ tmp2 += predux(ptmp2);
+ tmp3 += predux(ptmp3);
+ }
+ } // end explicit vectorization
+
+ // process remaining coeffs (or all if no explicit vectorization)
+    // FIXME this loop gets vectorized by the compiler !
+ for (Index j=alignedSize; j<depth; ++j)
+ {
+ RhsScalar b = rhs(j, 0);
+ tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+ tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
+ }
+ res[i*resIncr] += alpha*tmp0;
+ res[(i+offset1)*resIncr] += alpha*tmp1;
+ res[(i+2)*resIncr] += alpha*tmp2;
+ res[(i+offset3)*resIncr] += alpha*tmp3;
+ }
+
+  // process remaining first and last rows (at most rowsAtOnce-1)
+ Index end = rows;
+ Index start = rowBound;
+ do
+ {
+ for (Index i=start; i<end; ++i)
+ {
+ EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
+ ResPacket ptmp0 = pset1<ResPacket>(tmp0);
+ const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
+ // process first unaligned result's coeffs
+      // FIXME this loop gets vectorized by the compiler !
+ for (Index j=0; j<alignedStart; ++j)
+ tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
+
+ if (alignedSize>alignedStart)
+ {
+ // process aligned rhs coeffs
+ if (lhs0.template aligned<LhsPacket>(alignedStart))
+ for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
+ else
+ for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+ ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
+ tmp0 += predux(ptmp0);
+ }
+
+ // process remaining scalars
+      // FIXME this loop gets vectorized by the compiler !
+ for (Index j=alignedSize; j<depth; ++j)
+ tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
+ res[i*resIncr] += alpha*tmp0;
+ }
+ if (skipRows)
+ {
+ start = 0;
+ end = skipRows;
+ skipRows = 0;
+ }
+ else
+ break;
+ } while(Vectorizable);
+
+ #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
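
The column-major kernel's "4 columns at once" strategy boils down to accumulating four scaled columns into the result per sweep, so res is loaded and stored only once per group of columns. A scalar-only sketch of that idea (the real kernel adds packet loads, the alignment cases and peeling on top of this):

    // res += alpha * A * x, with A stored column-major (lda >= rows).
    void gemv_colmajor_sketch(int rows, int cols, double alpha,
                              const double* A, int lda,
                              const double* x, double* res) {
      int i = 0;
      for (; i + 4 <= cols; i += 4) {           // 4 columns per sweep: one pass over res
        const double c0 = alpha * x[i + 0], c1 = alpha * x[i + 1],
                     c2 = alpha * x[i + 2], c3 = alpha * x[i + 3];
        for (int j = 0; j < rows; ++j)
          res[j] += c0 * A[j + (i + 0) * lda] + c1 * A[j + (i + 1) * lda]
                  + c2 * A[j + (i + 2) * lda] + c3 * A[j + (i + 3) * lda];
      }
      for (; i < cols; ++i) {                   // leftover columns, one at a time
        const double c = alpha * x[i];
        for (int j = 0; j < rows; ++j)
          res[j] += c * A[j + i * lda];
      }
    }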
diff --git a/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
new file mode 100644
index 0000000000..1cb9fe6b5a
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
@@ -0,0 +1,131 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * General matrix-vector product functionality based on ?GEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements general matrix-vector multiplication using BLAS
+* gemv function via partial specialization of
+* general_matrix_vector_product::run(..) method for float, double,
+* std::complex<float> and std::complex<double> types
+**********************************************************************/
+
+// gemv specialization
+
+template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv :
+ general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+static void run( \
+ Index rows, Index cols, \
+ const Scalar* lhs, Index lhsStride, \
+ const Scalar* rhs, Index rhsIncr, \
+ Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+ if (ConjugateLhs) { \
+ general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
+ rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+ } else { \
+ general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+ rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+ } \
+} \
+}; \
+template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+static void run( \
+ Index rows, Index cols, \
+ const Scalar* lhs, Index lhsStride, \
+ const Scalar* rhs, Index rhsIncr, \
+ Scalar* res, Index resIncr, Scalar alpha) \
+{ \
+ general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
+ rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+} \
+}; \
+
+EIGEN_MKL_GEMV_SPECIALIZE(double)
+EIGEN_MKL_GEMV_SPECIALIZE(float)
+EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_GEMV_SPECIALIZE(scomplex)
+
+#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
+template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
+struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
+\
+static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* lhs, Index lhsStride, \
+ const EIGTYPE* rhs, Index rhsIncr, \
+ EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
+{ \
+ MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
+ MKLTYPE alpha_, beta_; \
+ const EIGTYPE *x_ptr, myone(1); \
+ char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
+ if (LhsStorageOrder==RowMajor) { \
+ m=cols; \
+ n=rows; \
+ }\
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+ GEMVVector x_tmp; \
+ if (ConjugateRhs) { \
+ Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
+ x_tmp=map_x.conjugate(); \
+ x_ptr=x_tmp.data(); \
+ incx=1; \
+ } else x_ptr=rhs; \
+ MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+}\
+};
+
+EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d)
+EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s)
+EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
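
For the RowMajor case, the macro above reuses the column-major ?gemv by viewing the row-major buffer as a transposed column-major matrix: m and n are swapped and trans is set to 'T' (or 'C' when the lhs must be conjugated). A plain-loop sketch of why this yields A*x (reference loops only, not an MKL call):

    #include <cstdio>

    // Reference column-major gemv with trans='T': y := alpha*A^T*x + beta*y,
    // where A is m x n, column-major with leading dimension lda.
    static void gemv_trans_ref(int m, int n, double alpha, const double* A, int lda,
                               const double* x, double beta, double* y) {
      for (int j = 0; j < n; ++j) {
        double acc = 0.0;
        for (int i = 0; i < m; ++i)
          acc += A[i + j * lda] * x[i];
        y[j] = beta * y[j] + alpha * acc;
      }
    }

    int main() {
      // Row-major 2 x 3 matrix R and vector x; we want y = R*x.
      const int rows = 2, cols = 3;
      const double R[rows * cols] = {1, 2, 3,
                                     4, 5, 6};
      const double x[cols] = {1, 10, 100};

      // Reinterpret R's buffer as a column-major cols x rows matrix
      // (m=cols, n=rows, lda=cols) and apply trans='T': this is exactly R*x.
      double y[rows] = {0, 0};
      gemv_trans_ref(cols, rows, 1.0, R, cols, x, 0.0, y);

      std::printf("%g %g\n", y[0], y[1]);   // expect 321 654
      return 0;
    }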
diff --git a/third_party/eigen3/Eigen/src/Core/products/Parallelizer.h b/third_party/eigen3/Eigen/src/Core/products/Parallelizer.h
new file mode 100644
index 0000000000..837e69415b
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/Parallelizer.h
@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARALLELIZER_H
+#define EIGEN_PARALLELIZER_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal */
+inline void manage_multi_threading(Action action, int* v)
+{
+ static EIGEN_UNUSED int m_maxThreads = -1;
+
+ if(action==SetAction)
+ {
+ eigen_internal_assert(v!=0);
+ m_maxThreads = *v;
+ }
+ else if(action==GetAction)
+ {
+ eigen_internal_assert(v!=0);
+ #ifdef EIGEN_HAS_OPENMP
+ if(m_maxThreads>0)
+ *v = m_maxThreads;
+ else
+ *v = omp_get_max_threads();
+ #else
+ *v = 1;
+ #endif
+ }
+ else
+ {
+ eigen_internal_assert(false);
+ }
+}
+
+}
+
+/** Must be called first when calling Eigen from multiple threads */
+inline void initParallel()
+{
+ int nbt;
+ internal::manage_multi_threading(GetAction, &nbt);
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+}
+
+/** \returns the max number of threads reserved for Eigen
+ * \sa setNbThreads */
+inline int nbThreads()
+{
+ int ret;
+ internal::manage_multi_threading(GetAction, &ret);
+ return ret;
+}
+
+/** Sets the max number of threads reserved for Eigen
+ * \sa nbThreads */
+inline void setNbThreads(int v)
+{
+ internal::manage_multi_threading(SetAction, &v);
+}
+
+namespace internal {
+
+template<typename Index> struct GemmParallelInfo
+{
+ GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+
+ int volatile sync;
+ int volatile users;
+
+ Index lhs_start;
+ Index lhs_length;
+};
+
+template<bool Condition, typename Functor, typename Index>
+void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
+{
+ // TODO when EIGEN_USE_BLAS is defined,
+ // we should still enable OMP for other scalar types
+#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
+ // FIXME the transpose variable is only needed to properly split
+ // the matrix product when multithreading is enabled. This is a temporary
+ // fix to support row-major destination matrices. This whole
+  // parallelizer mechanism has to be redesigned anyway.
+ EIGEN_UNUSED_VARIABLE(transpose);
+ func(0,rows, 0,cols);
+#else
+
+ // Dynamically check whether we should enable or disable OpenMP.
+ // The conditions are:
+ // - the max number of threads we can create is greater than 1
+ // - we are not already in a parallel code
+ // - the sizes are large enough
+
+ // 1- are we already in a parallel session?
+  // FIXME omp_get_num_threads()>1 only works with OpenMP; what if the user does not use OpenMP?
+ if((!Condition) || (omp_get_num_threads()>1))
+ return func(0,rows, 0,cols);
+
+ Index size = transpose ? rows : cols;
+
+ // 2- compute the maximal number of threads from the size of the product:
+ // FIXME this has to be fine tuned
+ Index max_threads = std::max<Index>(1,size / 32);
+
+ // 3 - compute the number of threads we are going to use
+ Index threads = std::min<Index>(nbThreads(), max_threads);
+
+ if(threads==1)
+ return func(0,rows, 0,cols);
+
+ Eigen::initParallel();
+ func.initParallelSession();
+
+ if(transpose)
+ std::swap(rows,cols);
+
+ Index blockCols = (cols / threads) & ~Index(0x3);
+ Index blockRows = (rows / threads);
+ blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+
+ GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
+
+ #pragma omp parallel num_threads(threads)
+ {
+ Index i = omp_get_thread_num();
+ Index r0 = i*blockRows;
+ Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+
+ Index c0 = i*blockCols;
+ Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+
+ info[i].lhs_start = r0;
+ info[i].lhs_length = actualBlockRows;
+
+ if(transpose) func(c0, actualBlockCols, 0, rows, info);
+ else func(0, rows, c0, actualBlockCols, info);
+ }
+
+ delete[] info;
+#endif
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PARALLELIZER_H
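
The public entry points defined here are how user code controls the GEMM parallelizer. A small usage sketch (threading only takes effect when Eigen is compiled with OpenMP, e.g. -fopenmp, and EIGEN_USE_BLAS is not defined):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::initParallel();                 // call once before using Eigen from several threads
      Eigen::setNbThreads(4);                // cap the GEMM parallelizer at 4 threads
      std::cout << "threads: " << Eigen::nbThreads() << "\n";

      Eigen::MatrixXd A = Eigen::MatrixXd::Random(1024, 1024);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(1024, 1024);
      Eigen::MatrixXd C = A * B;             // large enough for parallelize_gemm to split it
      std::cout << C.norm() << "\n";
      return 0;
    }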
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
new file mode 100644
index 0000000000..4a60ef7dc5
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -0,0 +1,523 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// pack a selfadjoint block diagonal for use with the gebp_kernel
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs
+{
+ template<int BlockRows> inline
+ void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count)
+ {
+ // normal copy
+ for(Index k=0; k<i; k++)
+ for(Index w=0; w<BlockRows; w++)
+ blockA[count++] = lhs(i+w,k); // normal
+ // symmetric copy
+ Index h = 0;
+ for(Index k=i; k<i+BlockRows; k++)
+ {
+ for(Index w=0; w<h; w++)
+ blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+
+ blockA[count++] = numext::real(lhs(k,k)); // real (diagonal)
+
+ for(Index w=h+1; w<BlockRows; w++)
+ blockA[count++] = lhs(i+w, k); // normal
+ ++h;
+ }
+ // transposed copy
+ for(Index k=i+BlockRows; k<cols; k++)
+ for(Index w=0; w<BlockRows; w++)
+ blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+ }
+ void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+ {
+ enum { PacketSize = packet_traits<Scalar>::size };
+ const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
+ Index count = 0;
+ //Index peeled_mc3 = (rows/Pack1)*Pack1;
+
+ const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+ const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
+ if(Pack1>=3*PacketSize)
+ for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+ pack<3*PacketSize>(blockA, lhs, cols, i, count);
+
+ if(Pack1>=2*PacketSize)
+ for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+ pack<2*PacketSize>(blockA, lhs, cols, i, count);
+
+ if(Pack1>=1*PacketSize)
+ for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+ pack<1*PacketSize>(blockA, lhs, cols, i, count);
+
+ // do the same with mr==1
+ for(Index i=peeled_mc1; i<rows; i++)
+ {
+ for(Index k=0; k<i; k++)
+ blockA[count++] = lhs(i, k); // normal
+
+ blockA[count++] = numext::real(lhs(i, i)); // real (diagonal)
+
+ for(Index k=i+1; k<cols; k++)
+ blockA[count++] = numext::conj(lhs(k, i)); // transposed
+ }
+ }
+};
+
+template<typename Scalar, typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs
+{
+ enum { PacketSize = packet_traits<Scalar>::size };
+ void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+ {
+ Index end_k = k2 + rows;
+ Index count = 0;
+ const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
+ Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+ Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+
+ // first part: normal case
+ for(Index j2=0; j2<k2; j2+=nr)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ if (nr>=4)
+ {
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ }
+ if (nr>=8)
+ {
+ blockB[count+4] = rhs(k,j2+4);
+ blockB[count+5] = rhs(k,j2+5);
+ blockB[count+6] = rhs(k,j2+6);
+ blockB[count+7] = rhs(k,j2+7);
+ }
+ count += nr;
+ }
+ }
+
+ // second part: diagonal block
+ Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+ if(nr>=8)
+ {
+ for(Index j2=k2; j2<end8; j2+=8)
+ {
+ // again we can split vertically into three different parts (transpose, symmetric, normal)
+ // transpose
+ for(Index k=k2; k<j2; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ blockB[count+4] = numext::conj(rhs(j2+4,k));
+ blockB[count+5] = numext::conj(rhs(j2+5,k));
+ blockB[count+6] = numext::conj(rhs(j2+6,k));
+ blockB[count+7] = numext::conj(rhs(j2+7,k));
+ count += 8;
+ }
+ // symmetric
+ Index h = 0;
+ for(Index k=j2; k<j2+8; k++)
+ {
+ // normal
+ for (Index w=0 ; w<h; ++w)
+ blockB[count+w] = rhs(k,j2+w);
+
+ blockB[count+h] = numext::real(rhs(k,k));
+
+ // transpose
+ for (Index w=h+1 ; w<8; ++w)
+ blockB[count+w] = numext::conj(rhs(j2+w,k));
+ count += 8;
+ ++h;
+ }
+ // normal
+ for(Index k=j2+8; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ blockB[count+4] = rhs(k,j2+4);
+ blockB[count+5] = rhs(k,j2+5);
+ blockB[count+6] = rhs(k,j2+6);
+ blockB[count+7] = rhs(k,j2+7);
+ count += 8;
+ }
+ }
+ }
+ if(nr>=4)
+ {
+ for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
+ {
+ // again we can split vertically into three different parts (transpose, symmetric, normal)
+ // transpose
+ for(Index k=k2; k<j2; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ count += 4;
+ }
+ // symmetric
+ Index h = 0;
+ for(Index k=j2; k<j2+4; k++)
+ {
+ // normal
+ for (Index w=0 ; w<h; ++w)
+ blockB[count+w] = rhs(k,j2+w);
+
+ blockB[count+h] = numext::real(rhs(k,k));
+
+ // transpose
+ for (Index w=h+1 ; w<4; ++w)
+ blockB[count+w] = numext::conj(rhs(j2+w,k));
+ count += 4;
+ ++h;
+ }
+ // normal
+ for(Index k=j2+4; k<end_k; k++)
+ {
+ blockB[count+0] = rhs(k,j2+0);
+ blockB[count+1] = rhs(k,j2+1);
+ blockB[count+2] = rhs(k,j2+2);
+ blockB[count+3] = rhs(k,j2+3);
+ count += 4;
+ }
+ }
+ }
+
+ // third part: transposed
+ if(nr>=8)
+ {
+ for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ blockB[count+4] = numext::conj(rhs(j2+4,k));
+ blockB[count+5] = numext::conj(rhs(j2+5,k));
+ blockB[count+6] = numext::conj(rhs(j2+6,k));
+ blockB[count+7] = numext::conj(rhs(j2+7,k));
+ count += 8;
+ }
+ }
+ }
+ if(nr>=4)
+ {
+ for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+ {
+ for(Index k=k2; k<end_k; k++)
+ {
+ blockB[count+0] = numext::conj(rhs(j2+0,k));
+ blockB[count+1] = numext::conj(rhs(j2+1,k));
+ blockB[count+2] = numext::conj(rhs(j2+2,k));
+ blockB[count+3] = numext::conj(rhs(j2+3,k));
+ count += 4;
+ }
+ }
+ }
+
+ // copy the remaining columns one at a time (=> the same with nr==1)
+ for(Index j2=packet_cols4; j2<cols; ++j2)
+ {
+ // transpose
+ Index half = (std::min)(end_k,j2);
+ for(Index k=k2; k<half; k++)
+ {
+ blockB[count] = numext::conj(rhs(j2,k));
+ count += 1;
+ }
+
+ if(half==j2 && half<k2+rows)
+ {
+ blockB[count] = numext::real(rhs(j2,j2));
+ count += 1;
+ }
+ else
+ half--;
+
+ // normal
+ for(Index k=half+1; k<k2+rows; k++)
+ {
+ blockB[count] = rhs(k,j2);
+ count += 1;
+ }
+ }
+ }
+};
+
+/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+ int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+ int ResStorageOrder>
+struct product_selfadjoint_matrix;
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+ int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
+{
+
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha)
+ {
+ product_selfadjoint_matrix<Scalar, Index,
+ EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+ RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
+ EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
+ LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
+ ColMajor>
+ ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha);
+ }
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
+{
+
+ static EIGEN_DONT_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha);
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resStride,
+ const Scalar& alpha)
+ {
+ Index size = rows;
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = size; // cache block size along the K direction
+ Index mc = rows; // cache block size along the M direction
+ Index nc = cols; // cache block size along the N direction
+ computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, Index(1));
+ // kc must be smaller than mc
+ kc = (std::min)(kc,mc);
+
+ std::size_t sizeB = kc*cols;
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
+ ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
+ Scalar* blockB = allocatedBlockB;
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+ gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+
+ for(Index k2=0; k2<size; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+ // we have selected one row panel of rhs and one column panel of lhs
+ // pack rhs's panel into a sequential chunk of memory
+ // and expand each coeff to a constant packet for further reuse
+ pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
+
+ // the selected lhs's panel has to be split into three different parts:
+ // 1 - the transposed panel above the diagonal block => transposed packed copy
+ // 2 - the diagonal block => special packed copy
+ // 3 - the panel below the diagonal block => generic packed copy
+ for(Index i2=0; i2<k2; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,k2)-i2;
+ // transposed packed copy
+ pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ // the block diagonal
+ {
+ const Index actual_mc = (std::min)(k2+kc,size)-k2;
+ // symmetric packed copy
+ pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+
+ for(Index i2=k2+kc; i2<size; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,size)-i2;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+ (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ }
+ }
+
+// matrix * selfadjoint product
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
+{
+
+ static EIGEN_DONT_INLINE void run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha);
+};
+
+template <typename Scalar, typename Index,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
+ Index rows, Index cols,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resStride,
+ const Scalar& alpha)
+ {
+ Index size = cols;
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ ResMapper res(_res,resStride);
+
+ Index kc = size; // cache block size along the K direction
+ Index mc = rows; // cache block size along the M direction
+ Index nc = cols; // cache block size along the N direction
+ computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, Index(1));
+ std::size_t sizeB = kc*cols;
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
+ ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
+ Scalar* blockB = allocatedBlockB;
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+
+ for(Index k2=0; k2<size; k2+=kc)
+ {
+ const Index actual_kc = (std::min)(k2+kc,size)-k2;
+
+ pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
+
+ // => GEPP
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+ }
+ }
+ }
+
+} // end namespace internal
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_matrix
+***************************************************************************/
+
+namespace internal {
+template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
+ : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
+ : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+ SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ enum {
+ LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
+ LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
+ RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
+ RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
+ };
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+ * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+ internal::product_selfadjoint_matrix<Scalar, Index,
+ EIGEN_LOGICAL_XOR(LhsIsUpper,
+ internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
+ NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
+ EIGEN_LOGICAL_XOR(RhsIsUpper,
+ internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
+ NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
+ internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
+ ::run(
+ lhs.rows(), rhs.cols(), // sizes
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
+ &dst.coeffRef(0,0), dst.outerStride(), // result info
+ actualAlpha // alpha
+ );
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
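
At the API level, the product_selfadjoint_matrix kernel defined above is what evaluates a dense product with a selfadjoint view on one side. A minimal usage sketch (standard Eigen API; sizes are arbitrary):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 32);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(64, 32);

  // Only the lower triangle of A is referenced; the expression is evaluated
  // by the SYMM-like kernel above rather than by the general GEMM path.
  C.noalias() += A.selfadjointView<Eigen::Lower>() * B;
  return 0;
}
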
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
new file mode 100644
index 0000000000..dfa687fefe
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
@@ -0,0 +1,295 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
+{\
+\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha) \
+ { \
+ char side='L', uplo='L'; \
+ MKL_INT m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX b_tmp; \
+ EIGTYPE myone(1);\
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)lhsStride; \
+ ldb = (MKL_INT)rhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if (LhsStorageOrder==RowMajor) uplo='U'; \
+ a = _lhs; \
+\
+ if (RhsStorageOrder==RowMajor) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.adjoint(); \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } else b = _rhs; \
+\
+ MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+ } \
+};
+
+
+#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
+{\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha) \
+ { \
+ char side='L', uplo='L'; \
+ MKL_INT m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX b_tmp; \
+ Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
+ EIGTYPE myone(1); \
+\
+/* Set transpose options */ \
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)lhsStride; \
+ ldb = (MKL_INT)rhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
+ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
+ a_tmp = lhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else a = _lhs; \
+ if (LhsStorageOrder==RowMajor) uplo='U'; \
+\
+ if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
+ b = _rhs; } \
+ else { \
+ if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.conjugate(); \
+ } else \
+ if (ConjugateRhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.adjoint(); \
+ } else { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
+ b_tmp = rhs.transpose(); \
+ } \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } \
+\
+ MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+ } \
+};
+
+EIGEN_MKL_SYMM_L(double, double, d, d)
+EIGEN_MKL_SYMM_L(float, float, f, s)
+EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
+
+
+/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
+{\
+\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha) \
+ { \
+ char side='R', uplo='L'; \
+ MKL_INT m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX b_tmp; \
+ EIGTYPE myone(1);\
+\
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)rhsStride; \
+ ldb = (MKL_INT)lhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if (RhsStorageOrder==RowMajor) uplo='U'; \
+ a = _rhs; \
+\
+ if (LhsStorageOrder==RowMajor) { \
+      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.adjoint(); \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } else b = _lhs; \
+\
+ MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+\
+ } \
+};
+
+
+#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
+{\
+ static void run( \
+ Index rows, Index cols, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha) \
+ { \
+ char side='R', uplo='L'; \
+ MKL_INT m, n, lda, ldb, ldc; \
+ const EIGTYPE *a, *b; \
+ MKLTYPE alpha_, beta_; \
+ MatrixX##EIGPREFIX b_tmp; \
+ Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
+ EIGTYPE myone(1); \
+\
+/* Set m, n, k */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)cols; \
+\
+/* Set alpha_ & beta_ */ \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+\
+/* Set lda, ldb, ldc */ \
+ lda = (MKL_INT)rhsStride; \
+ ldb = (MKL_INT)lhsStride; \
+ ldc = (MKL_INT)resStride; \
+\
+/* Set a, b, c */ \
+ if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
+ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
+ a_tmp = rhs.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else a = _rhs; \
+ if (RhsStorageOrder==RowMajor) uplo='U'; \
+\
+ if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
+ b = _lhs; } \
+ else { \
+ if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.conjugate(); \
+ } else \
+ if (ConjugateLhs) { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.adjoint(); \
+ } else { \
+ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
+ b_tmp = lhs.transpose(); \
+ } \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+ } \
+\
+ MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+ } \
+};
+
+EIGEN_MKL_SYMM_R(double, double, d, d)
+EIGEN_MKL_SYMM_R(float, float, f, s)
+EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
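
These ?SYMM/?HEMM bindings only replace the generic kernel for float, double, scomplex and dcomplex with a column-major destination; other cases keep the implementation from SelfadjointMatrixMatrix.h. A sketch of how they are enabled, assuming a build that compiles and links against MKL:

// Defining EIGEN_USE_MKL_ALL before including Eigen activates the MKL
// specializations above, so this product is forwarded to ?HEMM (zhemm here).
#define EIGEN_USE_MKL_ALL
#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(128, 128);
  Eigen::MatrixXcd B = Eigen::MatrixXcd::Random(128, 64);
  Eigen::MatrixXcd C = Eigen::MatrixXcd::Zero(128, 64);
  C.noalias() += A.selfadjointView<Eigen::Lower>() * B;
  return 0;
}
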
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h
new file mode 100644
index 0000000000..fdc81205ab
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -0,0 +1,281 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized selfadjoint matrix * vector product:
+ * This algorithm processes 2 columns at once, which allows us both to reduce
+ * the number of loads/stores of the result by a factor of 2 and to reduce
+ * the instruction dependency.
+ */
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized>
+struct selfadjoint_matrix_vector_product;
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+struct selfadjoint_matrix_vector_product
+
+{
+static EIGEN_DONT_INLINE void run(
+ Index size,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsIncr,
+ Scalar* res,
+ Scalar alpha);
+};
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+ Index size,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsIncr,
+ Scalar* res,
+ Scalar alpha)
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+ enum {
+ IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
+ IsLower = UpLo == Lower ? 1 : 0,
+ FirstTriangular = IsRowMajor == IsLower
+ };
+
+ conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0;
+ conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+ conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
+
+ conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0;
+ conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
+
+ Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
+
+ // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
+ // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
+ // this is because we need to extract packets
+ ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);
+ if (rhsIncr!=1)
+ {
+ const Scalar* it = _rhs;
+ for (Index i=0; i<size; ++i, it+=rhsIncr)
+ rhs[i] = *it;
+ }
+
+ Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+ if (FirstTriangular)
+ bound = size - bound;
+
+ for (Index j=FirstTriangular ? bound : 0;
+ j<(FirstTriangular ? size : bound);j+=2)
+ {
+ const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+ const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+
+ Scalar t0 = cjAlpha * rhs[j];
+ Packet ptmp0 = pset1<Packet>(t0);
+ Scalar t1 = cjAlpha * rhs[j+1];
+ Packet ptmp1 = pset1<Packet>(t1);
+
+ Scalar t2(0);
+ Packet ptmp2 = pset1<Packet>(t2);
+ Scalar t3(0);
+ Packet ptmp3 = pset1<Packet>(t3);
+
+ size_t starti = FirstTriangular ? 0 : j+2;
+ size_t endi = FirstTriangular ? j : size;
+ size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
+ size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
+
+ // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
+ res[j] += cjd.pmul(numext::real(A0[j]), t0);
+ res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
+ if(FirstTriangular)
+ {
+ res[j] += cj0.pmul(A1[j], t1);
+ t3 += cj1.pmul(A1[j], rhs[j]);
+ }
+ else
+ {
+ res[j+1] += cj0.pmul(A0[j+1],t0);
+ t2 += cj1.pmul(A0[j+1], rhs[j+1]);
+ }
+
+ for (size_t i=starti; i<alignedStart; ++i)
+ {
+ res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ t3 += cj1.pmul(A1[i], rhs[i]);
+ }
+ // Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed up)
+ // gcc 4.2 does this optimization automatically.
+ const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+ const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
+ const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+ Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+ for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
+ {
+ Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize;
+ Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize;
+ Packet Bi = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
+ Packet Xi = pload <Packet>(resIt);
+
+ Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
+ ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
+ ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
+ pstore(resIt,Xi); resIt += PacketSize;
+ }
+ for (size_t i=alignedEnd; i<endi; i++)
+ {
+ res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ t3 += cj1.pmul(A1[i], rhs[i]);
+ }
+
+ res[j] += alpha * (t2 + predux(ptmp2));
+ res[j+1] += alpha * (t3 + predux(ptmp3));
+ }
+ for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
+ {
+ const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+
+ Scalar t1 = cjAlpha * rhs[j];
+ Scalar t2(0);
+ // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
+ res[j] += cjd.pmul(numext::real(A0[j]), t1);
+ for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
+ {
+ res[i] += cj0.pmul(A0[i], t1);
+ t2 += cj1.pmul(A0[i], rhs[i]);
+ }
+ res[j] += alpha * t2;
+ }
+}
+
+} // end namespace internal
+
+/***************************************************************************
+* Wrapper to product_selfadjoint_vector
+***************************************************************************/
+
+namespace internal {
+template<typename Lhs, int LhsMode, typename Rhs>
+struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
+ : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, int LhsMode, typename Rhs>
+struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
+ : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+ enum {
+ LhsUpLo = LhsMode&(Upper|Lower)
+ };
+
+ SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+ {
+ typedef typename Dest::Scalar ResScalar;
+ typedef typename Base::RhsScalar RhsScalar;
+ typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+
+ eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+ * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+ enum {
+ EvalToDest = (Dest::InnerStrideAtCompileTime==1),
+ UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
+ };
+
+ internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
+ internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+
+ ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+ EvalToDest ? dest.data() : static_dest.data());
+
+ ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(),
+ UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
+
+ if(!EvalToDest)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ int size = dest.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ MappedDest(actualDestPtr, dest.size()) = dest;
+ }
+
+ if(!UseRhs)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ int size = rhs.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+ }
+
+
+ internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
+ (
+ lhs.rows(), // size
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ actualRhsPtr, 1, // rhs info
+ actualDestPtr, // result info
+ actualAlpha // scale factor
+ );
+
+ if(!EvalToDest)
+ dest = MappedDest(actualDestPtr, dest.size());
+ }
+};
+
+namespace internal {
+template<typename Lhs, typename Rhs, int RhsMode>
+struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
+ : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
+{};
+}
+
+template<typename Lhs, typename Rhs, int RhsMode>
+struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
+ : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+
+ enum {
+ RhsUpLo = RhsMode&(Upper|Lower)
+ };
+
+ SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+ {
+ // let's simply transpose the product
+ Transpose<Dest> destT(dest);
+ SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+ Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
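
The selfadjoint matrix * vector kernel above backs expressions of the following form; a minimal usage sketch (standard Eigen API):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 100);
  Eigen::VectorXd x = Eigen::VectorXd::Random(100);
  Eigen::VectorXd y = Eigen::VectorXd::Zero(100);

  // Only the upper triangle of A is referenced; this runs the
  // two-columns-at-a-time SYMV/HEMV-like routine defined above.
  y.noalias() += A.selfadjointView<Eigen::Upper>() * x;
  return 0;
}
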
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
new file mode 100644
index 0000000000..86684b66d9
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
@@ -0,0 +1,114 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Selfadjoint matrix-vector product functionality based on ?SYMV/?HEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements selfadjoint matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// symv/hemv specialization
+
+template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
+struct selfadjoint_matrix_vector_product_symv :
+ selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
+
+#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
+static void run( \
+ Index size, const Scalar* lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \
+ enum {\
+ IsColMajor = StorageOrder==ColMajor \
+ }; \
+ if (IsColMajor == ConjugateLhs) {\
+ selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
+ size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \
+ } else {\
+ selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
+ size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \
+ }\
+ } \
+}; \
+
+EIGEN_MKL_SYMV_SPECIALIZE(double)
+EIGEN_MKL_SYMV_SPECIALIZE(float)
+EIGEN_MKL_SYMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_SYMV_SPECIALIZE(scomplex)
+
+#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \
+template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
+struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
+{ \
+typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
+\
+static void run( \
+Index size, const EIGTYPE* lhs, Index lhsStride, \
+const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \
+{ \
+ enum {\
+ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
+ IsLower = UpLo == Lower ? 1 : 0 \
+ }; \
+ MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \
+ MKLTYPE alpha_, beta_; \
+ const EIGTYPE *x_ptr, myone(1); \
+ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
+ assign_scalar_eig2mkl(alpha_, alpha); \
+ assign_scalar_eig2mkl(beta_, myone); \
+ SYMVVector x_tmp; \
+ if (ConjugateRhs) { \
+ Map<const SYMVVector, 0, InnerStride<> > map_x(_rhs,size,1,InnerStride<>(incx)); \
+ x_tmp=map_x.conjugate(); \
+ x_ptr=x_tmp.data(); \
+ incx=1; \
+ } else x_ptr=_rhs; \
+ MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+}\
+};
+
+EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv)
+EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv)
+EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointProduct.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointProduct.h
new file mode 100644
index 0000000000..6ca4ae6c0f
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointProduct.h
@@ -0,0 +1,123 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINT_PRODUCT_H
+#define EIGEN_SELFADJOINT_PRODUCT_H
+
+/**********************************************************************
+* This file implements a selfadjoint product: C += A A^T, updating only
+* half of the selfadjoint matrix C.
+* It corresponds to the level 3 SYRK and level 2 SYR BLAS routines.
+**********************************************************************/
+
+namespace Eigen {
+
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs>
+{
+ static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+ {
+ internal::conj_if<ConjRhs> cj;
+ typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
+ typedef typename internal::conditional<ConjLhs,typename OtherMap::ConjugateReturnType,const OtherMap&>::type ConjLhsType;
+ for (Index i=0; i<size; ++i)
+ {
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1)))
+ += (alpha * cj(vecY[i])) * ConjLhsType(OtherMap(vecX+(UpLo==Lower ? i : 0),UpLo==Lower ? size-i : (i+1)));
+ }
+ }
+};
+
+template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs>
+{
+ static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
+ {
+ selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,stride,vecY,vecX,alpha);
+ }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime>
+struct selfadjoint_product_selector;
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
+{
+ static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+ typedef internal::blas_traits<OtherType> OtherBlasTraits;
+ typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+ typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+ typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+ Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+ enum {
+ StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+ UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1
+ };
+ internal::gemv_static_vector_if<Scalar,OtherType::SizeAtCompileTime,OtherType::MaxSizeAtCompileTime,!UseOtherDirectly> static_other;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(),
+ (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
+
+ if(!UseOtherDirectly)
+ Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
+
+ selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
+ OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>
+ ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualOtherPtr, actualAlpha);
+ }
+};
+
+template<typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
+{
+ static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
+ {
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+ typedef internal::blas_traits<OtherType> OtherBlasTraits;
+ typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
+ typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
+ typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+
+ Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
+
+ enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+
+ internal::general_matrix_matrix_triangular_product<Index,
+ Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+ MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
+ ::run(mat.cols(), actualOther.cols(),
+ &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
+ mat.data(), mat.outerStride(), actualAlpha);
+ }
+};
+
+// high level API
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU>
+SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
+{
+ selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
+
+ return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINT_PRODUCT_H
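
The rank update entry point defined above is reached through SelfAdjointView::rankUpdate; a minimal usage sketch (standard Eigen API; alpha values arbitrary):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(50, 50);
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(50, 20);
  Eigen::VectorXd u = Eigen::VectorXd::Random(50);

  // Rank-k update C += 0.5 * A * A^T, writing only the lower half (SYRK-like).
  C.selfadjointView<Eigen::Lower>().rankUpdate(A, 0.5);

  // Rank-1 update C += u * u^T on the same half (SYR-like).
  C.selfadjointView<Eigen::Lower>().rankUpdate(u, 1.0);
  return 0;
}
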
diff --git a/third_party/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h b/third_party/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h
new file mode 100644
index 0000000000..8594a97cea
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -0,0 +1,93 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SELFADJOINTRANK2UPDATE_H
+#define EIGEN_SELFADJOINTRANK2UPDATE_H
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu'
+ * It corresponds to the Level2 syr2 BLAS routine
+ */
+
+template<typename Scalar, typename Index, typename UType, typename VType, int UpLo>
+struct selfadjoint_rank2_update_selector;
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
+{
+ static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+ {
+ const Index size = u.size();
+ for (Index i=0; i<size; ++i)
+ {
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+i, size-i) +=
+ (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size-i)
+ + (alpha * numext::conj(v.coeff(i))) * u.tail(size-i);
+ }
+ }
+};
+
+template<typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Upper>
+{
+ static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+ {
+ const Index size = u.size();
+ for (Index i=0; i<size; ++i)
+ Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i, i+1) +=
+ (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i+1)
+ + (alpha * numext::conj(v.coeff(i))) * u.head(i+1);
+ }
+};
+
+template<bool Cond, typename T> struct conj_expr_if
+ : conditional<!Cond, const T&,
+ CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>,T> > {};
+
+} // end namespace internal
+
+template<typename MatrixType, unsigned int UpLo>
+template<typename DerivedU, typename DerivedV>
+SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
+{
+ typedef internal::blas_traits<DerivedU> UBlasTraits;
+ typedef typename UBlasTraits::DirectLinearAccessType ActualUType;
+ typedef typename internal::remove_all<ActualUType>::type _ActualUType;
+ typename internal::add_const_on_value_type<ActualUType>::type actualU = UBlasTraits::extract(u.derived());
+
+ typedef internal::blas_traits<DerivedV> VBlasTraits;
+ typedef typename VBlasTraits::DirectLinearAccessType ActualVType;
+ typedef typename internal::remove_all<ActualVType>::type _ActualVType;
+ typename internal::add_const_on_value_type<ActualVType>::type actualV = VBlasTraits::extract(v.derived());
+
+ // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
+ // vice versa, and take the complex conjugate of all coefficients and vector entries.
+
+ enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+ Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived())
+ * numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
+ if (IsRowMajor)
+ actualAlpha = numext::conj(actualAlpha);
+
+ internal::selfadjoint_rank2_update_selector<Scalar, Index,
+ typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type,
+ typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type,
+ (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
+ ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha);
+
+ return *this;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SELFADJOINTRANK2UPDATE_H
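
The rank-2 update above is exposed through the three-argument rankUpdate overload; a minimal usage sketch (standard Eigen API):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(30, 30);
  Eigen::VectorXd u = Eigen::VectorXd::Random(30);
  Eigen::VectorXd v = Eigen::VectorXd::Random(30);

  // C += 0.25 * u * v^T + conj(0.25) * v * u^T, writing only the upper half
  // (the SYR2-like path implemented above).
  C.selfadjointView<Eigen::Upper>().rankUpdate(u, v, 0.25);
  return 0;
}
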
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h
new file mode 100644
index 0000000000..4cbb79da0c
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -0,0 +1,434 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
+// struct gemm_pack_lhs_triangular
+// {
+// Matrix<Scalar,mr,mr,
+// void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
+// {
+// conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+// const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
+// int count = 0;
+// const int peeled_mc = (rows/mr)*mr;
+// for(int i=0; i<peeled_mc; i+=mr)
+// {
+// for(int k=0; k<depth; k++)
+// for(int w=0; w<mr; w++)
+// blockA[count++] = cj(lhs(i+w, k));
+// }
+// for(int i=peeled_mc; i<rows; i++)
+// {
+// for(int k=0; k<depth; k++)
+// blockA[count++] = cj(lhs(i, k));
+// }
+// }
+// };
+
+/* Optimized triangular matrix * matrix (_TRMM++) product built on top of
+ * the general matrix matrix product.
+ */
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder, int Version = Specialized>
+struct product_triangular_matrix_matrix;
+
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,RowMajor,Version>
+{
+ static EIGEN_STRONG_INLINE void run(
+ Index rows, Index cols, Index depth,
+ const Scalar* lhs, Index lhsStride,
+ const Scalar* rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ product_triangular_matrix_matrix<Scalar, Index,
+ (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
+ (!LhsIsTriangular),
+ RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+ ConjugateRhs,
+ LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
+ ConjugateLhs,
+ ColMajor>
+ ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
+ }
+};
+
+// implements col-major += alpha * op(triangular) * op(general)
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+{
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ SmallPanelWidth = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower,
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+ };
+
+ static EIGEN_DONT_INLINE void run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ // strip zeros
+ Index diagSize = (std::min)(_rows,_depth);
+ Index rows = IsLower ? _rows : diagSize;
+ Index depth = IsLower ? diagSize : _depth;
+ Index cols = _cols;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
+ triangularBuffer.setZero();
+ if((Mode&ZeroDiag)==ZeroDiag)
+ triangularBuffer.diagonal().setZero();
+ else
+ triangularBuffer.diagonal().setOnes();
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+
+ for(Index k2=IsLower ? depth : 0;
+ IsLower ? k2>0 : k2<depth;
+ IsLower ? k2-=kc : k2+=kc)
+ {
+ Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
+ Index actual_k2 = IsLower ? k2-actual_kc : k2;
+
+ // align blocks with the end of the triangular part for trapezoidal lhs
+ if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
+ {
+ actual_kc = rows-k2;
+ k2 = k2+actual_kc-kc;
+ }
+
+ pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
+
+    // the selected lhs panel has to be split into three different parts:
+    //  1 - the part which is zero => skip it
+    //  2 - the diagonal block => special kernel
+    //  3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
+    // (a hedged, unoptimized reference of this decomposition follows this kernel)
+
+ // the block diagonal, if any:
+ if(IsLower || actual_k2<rows)
+ {
+      // for each small vertical panel of lhs
+ for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+ Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
+ Index startBlock = actual_k2+k1;
+ Index blockBOffset = k1;
+
+ // => GEBP with the micro triangular block
+ // The trick is to pack this micro block while filling the opposite triangular part with zeros.
+ // To this end we do an extra triangular copy to a small temporary buffer
+ for (Index k=0;k<actualPanelWidth;++k)
+ {
+ if (SetDiag)
+ triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
+ for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
+ triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
+ }
+ pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
+
+ gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
+ actualPanelWidth, actualPanelWidth, cols, alpha,
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+
+ // GEBP with remaining micro panel
+ if (lengthTarget>0)
+ {
+ Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
+
+ pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
+
+ gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
+ lengthTarget, actualPanelWidth, cols, alpha,
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+ }
+ }
+ }
+ // the part below (lower case) or above (upper case) the diagonal => GEPP
+ {
+ Index start = IsLower ? k2 : 0;
+ Index end = IsLower ? rows : (std::min)(actual_k2,rows);
+ for(Index i2=start; i2<end; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(i2+mc,end)-i2;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+ (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+ gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
+ actual_kc, cols, alpha, -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+
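// Editor's sketch (not part of the patch): a minimal, unoptimized reference of
// the three-way panel split above, assuming a square lower-triangular lhs and
// using only Eigen's public API; the function name and the "panel" parameter
// are hypothetical.
#include <Eigen/Dense>
#include <algorithm>

// res += alpha * tril(A) * B, decomposed per column block of A the way the
// kernel splits each lhs panel.
void trmm_lower_lhs_reference(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B,
                              Eigen::MatrixXd& res, double alpha, int panel)
{
  const int n = static_cast<int>(A.rows());
  for (int k = 0; k < n; k += panel) {
    const int w = std::min(panel, n - k);
    // 2) diagonal micro-block: only its lower-triangular part contributes
    //    (the kernel packs it into the zero-padded triangularBuffer)
    Eigen::MatrixXd diag =
        A.block(k, k, w, w).triangularView<Eigen::Lower>() * B.middleRows(k, w);
    res.middleRows(k, w) += alpha * diag;
    // 3) dense panel below the diagonal block => plain GEPP-like update
    if (k + w < n)
      res.middleRows(k + w, n - k - w).noalias() +=
          alpha * A.block(k + w, k, n - k - w, w) * B.middleRows(k, w);
    // 1) the structurally zero part above the diagonal block is simply skipped
  }
}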
+// implements col-major += alpha * op(general) * op(triangular)
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,Version>
+{
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower,
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+ };
+
+ static EIGEN_DONT_INLINE void run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+};
+
+template <typename Scalar, typename Index, int Mode,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
+ LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
+ Index _rows, Index _cols, Index _depth,
+ const Scalar* _lhs, Index lhsStride,
+ const Scalar* _rhs, Index rhsStride,
+ Scalar* _res, Index resStride,
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
+ {
+ // strip zeros
+ Index diagSize = (std::min)(_cols,_depth);
+ Index rows = _rows;
+ Index depth = IsLower ? _depth : diagSize;
+ Index cols = IsLower ? diagSize : _cols;
+
+ typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+ typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+ LhsMapper lhs(_lhs,lhsStride);
+ RhsMapper rhs(_rhs,rhsStride);
+ ResMapper res(_res, resStride);
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
+ triangularBuffer.setZero();
+ if((Mode&ZeroDiag)==ZeroDiag)
+ triangularBuffer.diagonal().setZero();
+ else
+ triangularBuffer.diagonal().setOnes();
+
+ gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+
+ for(Index k2=IsLower ? 0 : depth;
+ IsLower ? k2<depth : k2>0;
+ IsLower ? k2+=kc : k2-=kc)
+ {
+ Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
+ Index actual_k2 = IsLower ? k2 : k2-actual_kc;
+
+ // align blocks with the end of the triangular part for trapezoidal rhs
+ if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
+ {
+ actual_kc = cols-k2;
+ k2 = actual_k2 + actual_kc - kc;
+ }
+
+ // remaining size
+ Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
+ // size of the triangular part
+ Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
+
+ Scalar* geb = blockB+ts*ts;
+ geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
+
+ pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
+
+      // pack the triangular part of the rhs, padding the unrolled blocks with zeros
+ if(ts>0)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index actual_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+ // general part
+ pack_rhs_panel(blockB+j2*actual_kc,
+ rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
+ panelLength, actualPanelWidth,
+ actual_kc, panelOffset);
+
+ // append the triangular part via a temporary buffer
+ for (Index j=0;j<actualPanelWidth;++j)
+ {
+ if (SetDiag)
+ triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
+ for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
+ triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
+ }
+
+ pack_rhs_panel(blockB+j2*actual_kc,
+ RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
+ actualPanelWidth, actualPanelWidth,
+ actual_kc, j2);
+ }
+ }
+
+ for (Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,rows-i2);
+ pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+ // triangular kernel
+ if(ts>0)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
+ Index blockOffset = IsLower ? j2 : 0;
+
+ gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
+ blockA, blockB+j2*actual_kc,
+ actual_mc, panelLength, actualPanelWidth,
+ alpha,
+ actual_kc, actual_kc, // strides
+ blockOffset, blockOffset);// offsets
+ }
+ }
+ gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
+ blockA, geb, actual_mc, actual_kc, rs,
+ alpha,
+ -1, -1, 0, 0);
+ }
+ }
+ }
+
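// Editor's sketch (not part of the patch): the matching unoptimized reference
// for the rhs-triangular path above, lower case, square triangular A assumed;
// the dense slice loosely corresponds to the packed "geb" panel and the
// triangular slice to the zero-padded packed panels.
#include <Eigen/Dense>
#include <algorithm>

// res += alpha * B * tril(A), decomposed per row block of A.
void trmm_lower_rhs_reference(const Eigen::MatrixXd& B, const Eigen::MatrixXd& A,
                              Eigen::MatrixXd& res, double alpha, int panel)
{
  const int n = static_cast<int>(A.rows());
  for (int k = 0; k < n; k += panel) {
    const int w = std::min(panel, n - k);
    // dense columns to the left of the diagonal block (the "geb" part)
    if (k > 0)
      res.leftCols(k).noalias() += alpha * B.middleCols(k, w) * A.block(k, 0, w, k);
    // triangular diagonal block (packed with zero padding in the kernel)
    Eigen::MatrixXd diag =
        B.middleCols(k, w) * A.block(k, k, w, w).triangularView<Eigen::Lower>();
    res.middleCols(k, w) += alpha * diag;
  }
}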
+/***************************************************************************
+* Wrapper to product_triangular_matrix_matrix
+***************************************************************************/
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
+ : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
+{};
+
+} // end namespace internal
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
+ : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+ TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
+ typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+
+ Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+ * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+ typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+ Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
+
+ enum { IsLower = (Mode&Lower) == Lower };
+ Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols());
+ Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows());
+ Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
+ : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
+
+ BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
+
+ internal::product_triangular_matrix_matrix<Scalar, Index,
+ Mode, LhsIsTriangular,
+ (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+ (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+ (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
+ ::run(
+ stripedRows, stripedCols, stripedDepth, // sizes
+ &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
+ &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
+ &dst.coeffRef(0,0), dst.outerStride(), // result info
+ actualAlpha, blocking
+ );
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
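As a hedged usage sketch (editorial, not part of the patch), the wrapper above is what a user-level triangular product expression ultimately dispatches to; the sizes below are arbitrary.

#include <Eigen/Dense>
using namespace Eigen;

void triangular_product_example()
{
  MatrixXd A = MatrixXd::Random(64, 64);
  MatrixXd B = MatrixXd::Random(64, 32);
  MatrixXd C = MatrixXd::Zero(64, 32);
  // Calls TriangularProduct::scaleAndAddTo, which extracts the scalar factors
  // into alpha and forwards to internal::product_triangular_matrix_matrix::run.
  C.noalias() += A.triangularView<Lower>() * B;
}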
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
new file mode 100644
index 0000000000..ba41a1c99f
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
@@ -0,0 +1,309 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+template <typename Scalar, typename Index,
+ int Mode, bool LhsIsTriangular,
+ int LhsStorageOrder, bool ConjugateLhs,
+ int RhsStorageOrder, bool ConjugateRhs,
+ int ResStorageOrder>
+struct product_triangular_matrix_matrix_trmm :
+ product_triangular_matrix_matrix<Scalar,Index,Mode,
+ LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
+ RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {};
+
+
+// try to go to BLAS specialization
+#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \
+ static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
+ const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
+ product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
+ LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
+ RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+ } \
+};
+
+EIGEN_MKL_TRMM_SPECIALIZE(double, true)
+EIGEN_MKL_TRMM_SPECIALIZE(double, false)
+EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_MKL_TRMM_SPECIALIZE(float, true)
+EIGEN_MKL_TRMM_SPECIALIZE(float, false)
+EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false)
+
+// implements col-major += alpha * op(triangular) * op(general)
+#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
+ LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \
+ }; \
+\
+ static void run( \
+ Index _rows, Index _cols, Index _depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+ { \
+ Index diagSize = (std::min)(_rows,_depth); \
+ Index rows = IsLower ? _rows : diagSize; \
+ Index depth = IsLower ? diagSize : _depth; \
+ Index cols = _cols; \
+\
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit MKL ?TRMM. Fall back to the default triangular product or call MKL ?GEMM */ \
+ if (rows != depth) { \
+\
+ int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+\
+ if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
+      /* Most likely no benefit in calling TRMM or GEMM from MKL */ \
+ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+ /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+ } else { \
+      /* It makes sense to call GEMM */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
+ MKL_INT aStride = aa_tmp.outerStride(); \
+ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
+ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
+\
+ /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+ } \
+ return; \
+ } \
+ char side = 'L', transa, uplo, diag = 'N'; \
+ EIGTYPE *b; \
+ const EIGTYPE *a; \
+ MKL_INT m, n, lda, ldb; \
+ MKLTYPE alpha_; \
+\
+/* Set alpha_*/ \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+\
+/* Set m, n */ \
+ m = (MKL_INT)diagSize; \
+ n = (MKL_INT)cols; \
+\
+/* Set trans */ \
+ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \
+ MatrixX##EIGPREFIX b_tmp; \
+\
+ if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+\
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixLhs a_tmp; \
+\
+ if ((conjA!=0) || (SetDiag==0)) { \
+ if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \
+ if (IsZeroDiag) \
+ a_tmp.diagonal().setZero(); \
+ else if (IsUnitDiag) \
+ a_tmp.diagonal().setOnes();\
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else { \
+ a = _lhs; \
+ lda = lhsStride; \
+ } \
+ /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+ MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+ res_tmp=res_tmp+b_tmp; \
+ } \
+};
+
+EIGEN_MKL_TRMM_L(double, double, d, d)
+EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMM_L(float, float, f, s)
+EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c)
+
+// implements col-major += alpha * op(general) * op(triangular)
+#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template <typename Index, int Mode, \
+ int LhsStorageOrder, bool ConjugateLhs, \
+ int RhsStorageOrder, bool ConjugateRhs> \
+struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
+ LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper, \
+ conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \
+ }; \
+\
+ static void run( \
+ Index _rows, Index _cols, Index _depth, \
+ const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsStride, \
+ EIGTYPE* res, Index resStride, \
+ EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
+ { \
+ Index diagSize = (std::min)(_cols,_depth); \
+ Index rows = _rows; \
+ Index depth = IsLower ? _depth : diagSize; \
+ Index cols = IsLower ? diagSize : _cols; \
+\
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
+\
+/* Non-square case - doesn't fit MKL ?TRMM. Fall back to the default triangular product or call MKL ?GEMM */ \
+ if (cols != depth) { \
+\
+ int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+\
+ if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
+      /* Most likely no benefit in calling TRMM or GEMM from MKL */ \
+ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
+ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
+ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+ /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
+ } else { \
+      /* It makes sense to call GEMM */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+ MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
+ MKL_INT aStride = aa_tmp.outerStride(); \
+ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
+ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
+\
+ /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+ } \
+ return; \
+ } \
+ char side = 'R', transa, uplo, diag = 'N'; \
+ EIGTYPE *b; \
+ const EIGTYPE *a; \
+ MKL_INT m, n, lda, ldb; \
+ MKLTYPE alpha_; \
+\
+/* Set alpha_*/ \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+\
+/* Set m, n */ \
+ m = (MKL_INT)rows; \
+ n = (MKL_INT)diagSize; \
+\
+/* Set trans */ \
+ transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
+\
+/* Set b, ldb */ \
+ Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
+ MatrixX##EIGPREFIX b_tmp; \
+\
+ if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
+ b = b_tmp.data(); \
+ ldb = b_tmp.outerStride(); \
+\
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \
+ MatrixRhs a_tmp; \
+\
+ if ((conjA!=0) || (SetDiag==0)) { \
+ if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \
+ if (IsZeroDiag) \
+ a_tmp.diagonal().setZero(); \
+ else if (IsUnitDiag) \
+ a_tmp.diagonal().setOnes();\
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else { \
+ a = _rhs; \
+ lda = rhsStride; \
+ } \
+ /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \
+/* call ?trmm*/ \
+ MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+\
+/* Add op(a_triangular)*b into res*/ \
+ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
+ res_tmp=res_tmp+b_tmp; \
+ } \
+};
+
+EIGEN_MKL_TRMM_R(double, double, d, d)
+EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMM_R(float, float, f, s)
+EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
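Hedged build note (editorial): these ?TRMM bindings are compiled in only when Eigen's MKL support is enabled before any Eigen header is included, and they assume MKL headers and libraries are available to the build.

// Enables the Specialized product_triangular_matrix_matrix templates above,
// which forward square triangular*general products to the corresponding ?trmm.
#define EIGEN_USE_MKL_ALL
#include <Eigen/Dense>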
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h
new file mode 100644
index 0000000000..9863076958
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -0,0 +1,354 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
+#define EIGEN_TRIANGULARMATRIXVECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder, int Version=Specialized>
+struct triangular_matrix_vector_product;
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ enum {
+ IsLower = ((Mode&Lower)==Lower),
+ HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+ HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+ };
+ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
+ ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+ {
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ Index size = (std::min)(_rows,_cols);
+ Index rows = IsLower ? _rows : (std::min)(_rows,_cols);
+ Index cols = IsLower ? (std::min)(_rows,_cols) : _cols;
+
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+ typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+ typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
+ const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
+ typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+ typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
+ ResMap res(_res,rows);
+
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
+ for (Index pi=0; pi<size; pi+=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = pi + k;
+ Index s = IsLower ? ((HasUnitDiag||HasZeroDiag) ? i+1 : i ) : pi;
+ Index r = IsLower ? actualPanelWidth-k : k+1;
+ if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+ res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r);
+ if (HasUnitDiag)
+ res.coeffRef(i) += alpha * cjRhs.coeff(i);
+ }
+ Index r = IsLower ? rows - pi - actualPanelWidth : pi;
+ if (r>0)
+ {
+ Index s = IsLower ? pi+actualPanelWidth : 0;
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
+ r, actualPanelWidth,
+ LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
+ RhsMapper(&rhs.coeffRef(pi), rhsIncr),
+ &res.coeffRef(s), resIncr, alpha);
+ }
+ }
+ if((!IsLower) && cols>size)
+ {
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
+ rows, cols-size,
+ LhsMapper(&lhs.coeffRef(0,size), lhsStride),
+ RhsMapper(&rhs.coeffRef(size), rhsIncr),
+ _res, resIncr, alpha);
+ }
+ }
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+{
+ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ enum {
+ IsLower = ((Mode&Lower)==Lower),
+ HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
+ HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
+ };
+ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+};
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
+ ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+ {
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ Index diagSize = (std::min)(_rows,_cols);
+ Index rows = IsLower ? _rows : diagSize;
+ Index cols = IsLower ? diagSize : _cols;
+
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+ typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
+
+ typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
+ const RhsMap rhs(_rhs,cols);
+ typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
+
+ typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
+ ResMap res(_res,rows,InnerStride<>(resIncr));
+
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
+ for (Index pi=0; pi<diagSize; pi+=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = pi + k;
+ Index s = IsLower ? pi : ((HasUnitDiag||HasZeroDiag) ? i+1 : i);
+ Index r = IsLower ? k+1 : actualPanelWidth-k;
+ if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
+ res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum();
+ if (HasUnitDiag)
+ res.coeffRef(i) += alpha * cjRhs.coeff(i);
+ }
+ Index r = IsLower ? pi : cols - pi - actualPanelWidth;
+ if (r>0)
+ {
+ Index s = IsLower ? 0 : pi + actualPanelWidth;
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
+ actualPanelWidth, r,
+ LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
+ RhsMapper(&rhs.coeffRef(s), rhsIncr),
+ &res.coeffRef(pi), resIncr, alpha);
+ }
+ }
+ if(IsLower && rows>diagSize)
+ {
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
+ rows-diagSize, cols,
+ LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
+ RhsMapper(&rhs.coeffRef(0), rhsIncr),
+ &res.coeffRef(diagSize), resIncr, alpha);
+ }
+ }
+
+/***************************************************************************
+* Wrapper to product_triangular_vector
+***************************************************************************/
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> >
+ : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> >
+{};
+
+template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> >
+ : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> >
+{};
+
+
+template<int StorageOrder>
+struct trmv_selector;
+
+} // end namespace internal
+
+template<int Mode, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,true,Lhs,false,Rhs,true>
+ : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+ TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+ internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha);
+ }
+};
+
+template<int Mode, typename Lhs, typename Rhs>
+struct TriangularProduct<Mode,false,Lhs,true,Rhs,false>
+ : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs >
+{
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+
+ TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+ {
+ eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+
+ typedef TriangularProduct<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose;
+ Transpose<Dest> dstT(dst);
+ internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run(
+ TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha);
+ }
+};
+
+namespace internal {
+
+// TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
+
+template<> struct trmv_selector<ColMajor>
+{
+ template<int Mode, typename Lhs, typename Rhs, typename Dest>
+ static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+ {
+ typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
+ typedef typename ProductType::Index Index;
+ typedef typename ProductType::LhsScalar LhsScalar;
+ typedef typename ProductType::RhsScalar RhsScalar;
+ typedef typename ProductType::Scalar ResScalar;
+ typedef typename ProductType::RealScalar RealScalar;
+ typedef typename ProductType::ActualLhsType ActualLhsType;
+ typedef typename ProductType::ActualRhsType ActualRhsType;
+ typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+ typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+ typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+
+ typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+ typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+ * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+ enum {
+      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1;
+      // on the other hand, it is good for the cache to pack the vector anyway...
+ EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+ ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
+ MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+ };
+
+ gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
+
+ bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+ bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+ RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
+
+ ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+ evalToDest ? dest.data() : static_dest.data());
+
+ if(!evalToDest)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ Index size = dest.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ if(!alphaIsCompatible)
+ {
+ MappedDest(actualDestPtr, dest.size()).setZero();
+ compatibleAlpha = RhsScalar(1);
+ }
+ else
+ MappedDest(actualDestPtr, dest.size()) = dest;
+ }
+
+ internal::triangular_matrix_vector_product
+ <Index,Mode,
+ LhsScalar, LhsBlasTraits::NeedToConjugate,
+ RhsScalar, RhsBlasTraits::NeedToConjugate,
+ ColMajor>
+ ::run(actualLhs.rows(),actualLhs.cols(),
+ actualLhs.data(),actualLhs.outerStride(),
+ actualRhs.data(),actualRhs.innerStride(),
+ actualDestPtr,1,compatibleAlpha);
+
+ if (!evalToDest)
+ {
+ if(!alphaIsCompatible)
+ dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
+ else
+ dest = MappedDest(actualDestPtr, dest.size());
+ }
+ }
+};
+
+template<> struct trmv_selector<RowMajor>
+{
+ template<int Mode, typename Lhs, typename Rhs, typename Dest>
+ static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+ {
+ typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
+ typedef typename ProductType::LhsScalar LhsScalar;
+ typedef typename ProductType::RhsScalar RhsScalar;
+ typedef typename ProductType::Scalar ResScalar;
+ typedef typename ProductType::Index Index;
+ typedef typename ProductType::ActualLhsType ActualLhsType;
+ typedef typename ProductType::ActualRhsType ActualRhsType;
+ typedef typename ProductType::_ActualRhsType _ActualRhsType;
+ typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
+ typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+
+ typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
+ typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+ ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
+ * RhsBlasTraits::extractScalarFactor(prod.rhs());
+
+ enum {
+ DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+ };
+
+ gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+
+ ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
+ DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
+
+ if(!DirectlyUseRhs)
+ {
+ #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ int size = actualRhs.size();
+ EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+ #endif
+ Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+ }
+
+ internal::triangular_matrix_vector_product
+ <Index,Mode,
+ LhsScalar, LhsBlasTraits::NeedToConjugate,
+ RhsScalar, RhsBlasTraits::NeedToConjugate,
+ RowMajor>
+ ::run(actualLhs.rows(),actualLhs.cols(),
+ actualLhs.data(),actualLhs.outerStride(),
+ actualRhsPtr,1,
+ dest.data(),dest.innerStride(),
+ actualAlpha);
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULARMATRIXVECTOR_H
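A hedged usage sketch (editorial, not part of the patch): both wrapper specializations above are reached through expressions like the following; the right-triangular case is rewritten via the transpose trick in its scaleAndAddTo.

#include <Eigen/Dense>
using namespace Eigen;

void triangular_matvec_example()
{
  MatrixXd A = MatrixXd::Random(128, 128);
  VectorXd x = VectorXd::Random(128);
  VectorXd y = VectorXd::Zero(128);
  // Triangular matrix * vector: routed through trmv_selector into
  // internal::triangular_matrix_vector_product.
  y.noalias() += A.triangularView<Upper>() * x;
  // Row vector * triangular matrix: handled by the second specialization,
  // which transposes the product and reuses the column-major path.
  RowVectorXd z = x.transpose() * A.triangularView<Upper>();
}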
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
new file mode 100644
index 0000000000..09f110da71
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
@@ -0,0 +1,247 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ * Triangular matrix-vector product functionality based on ?TRMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+* This file implements triangular matrix-vector multiplication using BLAS
+**********************************************************************/
+
+// trmv/hemv specialization
+
+template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder>
+struct triangular_matrix_vector_product_trmv :
+ triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
+
+#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+ triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ } \
+}; \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
+ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
+ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+ triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ } \
+};
+
+EIGEN_MKL_TRMV_SPECIALIZE(double)
+EIGEN_MKL_TRMV_SPECIALIZE(float)
+EIGEN_MKL_TRMV_SPECIALIZE(dcomplex)
+EIGEN_MKL_TRMV_SPECIALIZE(scomplex)
+
+// implements col-major: res += alpha * op(triangular) * vector
+#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper \
+ }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+ if (ConjLhs || IsZeroDiag) { \
+ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ return; \
+ }\
+ Index size = (std::min)(_rows,_cols); \
+ Index rows = IsLower ? _rows : size; \
+ Index cols = IsLower ? size : _cols; \
+\
+ typedef VectorX##EIGPREFIX VectorRhs; \
+ EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+ VectorRhs x_tmp; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+ char trans, uplo, diag; \
+ MKL_INT m, n, lda, incx, incy; \
+ EIGTYPE const *a; \
+ MKLTYPE alpha_, beta_; \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+\
+/* Set m, n */ \
+ n = (MKL_INT)size; \
+ lda = lhsStride; \
+ incx = 1; \
+ incy = resIncr; \
+\
+/* Set uplo, trans and diag*/ \
+ trans = 'N'; \
+ uplo = IsLower ? 'L' : 'U'; \
+ diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+ MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+ MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit MKL ?TRMV. Fall back to the default triangular product */ \
+ if (size<(std::max)(rows,cols)) { \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+ if (size<rows) { \
+ y = _res + size*resIncr; \
+ a = _lhs + size; \
+ m = rows-size; \
+ n = size; \
+ } \
+ else { \
+ x += size; \
+ y = _res; \
+ a = _lhs + size*lda; \
+ m = size; \
+ n = cols-size; \
+ } \
+ MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+ } \
+ } \
+};
+
+EIGEN_MKL_TRMV_CM(double, double, d, d)
+EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMV_CM(float, float, f, s)
+EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c)
+
+// implements row-major: res += alpha * op(triangular) * vector
+#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
+struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ LowUp = IsLower ? Lower : Upper \
+ }; \
+ static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
+ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+ { \
+ if (IsZeroDiag) { \
+ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
+ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+ return; \
+ }\
+ Index size = (std::min)(_rows,_cols); \
+ Index rows = IsLower ? _rows : size; \
+ Index cols = IsLower ? size : _cols; \
+\
+ typedef VectorX##EIGPREFIX VectorRhs; \
+ EIGTYPE *x, *y;\
+\
+/* Set x*/ \
+ Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+ VectorRhs x_tmp; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+\
+/* Square part handling */\
+\
+ char trans, uplo, diag; \
+ MKL_INT m, n, lda, incx, incy; \
+ EIGTYPE const *a; \
+ MKLTYPE alpha_, beta_; \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+ assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+\
+/* Set m, n */ \
+ n = (MKL_INT)size; \
+ lda = lhsStride; \
+ incx = 1; \
+ incy = resIncr; \
+\
+/* Set uplo, trans and diag*/ \
+ trans = ConjLhs ? 'C' : 'T'; \
+ uplo = IsLower ? 'U' : 'L'; \
+ diag = IsUnitDiag ? 'U' : 'N'; \
+\
+/* call ?TRMV*/ \
+ MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+\
+/* Add op(a_tr)rhs into res*/ \
+ MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit MKL ?TRMV. Fall back to the default triangular product */ \
+ if (size<(std::max)(rows,cols)) { \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
+ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
+ x = x_tmp.data(); \
+ if (size<rows) { \
+ y = _res + size*resIncr; \
+ a = _lhs + size*lda; \
+ m = rows-size; \
+ n = size; \
+ } \
+ else { \
+ x += size; \
+ y = _res; \
+ a = _lhs + size; \
+ m = size; \
+ n = cols-size; \
+ } \
+ MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+ } \
+ } \
+};
+
+EIGEN_MKL_TRMV_RM(double, double, d, d)
+EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z)
+EIGEN_MKL_TRMV_RM(float, float, f, s)
+EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c)
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
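A hedged reference (editorial) for the non-square handling in the column-major binding above, lower case: a tall trapezoidal tril(A) is applied as a square ?TRMV on the top block plus a ?GEMV on the remaining rows; the helper name is hypothetical.

#include <Eigen/Dense>
#include <algorithm>

// y = tril(A) * x for a tall trapezoidal A (rows >= cols).
Eigen::VectorXd trapezoidal_lower_trmv(const Eigen::MatrixXd& A,
                                       const Eigen::VectorXd& x)
{
  const auto size = std::min(A.rows(), A.cols());
  Eigen::VectorXd y = Eigen::VectorXd::Zero(A.rows());
  // square triangular part -> handled by ?trmv in the macro above
  y.head(size) = A.topLeftCorner(size, size).triangularView<Eigen::Lower>()
               * x.head(size);
  // remaining rectangular rows -> handled by ?gemv in the macro above
  if (A.rows() > size)
    y.tail(A.rows() - size).noalias() =
        A.bottomLeftCorner(A.rows() - size, size) * x.head(size);
  return y;
}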
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h
new file mode 100644
index 0000000000..f5de67c59f
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -0,0 +1,331 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// if the rhs is row major, let's transpose the product
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
+{
+ static void run(
+ Index size, Index cols,
+ const Scalar* tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ triangular_solve_matrix<
+ Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
+ (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
+ NumTraits<Scalar>::IsComplex && Conjugate,
+ TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
+ ::run(size, cols, tri, triStride, _other, otherStride, blocking);
+ }
+};
+
+/* Optimized triangular solver with multiple right-hand sides and the triangular matrix on the left
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
+{
+ static EIGEN_DONT_INLINE void run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ Index cols = otherSize;
+
+ typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+ typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
+ TriMapper tri(_tri, triStride);
+ OtherMapper other(_other, otherStride);
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+
+ enum {
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower
+ };
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(size,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*cols;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ conj_if<Conjugate> conj;
+ gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+ gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+ gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
+
+    // the goal here is to subdivide the Rhs panels such that we keep some cache
+ // coherence when accessing the rhs elements
+ std::ptrdiff_t l1, l2, l3;
+ manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
+ subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
+
+ for(Index k2=IsLower ? 0 : size;
+ IsLower ? k2<size : k2>0;
+ IsLower ? k2+=kc : k2-=kc)
+ {
+ const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc);
+
+ // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
+ // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
+      // A11 (the triangular part) and A21 (the remaining rectangular part).
+ // Then the high level algorithm is:
+ // - B = R1 => general block copy (done during the next step)
+ // - R1 = A11^-1 B => tricky part
+ // - update B from the new R1 => actually this has to be performed continuously during the above step
+ // - R2 -= A21 * B => GEPP
+
+ // The tricky part: compute R1 = A11^-1 B while updating B from R1
+ // The idea is to split A11 into multiple small vertical panels.
+ // Each panel can be split into a small triangular part T1k which is processed without optimization,
+      // and the remaining small part T2k which is processed using gebp with appropriate block strides.
+      // (a hedged, unoptimized reference of this blocked substitution follows this kernel)
+ for(Index j2=0; j2<cols; j2+=subcols)
+ {
+ Index actual_cols = (std::min)(cols-j2,subcols);
+      // for each small vertical panel [T1k^T, T2k^T]^T of lhs
+ for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+ // tr solve
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ // TODO write a small kernel handling this (can be shared with trsv)
+ Index i = IsLower ? k2+k1+k : k2-k1-k-1;
+ Index s = IsLower ? k2+k1 : i+1;
+ Index rs = actualPanelWidth - k - 1; // remaining size
+
+ Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
+ for (Index j=j2; j<j2+actual_cols; ++j)
+ {
+ if (TriStorageOrder==RowMajor)
+ {
+ Scalar b(0);
+ const Scalar* l = &tri(i,s);
+ Scalar* r = &other(s,j);
+ for (Index i3=0; i3<k; ++i3)
+ b += conj(l[i3]) * r[i3];
+
+ other(i,j) = (other(i,j) - b)*a;
+ }
+ else
+ {
+ Index s = IsLower ? i+1 : i-rs;
+ Scalar b = (other(i,j) *= a);
+ Scalar* r = &other(s,j);
+ const Scalar* l = &tri(s,i);
+ for (Index i3=0;i3<rs;++i3)
+ r[i3] -= b * conj(l[i3]);
+ }
+ }
+ }
+
+ Index lengthTarget = actual_kc-k1-actualPanelWidth;
+ Index startBlock = IsLower ? k2+k1 : k2-k1-actualPanelWidth;
+ Index blockBOffset = IsLower ? k1 : lengthTarget;
+
+ // update the respective rows of B from other
+ pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+
+ // GEBP
+ if (lengthTarget>0)
+ {
+ Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
+
+ pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
+
+ gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+ actualPanelWidth, actual_kc, 0, blockBOffset);
+ }
+ }
+ }
+
+ // R2 -= A21 * B => GEPP
+ {
+ Index start = IsLower ? k2+kc : 0;
+ Index end = IsLower ? size : k2-kc;
+ for(Index i2=start; i2<end; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,end-i2);
+ if (actual_mc>0)
+ {
+ pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
+
+ gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
+ }
+ }
+ }
+ }
+ }
+
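// Editor's sketch (not part of the patch): an unoptimized reference of the
// blocked scheme described in the comments above, lower-triangular on the
// left with a non-unit diagonal; the block size kc is arbitrary.
#include <Eigen/Dense>
#include <algorithm>

// In-place solve of tril(L) * X = B: "R1 = A11^-1 B" on the diagonal block,
// then "R2 -= A21 * B" on the remaining rows.
void solve_lower_blocked_reference(const Eigen::MatrixXd& L, Eigen::MatrixXd& B, int kc)
{
  const int n = static_cast<int>(L.rows());
  for (int k2 = 0; k2 < n; k2 += kc) {
    const int w = std::min(kc, n - k2);
    // the tricky part: small triangular solve on the diagonal block
    L.block(k2, k2, w, w).triangularView<Eigen::Lower>()
        .solveInPlace(B.middleRows(k2, w));
    // the GEPP update of the rows below
    if (k2 + w < n)
      B.bottomRows(n - k2 - w).noalias() -=
          L.block(k2 + w, k2, n - k2 - w, w) * B.middleRows(k2, w);
  }
}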
+/* Optimized triangular solver with multiple left-hand sides and the triangular matrix on the right
+ */
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
+{
+ static EIGEN_DONT_INLINE void run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking);
+};
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+ Index size, Index otherSize,
+ const Scalar* _tri, Index triStride,
+ Scalar* _other, Index otherStride,
+ level3_blocking<Scalar,Scalar>& blocking)
+ {
+ Index rows = otherSize;
+
+ typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+ LhsMapper lhs(_other, otherStride);
+ RhsMapper rhs(_tri, triStride);
+
+ typedef gebp_traits<Scalar,Scalar> Traits;
+ enum {
+ RhsStorageOrder = TriStorageOrder,
+ SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
+ IsLower = (Mode&Lower) == Lower
+ };
+
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*size;
+
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+ conj_if<Conjugate> conj;
+ gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+ gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+
+ for(Index k2=IsLower ? size : 0;
+ IsLower ? k2>0 : k2<size;
+ IsLower ? k2-=kc : k2+=kc)
+ {
+ const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc);
+ Index actual_k2 = IsLower ? k2-actual_kc : k2 ;
+
+ Index startPanel = IsLower ? 0 : k2+actual_kc;
+ Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
+ Scalar* geb = blockB+actual_kc*actual_kc;
+
+ if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
+
+ // triangular packing (we only pack the panels off the diagonal,
+      // neglecting the blocks overlapping the diagonal)
+ {
+ for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index actual_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
+
+ if (panelLength>0)
+ pack_rhs_panel(blockB+j2*actual_kc,
+ rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
+ panelLength, actualPanelWidth,
+ actual_kc, panelOffset);
+ }
+ }
+
+ for(Index i2=0; i2<rows; i2+=mc)
+ {
+ const Index actual_mc = (std::min)(mc,rows-i2);
+
+ // triangular solver kernel
+ {
+ // for each small block of the diagonal (=> vertical panels of rhs)
+ for (Index j2 = IsLower
+ ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth)
+ : Index(SmallPanelWidth)))
+ : 0;
+ IsLower ? j2>=0 : j2<actual_kc;
+ IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth)
+ {
+ Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
+ Index absolute_j2 = actual_k2 + j2;
+ Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
+ Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
+
+ // GEBP
+ if(panelLength>0)
+ {
+ gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
+ blockA, blockB+j2*actual_kc,
+ actual_mc, panelLength, actualPanelWidth,
+ Scalar(-1),
+ actual_kc, actual_kc, // strides
+ panelOffset, panelOffset); // offsets
+ }
+
+ // unblocked triangular solve
+ for (Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
+
+ Scalar* r = &lhs(i2,j);
+ for (Index k3=0; k3<k; ++k3)
+ {
+ Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
+ Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
+ for (Index i=0; i<actual_mc; ++i)
+ r[i] -= a[i] * b;
+ }
+ Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j));
+ for (Index i=0; i<actual_mc; ++i)
+ r[i] *= b;
+ }
+
+ // pack the just computed part of lhs to A
+ pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
+ actualPanelWidth, actual_mc,
+ actual_kc, j2);
+ }
+ }
+
+ if (rs>0)
+ gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
+ actual_mc, actual_kc, rs, Scalar(-1),
+ -1, -1, 0, 0);
+ }
+ }
+ }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
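These specializations are not called directly; they sit behind Eigen's triangular-view API. A small usage sketch (sizes are arbitrary) showing the kind of expression that, for dynamically sized operands, ends up in the triangular_solve_matrix code above:

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 8);

  // Solve L * X = B in place, where L is the lower-triangular part of A.
  // With a matrix right-hand side this is the OnTheLeft, ColMajor path.
  A.triangularView<Eigen::Lower>().solveInPlace(B);

  return 0;
}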
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
new file mode 100644
index 0000000000..6a0bb83393
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
@@ -0,0 +1,155 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+ be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ * Content : Eigen bindings to Intel(R) MKL
+ *   Triangular matrix-matrix solve functionality based on ?TRSM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+
+namespace Eigen {
+
+namespace internal {
+
+// implements LeftSide op(triangular)^-1 * general
+#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+ }; \
+ static void run( \
+ Index size, Index otherSize, \
+ const EIGTYPE* _tri, Index triStride, \
+ EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+ { \
+ MKL_INT m = size, n = otherSize, lda, ldb; \
+ char side = 'L', uplo, diag='N', transa; \
+    /* Set alpha */ \
+ MKLTYPE alpha; \
+ EIGTYPE myone(1); \
+ assign_scalar_eig2mkl(alpha, myone); \
+ ldb = otherStride;\
+\
+ const EIGTYPE *a; \
+/* Set trans */ \
+ transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+ Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+ MatrixTri a_tmp; \
+\
+ if (conjA) { \
+ a_tmp = tri.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else { \
+ a = _tri; \
+ lda = triStride; \
+ } \
+ if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+ MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+ } \
+};
+
+EIGEN_MKL_TRSM_L(double, double, d)
+EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_TRSM_L(float, float, s)
+EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c)
+
+
+// implements RightSide general * op(triangular)^-1
+#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \
+template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
+{ \
+ enum { \
+ IsLower = (Mode&Lower) == Lower, \
+ IsUnitDiag = (Mode&UnitDiag) ? 1 : 0, \
+ IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
+ conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
+ }; \
+ static void run( \
+ Index size, Index otherSize, \
+ const EIGTYPE* _tri, Index triStride, \
+ EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+ { \
+ MKL_INT m = otherSize, n = size, lda, ldb; \
+ char side = 'R', uplo, diag='N', transa; \
+    /* Set alpha */ \
+ MKLTYPE alpha; \
+ EIGTYPE myone(1); \
+ assign_scalar_eig2mkl(alpha, myone); \
+ ldb = otherStride;\
+\
+ const EIGTYPE *a; \
+/* Set trans */ \
+ transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
+/* Set uplo */ \
+ uplo = IsLower ? 'L' : 'U'; \
+ if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
+/* Set a, lda */ \
+ typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
+ Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
+ MatrixTri a_tmp; \
+\
+ if (conjA) { \
+ a_tmp = tri.conjugate(); \
+ a = a_tmp.data(); \
+ lda = a_tmp.outerStride(); \
+ } else { \
+ a = _tri; \
+ lda = triStride; \
+ } \
+ if (IsUnitDiag) diag='U'; \
+/* call ?trsm*/ \
+ MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+ } \
+};
+
+EIGEN_MKL_TRSM_R(double, double, d)
+EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z)
+EIGEN_MKL_TRSM_R(float, float, s)
+EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c)
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
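The two macros above simply marshal Eigen's template parameters into a single ?trsm call through MKL's Fortran-style interface. For reference, the equivalent call through the standard CBLAS interface for the left-side, lower-triangular, non-unit-diagonal, column-major case looks roughly like this (a sketch; the header name assumes an MKL build, and the helper function is illustrative):

#include <mkl_cblas.h>   // with a generic BLAS, <cblas.h> provides the same prototype

// Overwrites B (n x nrhs, leading dimension ldb) with L^-1 * B, mirroring
// EIGEN_MKL_TRSM_L with side='L', uplo='L', transa='N', diag='N' and alpha=1.
void solve_with_trsm(int n, int nrhs, const double* L, int ldl, double* B, int ldb)
{
  cblas_dtrsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit,
              n, nrhs, 1.0, L, ldl, B, ldb);
}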
diff --git a/third_party/eigen3/Eigen/src/Core/products/TriangularSolverVector.h b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverVector.h
new file mode 100644
index 0000000000..b994759b26
--- /dev/null
+++ b/third_party/eigen3/Eigen/src/Core/products/TriangularSolverVector.h
@@ -0,0 +1,145 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder>
+{
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft,
+ ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
+ Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor
+ >::run(size, _lhs, lhsStride, rhs);
+ }
+};
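The forwarding above relies on the identity x * T = b  <=>  T^T * x^T = b^T: transposing the triangular matrix swaps Upper and Lower as well as RowMajor and ColMajor, which is exactly the remapping of the template parameters (the same data pointer is reused). A quick check of the identity at the API level (a sketch with arbitrary sizes):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd M = Eigen::MatrixXd::Random(5, 5);
  Eigen::MatrixXd U = M.triangularView<Eigen::Upper>();   // dense copy of the upper-triangular part
  Eigen::RowVectorXd b = Eigen::RowVectorXd::Random(5);

  Eigen::RowVectorXd x1 = b * U.inverse();                 // right-side solve: x * U = b

  Eigen::VectorXd y = U.transpose().triangularView<Eigen::Lower>().solve(b.transpose());
  Eigen::RowVectorXd x2 = y.transpose();                   // same solve, recast as a left, lower one

  std::cout << (x1 - x2).norm() << "\n";                   // ~0 up to rounding error
  return 0;
}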
+
+// forward and backward substitution, row-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
+{
+ enum {
+ IsLower = ((Mode&Lower)==Lower)
+ };
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+
+ typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+
+ typename internal::conditional<
+ Conjugate,
+ const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+ const LhsMap&>
+ ::type cjLhs(lhs);
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+ for(Index pi=IsLower ? 0 : size;
+ IsLower ? pi<size : pi>0;
+ IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+
+ Index r = IsLower ? pi : size - pi; // remaining size
+ if (r > 0)
+ {
+        // let's directly call the low-level product function because:
+        // 1 - it is faster to compile
+        // 2 - it is slightly faster at runtime
+ Index startRow = IsLower ? pi : pi-actualPanelWidth;
+ Index startCol = IsLower ? 0 : pi;
+
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
+ actualPanelWidth, r,
+ LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
+ RhsMapper(rhs + startCol, 1),
+ rhs + startRow, 1,
+ RhsScalar(-1));
+ }
+
+ for(Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = IsLower ? pi+k : pi-k-1;
+ Index s = IsLower ? pi : i+1;
+ if (k>0)
+ rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
+
+ if(!(Mode & UnitDiag))
+ rhs[i] /= cjLhs(i,i);
+ }
+ }
+ }
+};
+
+// forward and backward substitution, column-major, rhs is a vector
+template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor>
+{
+ enum {
+ IsLower = ((Mode&Lower)==Lower)
+ };
+ static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
+ {
+ typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
+ const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+ typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+ typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+ typename internal::conditional<Conjugate,
+ const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
+ const LhsMap&
+ >::type cjLhs(lhs);
+ static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+
+ for(Index pi=IsLower ? 0 : size;
+ IsLower ? pi<size : pi>0;
+ IsLower ? pi+=PanelWidth : pi-=PanelWidth)
+ {
+ Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
+ Index startBlock = IsLower ? pi : pi-actualPanelWidth;
+ Index endBlock = IsLower ? pi + actualPanelWidth : 0;
+
+ for(Index k=0; k<actualPanelWidth; ++k)
+ {
+ Index i = IsLower ? pi+k : pi-k-1;
+ if(!(Mode & UnitDiag))
+ rhs[i] /= cjLhs.coeff(i,i);
+
+ Index r = actualPanelWidth - k - 1; // remaining size
+ Index s = IsLower ? i+1 : i-r;
+ if (r>0)
+ Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+ }
+ Index r = IsLower ? size - endBlock : startBlock; // remaining size
+ if (r > 0)
+ {
+        // let's directly call the low-level product function because:
+        // 1 - it is faster to compile
+        // 2 - it is slightly faster at runtime
+ general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
+ r, actualPanelWidth,
+ LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
+ RhsMapper(rhs+startBlock, 1),
+ rhs+endBlock, 1, RhsScalar(-1));
+ }
+ }
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H
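As with the matrix variant, these kernels are reached through the triangular-view API whenever the right-hand side is a single vector. A minimal usage sketch (sizes arbitrary):

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(32, 32);
  Eigen::VectorXd b = Eigen::VectorXd::Random(32);

  // Solve U * x = b in place, where U is the upper-triangular part of A;
  // a vector right-hand side selects the triangular_solve_vector path.
  A.triangularView<Eigen::Upper>().solveInPlace(b);

  return 0;
}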