62 files changed, 2142 insertions, 610 deletions
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 6831eab3d..67e97ffb8 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -16,6 +16,15 @@
 namespace Eigen {
 
 namespace internal {
+  template<typename _MatrixType, int _UpLo> struct traits<LDLT<_MatrixType, _UpLo> >
+   : traits<_MatrixType>
+  {
+    typedef MatrixXpr XprKind;
+    typedef SolverStorage StorageKind;
+    typedef int StorageIndex;
+    enum { Flags = 0 };
+  };
+
   template<typename MatrixType, int UpLo> struct LDLT_Traits;
 
   // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
@@ -48,20 +57,19 @@ namespace internal {
   * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
+        : public SolverBase<LDLT<_MatrixType, _UpLo> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<LDLT> Base;
+    friend class SolverBase<LDLT>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
       UpLo = _UpLo
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
 
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
@@ -180,6 +188,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
       return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
       *
       * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
@@ -197,13 +206,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
       */
     template<typename Rhs>
     inline const Solve<LDLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
-      return Solve<LDLT, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     template<typename Derived>
     bool solveInPlace(MatrixBase<Derived> &bAndX) const;
@@ -259,6 +263,9 @@ template<typename _MatrixType, int _UpLo> class LDLT
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -559,14 +566,22 @@ template<typename _MatrixType, int _UpLo>
 template<typename RhsType, typename DstType>
 void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
+  _solve_impl_transposed<true>(rhs, dst);
+}
+
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
   // dst = P b
   dst = m_transpositions * rhs;
 
   // dst = L^-1 (P b)
-  matrixL().solveInPlace(dst);
+  // dst = L^-*T (P b)
+  matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
 
-  // dst = D^-1 (L^-1 P b)
+  // dst = D^-* (L^-1 P b)
+  // dst = D^-1 (L^-*T P b)
   // more precisely, use pseudo-inverse of D (see bug 241)
   using std::abs;
   const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
@@ -578,7 +593,6 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
   // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
   // Using numeric_limits::min() gives us more robustness to denormals.
   RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
-
   for (Index i = 0; i < vecD.size(); ++i)
   {
     if(abs(vecD(i)) > tolerance)
@@ -587,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
       dst.row(i).setZero();
   }
 
-  // dst = L^-T (D^-1 L^-1 P b)
-  matrixU().solveInPlace(dst);
+  // dst = L^-* (D^-* L^-1 P b)
+  // dst = L^-T (D^-1 L^-*T P b)
+  matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);
 
-  // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
+  // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b
+  // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b
   dst = m_transpositions.transpose() * dst;
 }
 #endif
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 868766365..5876966e6 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -13,6 +13,16 @@
 namespace Eigen {
 
 namespace internal{
+
+template<typename _MatrixType, int _UpLo> struct traits<LLT<_MatrixType, _UpLo> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
 template<typename MatrixType, int UpLo> struct LLT_Traits;
 }
 
@@ -54,18 +64,17 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
 template<typename _MatrixType, int _UpLo> class LLT
+        : public SolverBase<LLT<_MatrixType, _UpLo> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<LLT> Base;
+    friend class SolverBase<LLT>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(LLT)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
-    typedef typename MatrixType::StorageIndex StorageIndex;
 
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -129,6 +138,7 @@ template<typename _MatrixType, int _UpLo> class LLT
       return Traits::getL(m_matrix);
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
       *
       * Since this LLT class assumes anyway that the matrix A is invertible, the solution
@@ -141,13 +151,8 @@ template<typename _MatrixType, int _UpLo> class LLT
       */
     template<typename Rhs>
     inline const Solve<LLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LLT::solve(): invalid number of rows of the right hand side matrix b");
-      return Solve<LLT, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     template<typename Derived>
     void solveInPlace(const MatrixBase<Derived> &bAndX) const;
@@ -205,6 +210,9 @@ template<typename _MatrixType, int _UpLo> class LLT
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -476,8 +484,17 @@ template<typename _MatrixType,int _UpLo>
 template<typename RhsType, typename DstType>
 void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  dst = rhs;
-  solveInPlace(dst);
+  _solve_impl_transposed<true>(rhs, dst);
+}
+
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+    dst = rhs;
+
+    matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+    matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst);
 }
 #endif
 
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 16770fc7b..e58e68eda 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -178,6 +178,46 @@ class Array
       Base::_check_template_params();
       this->template _init2<T0,T1>(val0, val1);
     }
+
+    #if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+     *
+     * Example: \include Array_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Array_variadic_ctor_cxx11.out
+     *
+     * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
+     * \sa Array(Scalar), Array(Scalar,Scalar)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      * 
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      * 
+      * Example: \include Array_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_23_cxx11.out
+      * 
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      * 
+      * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
+      * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
+      * 
+      * Example: \include Array_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Array_initializer_list_vector_cxx11.out
+      * 
+      * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
+      * and implicit transposition is allowed for compile-time 1D arrays only.
+      * 
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar> >& list) : Base(list) {}
+    #endif // end EIGEN_HAS_CXX11
+
     #else
     /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
     EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
@@ -189,7 +229,8 @@ class Array
       */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE explicit Array(Index dim);
-    /** constructs an initialized 1x1 Array with the given coefficient */
+    /** constructs an initialized 1x1 Array with the given coefficient
+      * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */
     Array(const Scalar& value);
     /** constructs an uninitialized array with \a rows rows and \a cols columns.
       *
@@ -197,11 +238,14 @@ class Array
       * it is redundant to pass these parameters, so one should use the default constructor
       * Array() instead. */
     Array(Index rows, Index cols);
-    /** constructs an initialized 2D vector with given coefficients */
+    /** constructs an initialized 2D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
     Array(const Scalar& val0, const Scalar& val1);
-    #endif
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN 
 
-    /** constructs an initialized 3D vector with given coefficients */
+    /** constructs an initialized 3D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
     {
@@ -211,7 +255,9 @@ class Array
       m_storage.data()[1] = val1;
       m_storage.data()[2] = val2;
     }
-    /** constructs an initialized 4D vector with given coefficients */
+    /** constructs an initialized 4D vector with given coefficients
+      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
     {
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 271033056..21cf5ea9e 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -81,7 +81,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
 
     /** \returns the nested expression */
     typename internal::remove_reference<MatrixTypeNested>::type&
-    nestedExpression() { return m_matrix.const_cast_derived(); }
+    nestedExpression() { return m_matrix; }
 
   protected:
     MatrixTypeNested m_matrix;
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index f8feefa27..65ec1f54b 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -150,8 +150,8 @@ template<typename Derived> class DenseBase
           * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
           */
 
-      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
-                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+      IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
+                           || internal::traits<Derived>::ColsAtCompileTime == 1,
         /**< This is set to true if either the number of rows or the number of
           * columns is known at compile-time to be equal to 1. Indeed, in that case,
           * we are dealing with a column-vector (if there is only one column) or with
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 2b2ee9e2c..04a321b9f 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -214,6 +214,21 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pandnot(const Packet& a, const Packet& b) { return a & (~b); }
 
+/** \internal \returns ones */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
+
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
+  RealScalar b;
+  b = ptrue(b);
+  return std::complex<RealScalar>(b, b);
+}
+
+/** \internal \returns the bitwise not of \a a */
+template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) { return pxor(ptrue(a), a);}
+
 /** \internal \returns \a a shifted by N bits to the right */
 template<int N> EIGEN_DEVICE_FUNC inline int
 pshiftright(const int& a) { return a >> N; }
@@ -250,19 +265,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) {
 
 /** \internal \returns a <= b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? ptrue(a) : pzero(a); }
 
 /** \internal \returns a < b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt(const Packet& a, const Packet& b); /* { return a<b  ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_lt(const Packet& a, const Packet& b)  { return a<b ? ptrue(a) : pzero(a); }
 
 /** \internal \returns a == b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_eq(const Packet& a, const Packet& b); /* { return a==b ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
 
 /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } 
 
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -393,18 +408,39 @@ typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_trai
 predux_half_dowto4(const Packet& a)
 { return a; }
 
-/** \internal \returns the product of the elements of \a a*/
+/** \internal \returns the product of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }
 
-/** \internal \returns the min of the elements of \a a*/
+/** \internal \returns the min of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }
 
-/** \internal \returns the max of the elements of \a a*/
+/** \internal \returns the max of the elements of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }
 
+/** \internal \returns true if all coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeffs of \a a means "true"
+  * It is supposed to be called on values returned by pcmp_*.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or first bit equals to 1 (1 for ints, smallest denormal for floats).
+  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+  return bool(predux(a));
+}
+
 /** \internal \returns the reversed elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 563df6e84..71377cee5 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -66,6 +66,11 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+#if EIGEN_HAS_CXX11_MATH
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
+#endif
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index da7fd6cce..063511f24 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   *  - \b rowSuffix string printed at the end of each row
   *  - \b matPrefix string printed at the beginning of the matrix
   *  - \b matSuffix string printed at the end of the matrix
+  *  - \b fill character printed to fill the empty space in aligned columns
   *
   * Example: \include IOFormat.cpp
   * Output: \verbinclude IOFormat.out
@@ -53,9 +54,9 @@ struct IOFormat
   IOFormat(int _precision = StreamPrecision, int _flags = 0,
     const std::string& _coeffSeparator = " ",
     const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
-    const std::string& _matPrefix="", const std::string& _matSuffix="")
+    const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
   : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
-    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
+    rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
   {
     // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
     // don't add rowSpacer if columns are not to be aligned
@@ -71,6 +72,7 @@ struct IOFormat
   std::string matPrefix, matSuffix;
   std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
   std::string coeffSeparator;
+  char fill;
   int precision;
   int flags;
 };
@@ -176,18 +178,26 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
         width = std::max<Index>(width, Index(sstr.str().length()));
       }
   }
+  std::streamsize old_width = s.width();
+  char old_fill_character = s.fill();
   s << fmt.matPrefix;
   for(Index i = 0; i < m.rows(); ++i)
   {
     if (i)
       s << fmt.rowSpacer;
     s << fmt.rowPrefix;
-    if(width) s.width(width);
+    if(width) {
+      s.fill(fmt.fill);
+      s.width(width);
+    }
     s << m.coeff(i, 0);
     for(Index j = 1; j < m.cols(); ++j)
     {
       s << fmt.coeffSeparator;
-      if (width) s.width(width);
+      if(width) {
+        s.fill(fmt.fill);
+        s.width(width);
+      }
       s << m.coeff(i, j);
     }
     s << fmt.rowSuffix;
@@ -196,6 +206,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   }
   s << fmt.matSuffix;
   if(explicit_precision) s.precision(old_precision);
+  if(width) {
+    s.fill(old_fill_character);
+    s.width(old_width);
+  }
   return s;
 }
 
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
index 3485d8f46..377f8a5cc 100644
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -132,7 +132,7 @@ public:
 
   /** \returns the nested expression */
   typename internal::remove_reference<XprType>::type&
-  nestedExpression() { return m_xpr.const_cast_derived(); }
+  nestedExpression() { return m_xpr; }
 
   /** \returns a const reference to the object storing/generating the row indices */
   const RowIndices& rowIndices() const { return m_rowIndices; }
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 7f4a7af93..32269ed2e 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -301,6 +301,45 @@ class Matrix
       Base::_check_template_params();
       Base::template _init2<T0,T1>(x, y);
     }
+
+    #if EIGEN_HAS_CXX11
+    /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+     *
+     * Example: \include Matrix_variadic_ctor_cxx11.cpp
+     * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
+     *
+     * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
+     */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+    /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+      * 
+      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+      * 
+      * Example: \include Matrix_initializer_list_23_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
+      * 
+      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+      * 
+      * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
+      * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+      * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
+      * 
+      * Example: \include Matrix_initializer_list_vector_cxx11.cpp
+      * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
+      * 
+      * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
+      * and implicit transposition is allowed for compile-time vectors only.
+      * 
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
+     EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+    #endif // end EIGEN_HAS_CXX11
+
     #else
     /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
     EIGEN_DEVICE_FUNC
@@ -319,7 +358,8 @@ class Matrix
       * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
       */
     EIGEN_STRONG_INLINE explicit Matrix(Index dim);
-    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
     Matrix(const Scalar& x);
     /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
       *
@@ -336,11 +376,14 @@ class Matrix
     EIGEN_DEVICE_FUNC
     Matrix(Index rows, Index cols);
     
-    /** \brief Constructs an initialized 2D vector with given coefficients */
+    /** \brief Constructs an initialized 2D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
     Matrix(const Scalar& x, const Scalar& y);
-    #endif
+    #endif  // end EIGEN_PARSED_BY_DOXYGEN
 
-    /** \brief Constructs an initialized 3D vector with given coefficients */
+    /** \brief Constructs an initialized 3D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
     {
@@ -350,7 +393,9 @@ class Matrix
       m_storage.data()[1] = y;
       m_storage.data()[2] = z;
     }
-    /** \brief Constructs an initialized 4D vector with given coefficients */
+    /** \brief Constructs an initialized 4D vector with given coefficients
+      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+      */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
     {
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 596cdd133..4744e5cc4 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -468,6 +468,11 @@ template<typename Derived> class MatrixBase
     const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+#if EIGEN_HAS_CXX11_MATH
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
+    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
+#endif
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
     EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 01cf192e9..239bbba63 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -16,7 +16,11 @@ namespace Eigen {
 namespace internal {
 template<typename ExpressionType>
 struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
-{};
+{
+  enum {
+    Flags = traits<ExpressionType>::Flags & ~NestByRefBit
+  };
+};
 }
 
 /** \class NestByValue
@@ -43,55 +47,11 @@ template<typename ExpressionType> class NestByValue
 
     EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
     EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
-    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
-    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
-
-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_expression.coeff(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_expression.const_cast_derived().coeffRef(row, col);
-    }
-
-    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_expression.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
 
     EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
+    EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
+
   protected:
     const ExpressionType m_expression;
 };
@@ -105,6 +65,21 @@ DenseBase<Derived>::nestByValue() const
   return NestByValue<Derived>(derived());
 }
 
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template<typename ArgType>
+struct evaluator<NestByValue<ArgType> >
+  : public evaluator<ArgType>
+{
+  typedef evaluator<ArgType> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
+    : Base(xpr.nestedExpression())
+  {}
+};
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_NESTBYVALUE_H
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index f551dabb0..2deaa5aab 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -526,6 +526,71 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
+    #if EIGEN_HAS_CXX11
+    /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11
+      *
+      * \only_for_vectors
+      * 
+      * This constructor is for 1D array or vectors with more than 4 coefficients.
+      * There exists c++98 anologue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
+      * 
+      * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this 
+      * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+      */
+    template <typename... ArgTypes>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+      : m_storage()
+    {
+      _check_template_params();
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
+      m_storage.data()[0] = a0;
+      m_storage.data()[1] = a1;
+      m_storage.data()[2] = a2;
+      m_storage.data()[3] = a3;
+      int i = 4;
+      auto x = {(m_storage.data()[i++] = args, 0)...};
+      static_cast<void>(x);
+    }
+
+    /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
+      * lists \cpp11
+      */
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_storage()
+    {
+      _check_template_params();
+
+      size_t list_size = 0;
+      if (list.begin() != list.end()) {
+        list_size = list.begin()->size();
+      }
+
+      // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
+      if (ColsAtCompileTime == 1 && list.size() == 1) {
+        eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        resize(list_size, ColsAtCompileTime);
+        std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+      } else {
+        eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+        eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
+        resize(list.size(), list_size);
+       
+        Index row_index = 0;
+        for (const std::initializer_list<Scalar>& row : list) {
+          eigen_assert(list_size == row.size());
+          Index col_index = 0;
+          for (const Scalar& e : row) {
+            coeffRef(row_index, col_index) = e;
+            ++col_index;
+          }
+          ++row_index;
+        }
+      }
+    }
+    #endif  // end EIGEN_HAS_CXX11
+
     /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 720b6030c..e231a7d7d 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -397,6 +397,8 @@ public:
   * The template parameter \a BinaryOp is the type of the functor \a func which must be
   * an associative operator. Both current C++98 and C++11 functor styles are handled.
   *
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
   * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
   */
 template<typename Derived>
@@ -415,6 +417,7 @@ DenseBase<Derived>::redux(const Func& func) const
 }
 
 /** \returns the minimum of all coefficients of \c *this.
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   * \warning the result is undefined if \c *this contains NaN.
   */
 template<typename Derived>
@@ -425,6 +428,7 @@ DenseBase<Derived>::minCoeff() const
 }
 
 /** \returns the maximum of all coefficients of \c *this.
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   * \warning the result is undefined if \c *this contains NaN.
   */
 template<typename Derived>
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index b7bd1b292..c955815e6 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -191,7 +191,7 @@ class ReshapedImpl_dense<XprType,Rows,Cols,Order,false>
     /** \returns the nested expression */
     EIGEN_DEVICE_FUNC
     typename internal::remove_reference<XprType>::type&
-    nestedExpression() { return m_xpr.const_cast_derived(); }
+    nestedExpression() { return m_xpr; }
 
   protected:
 
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 8b6b3ab03..711dbcf9a 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -203,7 +203,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
 template<typename ExpressionType, int Direction>
 EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
-  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 2cf3fa1ef..2173799d9 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -61,6 +61,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
     typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+    typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
 
     enum {
       Mode = internal::traits<SelfAdjointView>::Mode,
@@ -197,6 +198,18 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     inline const ConjugateReturnType conjugate() const
     { return ConjugateReturnType(m_matrix.conjugate()); }
 
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
     typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
     /** \sa MatrixBase::adjoint() const */
     EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index 2bf940a26..ec4b4a987 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -19,7 +19,7 @@ template<typename Decomposition, typename RhsType, typename StorageKind> class S
   *
   * \brief Pseudo expression representing a solving operation
   *
-  * \tparam Decomposition the type of the matrix or decomposion object
+  * \tparam Decomposition the type of the matrix or decomposition object
   * \tparam Rhstype the type of the right-hand side
   *
   * This class represents an expression of A.solve(B)
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index 702a5485c..501461042 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -14,8 +14,35 @@ namespace Eigen {
 
 namespace internal {
 
+template<typename Derived>
+struct solve_assertion {
+    template<bool Transpose_, typename Rhs>
+    static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); }
+};
+
+template<typename Derived>
+struct solve_assertion<Transpose<Derived> >
+{
+    typedef Transpose<Derived> type;
+
+    template<bool Transpose_, typename Rhs>
+    static void run(const type& transpose, const Rhs& b)
+    {
+        internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b);
+    }
+};
 
+template<typename Scalar, typename Derived>
+struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > >
+{
+    typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type;
 
+    template<bool Transpose_, typename Rhs>
+    static void run(const type& adjoint, const Rhs& b)
+    {
+        internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b);
+    }
+};
 } // end namespace internal
 
 /** \class SolverBase
@@ -35,7 +62,7 @@ namespace internal {
   *
   * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
   *
-  * \sa class PartialPivLU, class FullPivLU
+  * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
   */
 template<typename Derived>
 class SolverBase : public EigenBase<Derived>
@@ -46,6 +73,9 @@ class SolverBase : public EigenBase<Derived>
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef Scalar CoeffReturnType;
 
+    template<typename Derived_>
+    friend struct internal::solve_assertion;
+
     enum {
       RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
@@ -75,7 +105,7 @@ class SolverBase : public EigenBase<Derived>
     inline const Solve<Derived, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
-      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b);
       return Solve<Derived, Rhs>(derived(), b.derived());
     }
 
@@ -113,6 +143,13 @@ class SolverBase : public EigenBase<Derived>
     }
 
   protected:
+
+    template<bool Transpose_, typename Rhs>
+    void _check_solve_assertion(const Rhs& b) const {
+        EIGEN_ONLY_USED_FOR_DEBUG(b);
+        eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
+        eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
+    }
 };
 
 namespace internal {
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index d7c204579..91a9ab1b9 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -392,7 +392,8 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
 template<typename Dst, typename Src>
 void check_for_aliasing(const Dst &dst, const Src &src)
 {
-  internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
+  if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
+    internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
 }
 
 } // end namespace internal
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 521de6160..cf3532f06 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -198,6 +198,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
 
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+    typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
     
   public:
 
@@ -243,6 +244,18 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     inline const ConjugateReturnType conjugate() const
     { return ConjugateReturnType(m_matrix.conjugate()); }
 
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType;
+      return ReturnType(m_matrix.template conjugateIf<Cond>());
+    }
+
     typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
     /** \sa MatrixBase::adjoint() const */
     EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index a88b6e736..db0b9f8c4 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -173,6 +173,14 @@ struct member_redux {
   * Example: \include MatrixBase_colwise_iterator_cxx11.cpp
   * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
   * 
+  * For a partial reduction on an empty input, some rules apply.
+  * For the sake of clarity, let's consider a vertical reduction:
+  *   - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
+  *   - Otherwise, if the number of rows is zero, then
+  *       - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
+  *       - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
+  *       - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op))
+  * 
   * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
   */
 template<typename ExpressionType, int Direction> class VectorwiseOp
@@ -294,13 +302,19 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * The template parameter \a BinaryOp is the type of the functor
       * of the custom redux operator. Note that func must be an associative operator.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      * 
       * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
       */
     template<typename BinaryOp>
     EIGEN_DEVICE_FUNC
     const typename ReduxReturnType<BinaryOp>::Type
     redux(const BinaryOp& func = BinaryOp()) const
-    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func));
+    }
 
     typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
     typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
@@ -325,6 +339,9 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     /** \returns a row (or column) vector expression of the smallest coefficient
       * of each column (or row) of the referenced expression.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      * 
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_minCoeff.cpp
@@ -333,11 +350,17 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::minCoeff() */
     EIGEN_DEVICE_FUNC
     const MinCoeffReturnType minCoeff() const
-    { return MinCoeffReturnType(_expression()); }
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return MinCoeffReturnType(_expression());
+    }
 
     /** \returns a row (or column) vector expression of the largest coefficient
       * of each column (or row) of the referenced expression.
       *
+      * \warning the size along the reduction direction must be strictly positive,
+      *          otherwise an assertion is triggered.
+      * 
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_maxCoeff.cpp
@@ -346,7 +369,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::maxCoeff() */
     EIGEN_DEVICE_FUNC
     const MaxCoeffReturnType maxCoeff() const
-    { return MaxCoeffReturnType(_expression()); }
+    {
+      eigen_assert(redux_length()>0 && "you are using an empty matrix");
+      return MaxCoeffReturnType(_expression());
+    }
 
     /** \returns a row (or column) vector expression of the squared norm
       * of each column (or row) of the referenced expression.
@@ -531,7 +557,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
-      return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
+      return m_matrix = extendedTo(other.derived());
     }
 
     /** Adds the vector \a other to each subvector of \c *this */
@@ -541,7 +567,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
+      return m_matrix += extendedTo(other.derived());
     }
 
     /** Substracts the vector \a other to each subvector of \c *this */
@@ -551,7 +577,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
+      return m_matrix -= extendedTo(other.derived());
     }
 
     /** Multiples each subvector of \c *this by the vector \a other */
@@ -563,7 +589,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       m_matrix *= extendedTo(other.derived());
-      return const_cast<ExpressionType&>(m_matrix);
+      return m_matrix;
     }
 
     /** Divides each subvector of \c *this by the vector \a other */
@@ -575,7 +601,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       m_matrix /= extendedTo(other.derived());
-      return const_cast<ExpressionType&>(m_matrix);
+      return m_matrix;
     }
 
     /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
@@ -690,6 +716,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     const HNormalizedReturnType hnormalized() const;
 
   protected:
+    Index redux_length() const
+    {
+      return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
+    }
     ExpressionTypeNested m_matrix;
 };
 
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 54c1883d9..f67d83bd1 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -40,6 +40,14 @@ struct visitor_impl<Visitor, Derived, 1>
   }
 };
 
+// This specialization enables visitors on empty matrices at compile-time
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, 0> {
+  EIGEN_DEVICE_FUNC
+  static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/)
+  {}
+};
+
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, Dynamic>
 {
@@ -98,6 +106,8 @@ protected:
   *
   * \note compared to one or two \em for \em loops, visitors offer automatic
   * unrolling for small fixed size matrix.
+  * 
+  * \note if the matrix is empty, then the visitor is left unchanged.
   *
   * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
   */
@@ -106,6 +116,9 @@ template<typename Visitor>
 EIGEN_DEVICE_FUNC
 void DenseBase<Derived>::visit(Visitor& visitor) const
 {
+  if(size()==0)
+    return;
+  
   typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
   
@@ -124,6 +137,8 @@ namespace internal {
 template <typename Derived>
 struct coeff_visitor
 {
+  // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
+  coeff_visitor() : row(-1), col(-1), res(0) {}
   typedef typename Derived::Scalar Scalar;
   Index row, col;
   Scalar res;
@@ -196,6 +211,9 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
 
 /** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
   * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+  * 
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  * 
   * \warning the result is undefined if \c *this contains NaN.
   *
   * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
@@ -206,6 +224,8 @@ EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
   internal::min_coeff_visitor<Derived> minVisitor;
   this->visit(minVisitor);
   *rowId = minVisitor.row;
@@ -214,6 +234,9 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 }
 
 /** \returns the minimum of all coefficients of *this and puts in *index its location.
+  * 
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  * 
   * \warning the result is undefined if \c *this contains NaN. 
   *
   * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
@@ -224,6 +247,8 @@ EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
 {
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   internal::min_coeff_visitor<Derived> minVisitor;
   this->visit(minVisitor);
@@ -233,6 +258,9 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
 
 /** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
   * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+  * 
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  * 
   * \warning the result is undefined if \c *this contains NaN. 
   *
   * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
@@ -243,6 +271,8 @@ EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
   internal::max_coeff_visitor<Derived> maxVisitor;
   this->visit(maxVisitor);
   *rowPtr = maxVisitor.row;
@@ -251,6 +281,9 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 }
 
 /** \returns the maximum of all coefficients of *this and puts in *index its location.
+  * 
+  * \warning the matrix must be not empty, otherwise an assertion is triggered.
+  *
   * \warning the result is undefined if \c *this contains NaN.
   *
   * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
@@ -261,6 +294,8 @@ EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
 {
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   internal::max_coeff_visitor<Derived> maxVisitor;
   this->visit(maxVisitor);
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index e7e2a1033..5b8ff59bd 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -69,6 +69,14 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con
   return Packet4cf(result);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
+  __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
+  return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
@@ -276,6 +284,14 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con
   return Packet2cd(_mm256_addsub_pd(even, odd));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
+  __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
+  return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index e771c0f25..ee00f1f7d 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -228,6 +228,7 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const
 template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
 template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
 template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
 template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
 
 template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
@@ -249,6 +250,37 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
 template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
 
+
+template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  return _mm256_cmpeq_epi32(a,a);
+#else
+  const __m256 b = _mm256_castsi256_ps(a);
+  return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  const __m256i b = _mm256_castps_si256(a);
+  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+#else
+  return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqq has lower latency than the more general vcmppd
+  const __m256i b = _mm256_castpd_si256(a);
+  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+#else
+  return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
 template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
@@ -575,6 +607,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
   return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }
 
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+//   return _mm256_movemask_ps(x)==0xFF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
+{
+  return _mm256_movemask_ps(x)!=0;
+}
 
 template<int Offset>
 struct palign_impl<Offset,Packet8f>
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index 569ee01ff..9a89dd01f 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -56,6 +56,8 @@ template<> struct unpacket_traits<Packet8cf> {
   typedef Packet4cf half;
 };
 
+template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
@@ -67,7 +69,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
   const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
     0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
     0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
-  return Packet8cf(_mm512_xor_ps(a.v,mask));
+  return Packet8cf(pxor(a.v,mask));
 }
 
 template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
@@ -76,10 +78,16 @@ template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, con
   return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8cf pand   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf por    <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pxor   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_andnot_ps(b.v,a.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pand   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf por    <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pxor   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
+  __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
+  return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
+}
 
 template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
 template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
@@ -125,20 +133,20 @@ template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
 
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
 {
-  return predux(padd(Packet4cf(_mm512_extractf32x8_ps(a.v,0)),
-                     Packet4cf(_mm512_extractf32x8_ps(a.v,1))));
+  return predux(padd(Packet4cf(extract256<0>(a.v)),
+                     Packet4cf(extract256<1>(a.v))));
 }
 
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
 {
-  return predux_mul(pmul(Packet4cf(_mm512_extractf32x8_ps(a.v, 0)),
-                         Packet4cf(_mm512_extractf32x8_ps(a.v, 1))));
+  return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
+                         Packet4cf(extract256<1>(a.v))));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
-  __m256 lane0 = _mm512_extractf32x8_ps(a.v, 0);
-  __m256 lane1 = _mm512_extractf32x8_ps(a.v, 1);
+  __m256 lane0 = extract256<0>(a.v);
+  __m256 lane1 = extract256<1>(a.v);
   __m256 res = _mm256_add_ps(lane0, lane1);
   return Packet4cf(res);
 }
@@ -264,10 +272,18 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, con
   return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cd pand   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd por    <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pxor   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_andnot_pd(b.v,a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pand   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd por    <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pxor   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
+  __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
+  return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
+}
 
 template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
 { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
@@ -294,23 +310,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
 template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
 {
   return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
-            _mm256_insertf128_pd(_mm256_castpd128_pd256(pload<Packet1cd>(from+0*stride).v), pload<Packet1cd>(from+1*stride).v,1)),
-            _mm256_insertf128_pd(_mm256_castpd128_pd256(pload<Packet1cd>(from+2*stride).v), pload<Packet1cd>(from+3*stride).v,1), 1));
+            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
+            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
 }
 
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
 {
   __m512i fromi = _mm512_castpd_si512(from.v);
   double* tod = (double*)(void*)to;
-  _mm_store_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
-  _mm_store_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
-  _mm_store_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
-  _mm_store_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
+  _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
+  _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
+  _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
+  _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
 }
 
 template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
 {
-  __m128d low = _mm512_extractf64x2_pd(a.v, 0);
+  __m128d low = extract128<0>(a.v);
   EIGEN_ALIGN16 double res[2];
   _mm_store_pd(res, low);
   return std::complex<double>(res[0],res[1]);
@@ -392,12 +408,40 @@ template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet8cf,4>& kernel) {
-  ptranspose(reinterpret_cast<PacketBlock<Packet8d,4>&>(kernel));
+  PacketBlock<Packet8d,4> pb;
+  
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
 }
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet8cf,8>& kernel) {
-  ptranspose(reinterpret_cast<PacketBlock<Packet8d,8>&>(kernel));
+  PacketBlock<Packet8d,8> pb;
+  
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
+  pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
+  pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
+  pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+  kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
+  kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
+  kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
+  kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
 }
 
 EIGEN_DEVICE_FUNC inline void
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index aac707596..c2158c538 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,6 +47,7 @@ plog<Packet16f>(const Packet16f& _x) {
   // The smallest non denormalized float number.
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
 
   // Polynomial coefficients.
@@ -116,10 +117,16 @@ plog<Packet16f>(const Packet16f& _x) {
   x = padd(x, y);
   x = padd(x, y2);
 
-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN,
+  //  - 0 will be -INF.
+  //  - +INF will be +INF
   return _mm512_mask_blend_ps(iszero_mask,
-                              _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
-                              p16f_minus_inf);
+            _mm512_mask_blend_ps(invalid_mask,
+              _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+              p16f_nan),
+            p16f_minus_inf);
 }
 #endif
 
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 9c3121062..2199970ad 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -57,7 +57,7 @@ template<> struct packet_traits<float>  : default_packet_traits
     HasBlend = 0,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
 #ifdef EIGEN_VECTORIZE_AVX512DQ
     HasLog = 1,
 #endif
@@ -77,7 +77,7 @@ template<> struct packet_traits<double> : default_packet_traits
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3)
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
     HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
 #endif
@@ -262,12 +262,78 @@ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
   return _mm512_max_pd(b, a);
 }
 
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
+#else
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return  _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+}
+
+// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+}
+
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+  return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
+                                                _mm256_castps_si256(b),1));
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
 template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
-  __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
-  __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
-  return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_pd(
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
+  return _mm512_set1_epi32(0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
+  return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
+  return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
@@ -924,6 +990,13 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
   return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
 }
 
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+{
+  Packet16i xi = _mm512_castps_si512(x);
+  __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
+  return !_mm512_kortestz(tmp,tmp);
+}
+
 template <int Offset>
 struct palign_impl<Offset, Packet16f> {
   static EIGEN_STRONG_INLINE void run(Packet16f& first,
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 5404a624e..440d058d8 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
 
 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
   return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
-  std::complex<float> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<float> af[2];
   pstore<std::complex<float> >((std::complex<float> *) af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<float> res[2];
   pstore((float *)&res, a.v);
 
   return res[0];
@@ -298,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
 
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<double> af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
   return pload<Packet1cd>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
 {
-  std::complex<double> EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 std::complex<double> af[2];
   pstore<std::complex<double> >(af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -345,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res[2];
+  EIGEN_ALIGN16 std::complex<double> res[2];
   pstore<std::complex<double> >(res, a);
 
   return res[0];
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 2c06003ed..9535724eb 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -324,7 +324,7 @@ pbroadcast4<Packet4i>(const int *a,
 
 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
+  EIGEN_ALIGN16 float af[4];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
   af[2] = from[2*stride];
@@ -333,7 +333,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
 }
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
+  EIGEN_ALIGN16 int ai[4];
   ai[0] = from[0*stride];
   ai[1] = from[1*stride];
   ai[2] = from[2*stride];
@@ -342,7 +342,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  float EIGEN_ALIGN16 af[4];
+  EIGEN_ALIGN16 float af[4];
   pstore<float>(af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -351,7 +351,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
 {
-  int EIGEN_ALIGN16 ai[4];
+  EIGEN_ALIGN16 int ai[4];
   pstore<int>((int *)ai, from);
   to[0*stride] = ai[0];
   to[1*stride] = ai[1];
@@ -565,8 +565,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f&
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int   x; vec_ste(a, 0, &x); return x; }
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 {
@@ -720,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
   return pfirst(res);
 }
 
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return vec_any_ne(x, pzero(x));
+}
+
 template<int Offset>
 struct palign_impl<Offset,Packet4f>
 {
@@ -980,14 +985,14 @@ pbroadcast4<Packet2d>(const double *a,
 
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
  return pload<Packet2d>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  double EIGEN_ALIGN16 af[2];
+  EIGEN_ALIGN16 double af[2];
   pstore<double>(af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -1059,7 +1064,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d&
 
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
 
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 {
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 9481850c6..ce3f0fc68 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -3,7 +3,7 @@
 //
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
-// Copyright (C) 2009-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -54,6 +54,7 @@ Packet plog_float(const Packet _x)
   // The smallest non denormalized float number.
   const Packet cst_min_norm_pos   = pset1frombits<Packet>( 0x00800000u);
   const Packet cst_minus_inf      = pset1frombits<Packet>( 0xff800000u);
+  const Packet cst_pos_inf        = pset1frombits<Packet>( 0x7f800000u);
 
   // Polynomial coefficients.
   const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
@@ -69,9 +70,6 @@ Packet plog_float(const Packet _x)
   const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
   const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);
 
-  Packet invalid_mask = pcmp_lt_or_nan(x, pzero(x));
-  Packet iszero_mask  = pcmp_eq(x,pzero(x));
-
   // Truncate input values to the minimum positive normal.
   x = pmax(x, cst_min_norm_pos);
 
@@ -117,8 +115,15 @@ Packet plog_float(const Packet _x)
   x   = padd(x, y);
   x   = padd(x, y2);
 
-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
-  return pselect(iszero_mask, cst_minus_inf, por(x, invalid_mask));
+  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN
+  //  - 0 will be -INF
+  //  - +INF will be +INF
+  return pselect(iszero_mask, cst_minus_inf,
+                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
 }
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
@@ -248,15 +253,68 @@ Packet pexp_double(const Packet _x)
   return pmax(pldexp(x,fx), _x);
 }
 
-/* The code is the rewriting of the cephes sinf/cosf functions.
-   Precision is excellent as long as x < 8192 (I did not bother to
-   take into account the special handling they have for greater values
-   -- it does not return garbage for arguments over 8192, though, but
-   the extra precision is missing).
-
-   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
-   surprising but correct result.
-*/
+// The following code is inspired by the following stack-overflow answer:
+//   https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+// It has been largely optimized:
+//  - By-pass calls to frexp.
+//  - Aligned loads of required 96 bits of 2/pi. This is accomplished by
+//    (1) balancing the mantissa and exponent to the required bits of 2/pi are
+//    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
+//  - Avoid a branch in rounding and extraction of the remaining fractional part.
+// Overall, I measured a speed up higher than x2 on x86-64.
+inline float trig_reduce_huge (float xf, int *quadrant)
+{
+  using Eigen::numext::int32_t;
+  using Eigen::numext::uint32_t;
+  using Eigen::numext::int64_t;
+  using Eigen::numext::uint64_t;
+
+  const double pio2_62 = 3.4061215800865545e-19;    // pi/2 * 2^-62
+  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt
+
+  // 192 bits of 2/pi for Payne-Hanek reduction
+  // Bits are introduced by packet of 8 to enable aligned reads.
+  static const uint32_t two_over_pi [] = 
+  {
+    0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
+    0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
+    0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
+    0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
+    0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
+    0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
+    0x10e41000, 0xe4100000
+  };
+  
+  uint32_t xi = numext::as_uint(xf);
+  // Below, -118 = -126 + 8.
+  //   -126 is to get the exponent,
+  //   +8 is to enable alignment of 2/pi's bits on 8 bits.
+  // This is possible because the fractional part of x as only 24 meaningful bits.
+  uint32_t e = (xi >> 23) - 118;
+  // Extract the mantissa and shift it to align it wrt the exponent
+  xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
+
+  uint32_t i = e >> 3;
+  uint32_t twoopi_1  = two_over_pi[i-1];
+  uint32_t twoopi_2  = two_over_pi[i+3];
+  uint32_t twoopi_3  = two_over_pi[i+7];
+
+  // Compute x * 2/pi in 2.62-bit fixed-point format.
+  uint64_t p;
+  p = uint64_t(xi) * twoopi_3;
+  p = uint64_t(xi) * twoopi_2 + (p >> 32);
+  p = (uint64_t(xi * twoopi_1) << 32) + p;
+
+  // Round to nearest: add 0.5 and extract integral part.
+  uint64_t q = (p + zero_dot_five) >> 62;
+  *quadrant = int(q);
+  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
+  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
+  //   r = (p-q)*pi/2,
+  // where the product can be be carried out with sufficient accuracy using double precision.
+  p -= q<<62;
+  return float(double(int64_t(p)) * pio2_62);
+}
 
 template<bool ComputeSine,typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@@ -264,80 +322,116 @@ EIGEN_UNUSED
 Packet psincos_float(const Packet& _x)
 {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_1          = pset1<Packet>(1.0f);
-  const Packet cst_half       = pset1<Packet>(0.5f);
-
-  const PacketI csti_1        = pset1<PacketI>(1);
-  const PacketI csti_not1     = pset1<PacketI>(~1);
-  const PacketI csti_2        = pset1<PacketI>(2);
-  const PacketI csti_3        = pset1<PacketI>(3);
-
-  const Packet cst_sign_mask  = pset1frombits<Packet>(0x80000000u);
-
-  const Packet cst_minus_cephes_DP1 = pset1<Packet>(-0.78515625f);
-  const Packet cst_minus_cephes_DP2 = pset1<Packet>(-2.4187564849853515625e-4f);
-  const Packet cst_minus_cephes_DP3 = pset1<Packet>(-3.77489497744594108e-8f);
-  const Packet cst_sincof_p0        = pset1<Packet>(-1.9515295891E-4f);
-  const Packet cst_sincof_p1        = pset1<Packet>( 8.3321608736E-3f);
-  const Packet cst_sincof_p2        = pset1<Packet>(-1.6666654611E-1f);
-  const Packet cst_coscof_p0        = pset1<Packet>( 2.443315711809948E-005f);
-  const Packet cst_coscof_p1        = pset1<Packet>(-1.388731625493765E-003f);
-  const Packet cst_coscof_p2        = pset1<Packet>( 4.166664568298827E-002f);
-  const Packet cst_cephes_FOPI      = pset1<Packet>( 1.27323954473516f); // 4 / M_PI
 
-  Packet x = pabs(_x);
+  const Packet  cst_2oPI            = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+  const Packet  cst_rounding_magic  = pset1<Packet>(12582912); // 2^23 for rounding
+  const PacketI csti_1              = pset1<PacketI>(1);
+  const Packet  cst_sign_mask       = pset1frombits<Packet>(0x80000000u);
 
-  // Scale x by 4/Pi to find x's octant.
-  Packet y = pmul(x, cst_cephes_FOPI);
+  Packet x = pabs(_x);
 
-  // Get the octant. We'll reduce x by this number of octants or by one more than it.
-  PacketI y_int = pcast<Packet,PacketI>(y);
-  // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
-  // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
-  // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
-  PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...)
-  y = pcast<PacketI,Packet>(y_int1);
+  // Scale x by 2/Pi to find x's octant.
+  Packet y = pmul(x, cst_2oPI);
+
+  // Rounding trick:
+  Packet y_round = padd(y, cst_rounding_magic);
+  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
+
+  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
+  // using "Extended precision modular arithmetic"
+  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+  // This version requires true FMA for high accuracy
+  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
+  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+  #else
+  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
+  // and 2 ULP up to:
+  const float huge_th = ComputeSine ? 25966.f : 18838.f;
+  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+
+  // For the record, the following set of coefficients maintain 2ULP up
+  // to a slightly larger range:
+  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+  // but it slightly fails to maintain 1ULP for two values of sin below pi.
+  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+  // For the record, with only 3 iterations it is possible to maintain
+  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+  #endif
+
+  if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
+  {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
+    pstoreu(vals, pabs(_x));
+    pstoreu(x_cpy, x);
+    pstoreu(y_int2, y_int);
+    for(int k=0; k<PacketSize;++k)
+    {
+      float val = vals[k];
+      if(val>=huge_th && (numext::isfinite)(val))
+        x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
+    }
+    x = ploadu<Packet>(x_cpy);
+    y_int = ploadu<PacketI>(y_int2);
+  }
 
   // Compute the sign to apply to the polynomial.
-  // sign = third_bit(y_int1) xor signbit(_x)
-  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<29>(y_int1)))
-                                : preinterpret<Packet>(pshiftleft<29>(padd(y_int1,csti_3)));
+  // sin: sign = second_bit(y_int) xor signbit(_x)
+  // cos: sign = second_bit(y_int+1)
+  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int)))
+                                : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1)));
   sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
 
-  // Get the polynomial selection mask from the second bit of y_int1
+  // Get the polynomial selection mask from the second bit of y_int
   // We'll calculate both (sin and cos) polynomials and then select from the two.
-  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1)));
-
-  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
-  // The magic pass: "Extended precision modular arithmetic"
-  // x = ((x - y * DP1) - y * DP2) - y * DP3
-  x = pmadd(y, cst_minus_cephes_DP1, x);
-  x = pmadd(y, cst_minus_cephes_DP2, x);
-  x = pmadd(y, cst_minus_cephes_DP3, x);
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
 
   Packet x2 = pmul(x,x);
 
-  // Evaluate the cos(x) polynomial. (0 <= x <= Pi/4)
-  Packet y1 = cst_coscof_p0;
-  y1 = pmadd(y1, x2, cst_coscof_p1);
-  y1 = pmadd(y1, x2, cst_coscof_p2);
-  y1 = pmul(y1, x2);
-  y1 = pmul(y1, x2);
-  y1 = psub(y1, pmul(x2, cst_half));
-  y1 = padd(y1, cst_1);
-
-  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0) 
-  Packet y2 = cst_sincof_p0;
-  y2 = pmadd(y2, x2, cst_sincof_p1);
-  y2 = pmadd(y2, x2, cst_sincof_p2);
+  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  Packet y1 =        pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f     ));
+  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f           ));
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
+
+  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
+  // octave/matlab code to compute those coefficients:
+  //    x = (0:0.0001:pi/4)';
+  //    A = [x.^3 x.^5 x.^7];
+  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
+  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+  //
+  Packet y2 =        pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
   y2 = pmul(y2, x2);
   y2 = pmadd(y2, x, x);
 
-  // Select the correct result from the two polynoms.
+  // Select the correct result from the two polynomials.
   y = ComputeSine ? pselect(poly_mask,y2,y1)
                   : pselect(poly_mask,y1,y2);
 
-  // Update the sign
+  // Update the sign and filter huge inputs
   return pxor(y, sign_bit);
 }
 
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index eaba60e26..c1b097fb9 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -100,6 +100,117 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
   return make_double2(from, from);
 }
 
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+                                                        const float& b) {
+  return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
+                                                         const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) &
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
+                                                       const float& b) {
+  return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
+                                                        const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) |
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
+                                                        const float& b) {
+  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
+                                                         const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) ^
+                              __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
+                                                           const float& b) {
+  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
+                                                            const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) &
+                              ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
+                                                    const float& b) {
+  return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
+                                                     const double& b) {
+  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+
+}  // namespace
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
+                                                          const float4& b) {
+  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
+                     bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
+                                                            const double2& b) {
+  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
+                                                         const float4& b) {
+  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
+                     bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
+                                                           const double2& b) {
+  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
+                                                          const float4& b) {
+  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
+                     bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
+                                                            const double2& b) {
+  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
+                     bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pandnot<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
+                                                             const float4& b) {
+  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
+                     eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_eq<double2>(const double2& a, const double2& b) {
+  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
   return make_float4(a, a+1, a+2, a+3);
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index cc5c484b6..316ac0283 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2&
   return result;
 }
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue<half2>(const half2& a) {
+  half2 result;
+  *(reinterpret_cast<unsigned*>(&(result))) = 0xffffffffu;
+}
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
@@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
+  Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+  Packet16h r; r.x = Packet8i(ptrue(a.x)); return r;
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
   // in some cases Packet8i is a wrapper around __m256i, so we need to 
   // cast to Packet8i to call the correct overload.
@@ -655,6 +667,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet
   Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r;
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pcmp_eq(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
   // FIXME we could do that with bit manipulation
   Packet16f af = half2float(a);
@@ -1078,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+  Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r;
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
   // in some cases Packet4i is a wrapper around __m128i, so we either need to 
   // cast to Packet4i to directly call the intrinsics as below:
@@ -1093,6 +1116,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h
   Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r;
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pcmp_eq(af, bf);
+  return float2half(rf);
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
 
 template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 5e6de1f40..d149275b5 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -146,7 +146,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  std::complex<float> EIGEN_ALIGN16 x[2];
+  EIGEN_ALIGN16 std::complex<float> x[2];
   vst1q_f32((float *)x, a.v);
   return x[0];
 }
@@ -401,7 +401,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1c
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  std::complex<double> EIGEN_ALIGN16 res;
+  EIGEN_ALIGN16 std::complex<double> res;
   pstore<std::complex<double> >(&res, a);
 
   return res;
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index ca4f2bf94..e8b351849 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -375,8 +375,8 @@ template<> EIGEN_STRONG_INLINE void prefetch<float>  (const float*    addr) { EI
 template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t*  addr) { EIGEN_ARM_PREFETCH(addr); }
 
 // FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float   pfirst<Packet4f>(const Packet4f& a) { float   EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float   pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float   x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int32_t x[4]; vst1q_s32(x, a); return x[0]; }
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
   float32x2_t a_lo, a_hi;
@@ -551,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
   return vget_lane_s32(max, 0);
 }
 
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+                            vget_high_u32(vreinterpretq_u32_f32(x)));
+  return vget_lane_u32(vpmax_u32(tmp,tmp),0);
+}
+
 // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
 // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
 #define PALIGN_NEON(Offset,Type,Command) \
@@ -704,6 +711,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, con
   return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
 
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 911fe066e..f39988eac 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf ptrue  <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnot   <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); }
+
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
@@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cd ptrue  <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnot   <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
@@ -439,6 +444,18 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
   kernel.packet[1].v = tmp;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+  __m128 eq = _mm_cmpeq_ps(a.v, b.v);
+  return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+  __m128d eq = _mm_cmpeq_pd(a.v, b.v);
+  return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
+}
+
 template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
   __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
   return Packet2cf(_mm_castpd_ps(result));
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 4c7dc5b64..9c3750af0 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -374,9 +374,21 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+ptrue<Packet4f>(const Packet4f& a) {
+  Packet4i b = _mm_castps_si128(a);
+  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+ptrue<Packet2d>(const Packet2d& a) {
+  Packet4i b = _mm_castpd_si128(a);
+  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
@@ -812,6 +824,17 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 #endif // EIGEN_VECTORIZE_SSE4_1
 }
 
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
+// {
+//   return _mm_movemask_ps(x) == 0xF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+  return _mm_movemask_ps(x) != 0x0;
+}
+
 #if EIGEN_COMP_GNUC
 // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
 // {
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 0c2d2cfca..55994047e 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -548,6 +548,23 @@ struct functor_traits<scalar_tanh_op<Scalar> > {
   };
 };
 
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the atanh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atanh()
+  */
+template <typename Scalar>
+struct scalar_atanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_atanh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
 /** \internal
   * \brief Template functor to compute the sinh of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::sinh()
@@ -567,6 +584,23 @@ struct functor_traits<scalar_sinh_op<Scalar> >
   };
 };
 
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the asinh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::asinh()
+  */
+template <typename Scalar>
+struct scalar_asinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_asinh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
 /** \internal
   * \brief Template functor to compute the cosh of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::cosh()
@@ -586,6 +620,23 @@ struct functor_traits<scalar_cosh_op<Scalar> >
   };
 };
 
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+  * \brief Template functor to compute the acosh of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::acosh()
+  */
+template <typename Scalar>
+struct scalar_acosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_acosh_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
 /** \internal
   * \brief Template functor to compute the inverse of a scalar
   * \sa class CwiseUnaryOp, Cwise::inverse()
@@ -598,9 +649,13 @@ struct scalar_inverse_op {
   EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
   { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+template <typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasDiv,
+    Cost = scalar_div_cost<Scalar, PacketAccess>::value
+  };
+};
 
 /** \internal
   * \brief Template functor to compute the square of a scalar
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index afbd83eda..030c7740a 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -353,6 +353,24 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
 //   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
 #endif
 
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
+ public:
+  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
+};
+
+template <typename Packet>
+struct QuadPacket
+{
+  Packet B_0, B1, B2, B3;
+  const Packet& get(const FixedInt<0>&) const { return B_0; }
+  const Packet& get(const FixedInt<1>&) const { return B1; }
+  const Packet& get(const FixedInt<2>&) const { return B2; }
+  const Packet& get(const FixedInt<3>&) const { return B3; }
+};
+
 template <int N, typename T1, typename T2, typename T3>
 struct packet_conditional { typedef T3 type; };
 
@@ -448,29 +466,35 @@ public:
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
   typedef LhsPacket LhsPacket4Packing;
 
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
   
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
   {
     p = pset1<ResPacket>(ResScalar(0));
   }
-  
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
-  {
-    pbroadcast4(b, b0, b1, b2, b3);
-  }
-  
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     pbroadcast2(b, b0, b1);
-//   }
-  
+
   template<typename RhsPacketType>
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
     dest = pset1<RhsPacketType>(*b);
   }
-  
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {
+  }
+
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = ploadquad<RhsPacket>(b);
@@ -488,8 +512,8 @@ public:
     dest = ploadu<LhsPacketType>(a);
   }
 
-  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -504,6 +528,12 @@ public:
 #endif
   }
 
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
     r = pmadd(c,alpha,r);
@@ -555,6 +585,8 @@ public:
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
   typedef LhsPacket LhsPacket4Packing;
 
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
+
   typedef ResPacket AccPacket;
 
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -567,6 +599,20 @@ public:
   {
     dest = pset1<RhsPacketType>(*b);
   }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
   
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
@@ -598,18 +644,8 @@ public:
     dest = ploadu<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
-  {
-    pbroadcast4(b, b0, b1, b2, b3);
-  }
-  
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     pbroadcast2(b, b0, b1);
-//   }
-
-  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
@@ -630,10 +666,16 @@ public:
     c += a * b;
   }
 
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
   template <typename ResPacketType, typename AccPacketType>
   EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
   {
-    const conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
+    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
     r = cj.pmadd(c,alpha,r);
   }
 
@@ -756,6 +798,9 @@ public:
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+  // this actualy holds 8 packets!
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   
   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
 
@@ -778,39 +823,37 @@ public:
     dest.first  = pset1<RealPacketType>(real(*b));
     dest.second = pset1<RealPacketType>(imag(*b));
   }
-  
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
-    loadRhs(b,dest);
+    loadRhs(b, dest.B_0);
+    loadRhs(b + 1, dest.B1);
+    loadRhs(b + 2, dest.B2);
+    loadRhs(b + 3, dest.B3);
   }
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+
+  // Scalar path
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
   {
-    loadQuadToDoublePacket(b,dest);
+    loadRhs(b, dest);
   }
-  
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+  // Vectorized path
+  template<typename RealPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
-    loadRhs(b+2, b2);
-    loadRhs(b+3, b3);
+    loadRhs(b, dest);
   }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
   
-  // Vectorized path
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
+    loadRhs(b,dest);
   }
-  
-  // Scalar path
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
   {
-    // FIXME not sure that's the best way to implement it!
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
+    loadQuadToDoublePacket(b,dest);
   }
 
   // nothing special here
@@ -825,17 +868,26 @@ public:
     dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
   }
 
-  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/) const
+  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
+  EIGEN_STRONG_INLINE
+  typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
+  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
   {
     c.first   = padd(pmul(a,b.first), c.first);
     c.second  = padd(pmul(a,b.second),c.second);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+  template<typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
   {
     c = cj.pmadd(a,b,c);
   }
+
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
   
   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
   
@@ -914,7 +966,7 @@ public:
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
   typedef LhsPacket LhsPacket4Packing;
-
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
 
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -927,18 +979,20 @@ public:
   {
     dest = pset1<RhsPacketType>(*b);
   }
-  
-  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
-    pbroadcast4(b, b0, b1, b2, b3);
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
   }
-  
-//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-//   {
-//     // FIXME not sure that's the best way to implement it!
-//     b0 = pload1<RhsPacket>(b+0);
-//     b1 = pload1<RhsPacket>(b+1);
-//   }
+
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    loadRhs(b, dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
 
   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
   {
@@ -956,8 +1010,8 @@ public:
     dest = ploaddup<LhsPacketType>(a);
   }
 
-  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
@@ -979,48 +1033,135 @@ public:
     c += a * b;
   }
 
+  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
   template <typename ResPacketType, typename AccPacketType>
   EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
   {
-    const conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
+    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
     r = cj.pmadd(alpha,c,r);
   }
 
 protected:
-  
+
 };
 
 
 #if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
 
-template<>
-struct gebp_traits <float, float, false, false,Architecture::NEON>
- : gebp_traits<float,float,false,false,Architecture::Generic>
+template<int _PacketSize>
+struct gebp_traits <float, float, false, false,Architecture::NEON,_PacketSize>
+ : gebp_traits<float,float,false,false,Architecture::Generic,_PacketSize>
 {
   typedef float RhsPacket;
 
-  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  typedef float32x4_t RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
-    loadRhs(b+0, b0);
-    loadRhs(b+1, b1);
-    loadRhs(b+2, b2);
-    loadRhs(b+3, b3);
+    dest = *b;
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
-     dest = *b;
+    dest = vld1q_f32(b);
   }
 
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {}
+
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     loadRhs(b,dest);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
   {
     c = vfmaq_n_f32(c, a, b);
   }
+
+  template<int LaneID>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<LaneID>&) const
+  {
+    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    // vfmaq_laneq_f32 is implemented through a costly dup
+         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
+    #else
+    c = vfmaq_laneq_f32(c, a, b, LaneID);
+    #endif
+  }
+};
+
+
+template<>
+struct gebp_traits <double, double, false, false,Architecture::NEON>
+ : gebp_traits<double,double,false,false,Architecture::Generic>
+{
+  typedef double RhsPacket;
+
+  struct RhsPacketx4 {
+    float64x2_t B_0, B_1;
+  };
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = *b;
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {
+    dest.B_0 = vld1q_f64(b);
+    dest.B_1 = vld1q_f64(b+2);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+  {
+    c = vfmaq_n_f64(c, a, b);
+  }
+
+  template<int LaneID>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<LaneID>&) const
+  {
+    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    // vfmaq_laneq_f64 is implemented through a costly dup
+         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
+    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
+    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
+    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
+    #else
+         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+    #endif
+  }
 };
 
 #endif
@@ -1044,6 +1185,9 @@ struct gebp_kernel
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
+  typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
 
   typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
 
@@ -1148,7 +1292,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper,  mr,
 	    SRhsPacketQuarter b0;
 	    straits.loadLhsUnaligned(blB, a0);
 	    straits.loadRhs(blA, b0);
-	    straits.madd(a0,b0,c0,b0);
+	    straits.madd(a0,b0,c0,b0, fix<0>);
 	    blB += SwappedTraits::LhsProgress/4;
 	    blA += 1;
 	  }
@@ -1165,21 +1309,25 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper,  mr,
 template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
 struct lhs_process_one_packet
 {
+  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
 
-  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
   {
         EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
         EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
-        traits.loadLhs(&blA[(0+1*K)*(LhsProgress)], *A0);
-        traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
-        traits.madd(*A0, *B_0, *C0, *B_0);
-        traits.madd(*A0, *B1,  *C1, *B1);
-        traits.madd(*A0, *B2,  *C2, *B2);
-        traits.madd(*A0, *B3,  *C3, *B3);
+        traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
+        traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
+        traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+        traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+        traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+        traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
         EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
   }
 
-  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, Index prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
+  EIGEN_STRONG_INLINE void operator()(
+    const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
+    Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
+    int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
   {
     GEBPTraits traits;
 
@@ -1221,18 +1369,19 @@ struct lhs_process_one_packet
         for(Index k=0; k<peeled_kc; k+=pk)
         {
           EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
-          RhsPacket B_0, B1, B2, B3;
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
 
           internal::prefetch(blB+(48+0));
-          peeled_kc_onestep(0, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(1, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(2, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(3, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(1, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(3, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
           internal::prefetch(blB+(48+16));
-          peeled_kc_onestep(4, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(5, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(6, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(7, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(5, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(7, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
 
           blB += pk*4*RhsProgress;
           blA += pk*LhsProgress;
@@ -1243,8 +1392,9 @@ struct lhs_process_one_packet
         // process remaining peeled loop
         for(Index k=peeled_kc; k<depth; k++)
         {
-          RhsPacket B_0, B1, B2, B3;
-          peeled_kc_onestep(0, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
           blB += 4*RhsProgress;
           blA += LhsProgress;
         }
@@ -1293,9 +1443,10 @@ struct lhs_process_one_packet
 	      do {                                                      \
 		EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
 		EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+    /* FIXME: why unaligned???? */ \
 		traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
 		traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);		\
-		traits.madd(A0, B_0, C0, B_0);				\
+		traits.madd(A0, B_0, C0, B_0, fix<0>);				\
 		EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
 	      } while(false);
 
@@ -1372,7 +1523,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
     enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
     const Index peeled_kc  = depth & ~(pk-1);
-    const Index prefetch_res_offset = 32/sizeof(ResScalar);    
+    const int prefetch_res_offset = 32/sizeof(ResScalar);    
 //     const Index depth2     = depth & ~1;
 
     //---------- Process 3 * LhsProgress rows at once ----------
@@ -1430,36 +1581,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
-            RhsPacket B_0, T0;
+            // 15 registers are taken (12 for acc, 2 for lhs).
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
             LhsPacket A2;
-
-#define EIGEN_GEBP_ONESTEP(K) \
-            do { \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+            #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+            // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+            // without this workaround A0, A1, and A2 are loaded in the same register,
+            // which is not good for pipelining
+            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__  ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
+            #else
+            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+            #endif
+#define EIGEN_GEBP_ONESTEP(K)                                                     \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");          \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              internal::prefetch(blA+(3*K+16)*LhsProgress); \
-              if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
-              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
-              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
-              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C0, T0); \
-              traits.madd(A1, B_0, C4, T0); \
-              traits.madd(A2, B_0, C8, B_0); \
-              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C1, T0); \
-              traits.madd(A1, B_0, C5, T0); \
-              traits.madd(A2, B_0, C9, B_0); \
-              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C2,  T0); \
-              traits.madd(A1, B_0, C6,  T0); \
-              traits.madd(A2, B_0, C10, B_0); \
-              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
-              traits.madd(A0, B_0, C3 , T0); \
-              traits.madd(A1, B_0, C7,  T0); \
-              traits.madd(A2, B_0, C11, B_0); \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
-            } while(false)
+              internal::prefetch(blA + (3 * K + 16) * LhsProgress);               \
+              if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                            \
+                internal::prefetch(blB + (4 * K + 16) * RhsProgress);             \
+              } /* Bug 953 */                                                     \
+              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+              EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
+              traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel);     \
+              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                         \
+              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                         \
+              traits.madd(A2, rhs_panel, C8, T0, fix<0>);                         \
+              traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                         \
+              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                         \
+              traits.madd(A2, rhs_panel, C9, T0, fix<1>);                         \
+              traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                         \
+              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                         \
+              traits.madd(A2, rhs_panel, C10, T0, fix<2>);                        \
+              traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel);   \
+              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                         \
+              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                         \
+              traits.madd(A2, rhs_panel, C11, T0, fix<3>);                        \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");            \
+            } while (false)
 
             internal::prefetch(blB);
             EIGEN_GEBP_ONESTEP(0);
@@ -1479,7 +1642,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           // process remaining peeled loop
           for(Index k=peeled_kc; k<depth; k++)
           {
-            RhsPacket B_0, T0;
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
             LhsPacket A2;
             EIGEN_GEBP_ONESTEP(0);
             blB += 4*RhsProgress;
@@ -1559,20 +1723,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
             RhsPacket B_0;
-#define EIGEN_GEBGP_ONESTEP(K) \
-            do { \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+#define EIGEN_GEBGP_ONESTEP(K)                                                    \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");          \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
-              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
-              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);   \
-              traits.madd(A0, B_0, C0, B_0); \
-              traits.madd(A1, B_0, C4, B_0); \
-              traits.madd(A2, B_0, C8, B_0); \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
-            } while(false)
-        
+              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+              traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
+              traits.madd(A0, B_0, C0, B_0, fix<0>);                              \
+              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
+              traits.madd(A2, B_0, C8, B_0, fix<0>);                              \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");            \
+            } while (false)
+
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
             EIGEN_GEBGP_ONESTEP(2);
@@ -1661,7 +1825,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
-            RhsPacket B_0, B1, B2, B3, T0;
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
 
           // NOTE: the begin/end asm comments below work around bug 935!
           // but they are not enough for gcc>=6 without FMA (bug 1637)
@@ -1670,24 +1835,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           #else
             #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
           #endif
-          #define EIGEN_GEBGP_ONESTEP(K) \
-            do {                                                                \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");        \
-              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                    \
-              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                    \
-              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
-              traits.madd(A0, B_0, C0, T0);                                     \
-              traits.madd(A1, B_0, C4, B_0);                                    \
-              traits.madd(A0, B1,  C1, T0);                                     \
-              traits.madd(A1, B1,  C5, B1);                                     \
-              traits.madd(A0, B2,  C2, T0);                                     \
-              traits.madd(A1, B2,  C6, B2);                                     \
-              traits.madd(A0, B3,  C3, T0);                                     \
-              traits.madd(A1, B3,  C7, B3);                                     \
-              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                               \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");          \
-            } while(false)
-            
+#define EIGEN_GEBGP_ONESTEP(K)                                            \
+            do {                                                          \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");  \
+              traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);        \
+              traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);        \
+              traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                 \
+              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                 \
+              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                 \
+              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                 \
+              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                 \
+              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                 \
+              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                 \
+              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                 \
+              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                         \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");    \
+            } while (false)
+
             internal::prefetch(blB+(48+0));
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
@@ -1707,7 +1872,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           // process remaining peeled loop
           for(Index k=peeled_kc; k<depth; k++)
           {
-            RhsPacket B_0, B1, B2, B3, T0;
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
             EIGEN_GEBGP_ONESTEP(0);
             blB += 4*RhsProgress;
             blA += 2*Traits::LhsProgress;
@@ -1778,8 +1944,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
               traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
               traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
-              traits.madd(A0, B_0, C0, B1);                                       \
-              traits.madd(A1, B_0, C4, B_0);                                      \
+              traits.madd(A0, B_0, C0, B1, fix<0>);                               \
+              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
               EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
             } while(false)
         
@@ -1882,15 +2048,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
               straits.loadRhsQuad(blA+0*spk, B_0);
               straits.loadRhsQuad(blA+1*spk, B_1);
-              straits.madd(A0,B_0,C0,B_0);
-              straits.madd(A1,B_1,C1,B_1);
+              straits.madd(A0,B_0,C0,B_0, fix<0>);
+              straits.madd(A1,B_1,C1,B_1, fix<0>);
 
               straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
               straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
               straits.loadRhsQuad(blA+2*spk, B_0);
               straits.loadRhsQuad(blA+3*spk, B_1);
-              straits.madd(A0,B_0,C2,B_0);
-              straits.madd(A1,B_1,C3,B_1);
+              straits.madd(A0,B_0,C2,B_0, fix<0>);
+              straits.madd(A1,B_1,C3,B_1, fix<0>);
 
               blB += 4*SwappedTraits::LhsProgress;
               blA += 4*spk;
@@ -1903,7 +2069,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
               straits.loadLhsUnaligned(blB, A0);
               straits.loadRhsQuad(blA, B_0);
-              straits.madd(A0,B_0,C0,B_0);
+              straits.madd(A0,B_0,C0,B_0, fix<0>);
 
               blB += SwappedTraits::LhsProgress;
               blA += spk;
@@ -1927,7 +2093,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
                 straits.loadLhsUnaligned(blB, a0);
                 straits.loadRhs(blA, b0);
                 SAccPacketHalf c0 = predux_half_dowto4(C0);
-                straits.madd(a0,b0,c0,b0);
+                straits.madd(a0,b0,c0,b0, fix<0>);
                 straits.acc(c0, alphav, R);
               }
               else
@@ -2273,7 +2439,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
     }
 
     pack -= psize;
-    int left = rows - i;
+    Index left = rows - i;
     if (pack <= 0) {
       if (!gone_last &&
           (starting_pos == i || left >= psize/2 || left >= psize/4) &&
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 121476394..68765d4b2 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -29,10 +29,15 @@
  *
  * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
  * vectorized and non-vectorized code.
+ * 
+ * FIXME: this code can be cleaned up once we switch to proper C++11 only.
  */
 #if (defined EIGEN_CUDACC)
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
   #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_HAS_ALIGNAS
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
+  #define EIGEN_ALIGNOF(x) alignof(x)
 #elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
   #define EIGEN_ALIGNOF(x) __alignof(x)
@@ -44,7 +49,7 @@
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
   #define EIGEN_ALIGNOF(x) __alignof(x)
 #else
-  #error Please tell me what is the equivalent of __attribute__((aligned(n))) and __alignof(x) for your compiler
+  #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler
 #endif
 
 // If the user explicitly disable vectorization, then we also disable alignment
@@ -381,7 +386,7 @@
 #endif
 
 #if defined(EIGEN_HAS_CUDA_FP16)
-  #include <host_defines.h>
+  #include <cuda_runtime_api.h>
   #include <cuda_fp16.h>
 #endif
 
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 3ab3a5f50..5d86a51ac 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -260,6 +260,7 @@ template<typename MatrixType> class HouseholderQR;
 template<typename MatrixType> class ColPivHouseholderQR;
 template<typename MatrixType> class FullPivHouseholderQR;
 template<typename MatrixType> class CompleteOrthogonalDecomposition;
+template<typename MatrixType> class SVDBase;
 template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
 template<typename MatrixType> class BDCSVD;
 template<typename MatrixType, int UpLo = Lower> class LLT;
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index c7dba1fc4..3a8001e8f 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -129,16 +129,21 @@
   #define EIGEN_COMP_MSVC_STRICT 0
 #endif
 
-/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++
-#if defined(__IBMCPP__) || defined(__xlc__)
-  #define EIGEN_COMP_IBM 1
+/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++
+// XLC   version
+// 3.1   0x0301	
+// 4.5   0x0405	
+// 5.0   0x0500
+// 12.1  0x0C01
+#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__)
+  #define EIGEN_COMP_IBM __xlC__
 #else
   #define EIGEN_COMP_IBM 0
 #endif
 
-/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler
+/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler
 #if defined(__PGI)
-  #define EIGEN_COMP_PGI 1
+  #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__)
 #else
   #define EIGEN_COMP_PGI 0
 #endif
@@ -347,9 +352,17 @@
   #define EIGEN_OS_WIN_STRICT 0
 #endif
 
-/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
+/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN
+// compiler  solaris   __SUNPRO_C
+// version   studio
+// 5.7       10        0x570
+// 5.8       11        0x580
+// 5.9       12        0x590
+// 5.10	     12.1      0x5100
+// 5.11	     12.2      0x5110
+// 5.12	     12.3      0x5120
 #if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
-  #define EIGEN_OS_SUN 1
+  #define EIGEN_OS_SUN __SUNPRO_C
 #else
   #define EIGEN_OS_SUN 0
 #endif
@@ -546,6 +559,22 @@
 #endif
 #endif
 
+#ifndef EIGEN_HAS_ALIGNAS
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 &&   \
+      (     __has_feature(cxx_alignas)            \
+        ||  EIGEN_HAS_CXX14                       \
+        || (EIGEN_COMP_MSVC >= 1800)              \
+        || (EIGEN_GNUC_AT_LEAST(4,8))             \
+        || (EIGEN_COMP_CLANG>=305)                \
+        || (EIGEN_COMP_ICC>=1500)                 \
+        || (EIGEN_COMP_PGI>=1500)                 \
+        || (EIGEN_COMP_SUNCC>=0x5130))
+#define EIGEN_HAS_ALIGNAS 1
+#else
+#define EIGEN_HAS_ALIGNAS 0
+#endif
+#endif
+
 // Does the compiler support type_traits?
 // - full support of type traits was added only to GCC 5.1.0.
 // - 20150626 corresponds to the last release of 4.x libstdc++
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 1415b3fc1..8fcb18a94 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -636,8 +636,41 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
 bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
 #endif
 
+/** \internal extract the bits of the float \a x */
+inline unsigned int as_uint(float x)
+{
+  unsigned int ret;
+  std::memcpy(&ret, &x, sizeof(float));
+  return ret;
+}
+
 } // end namespace numext
 
 } // end namespace Eigen
 
+// Define portable (u)int{32,64} types
+#if EIGEN_HAS_CXX11
+#include <cstdint>
+namespace Eigen {
+namespace numext {
+typedef std::uint32_t uint32_t;
+typedef std::int32_t  int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t  int64_t;
+}
+}
+#else
+// Without c++11, all compilers able to compile Eigen also
+// provides the C99 stdint.h header file.
+#include <stdint.h>
+namespace Eigen {
+namespace numext {
+typedef ::uint32_t uint32_t;
+typedef ::int32_t  int32_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t  int64_t;
+}
+}
+#endif
+
 #endif // EIGEN_META_H
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index b2f95153e..67714e444 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -104,7 +104,8 @@
         STORAGE_INDEX_MUST_MATCH=1,
         CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
         SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
-        INVALID_TEMPLATE_PARAMETER=1
+        INVALID_TEMPLATE_PARAMETER=1,
+        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1
       };
     };
 
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 75991aaed..3090351a0 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -97,6 +97,9 @@ template<int Mode> struct transform_make_affine;
   *              - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix.
   *              - #Projective: the transformation is stored as a (Dim+1)^2 matrix
   *                             without any assumption.
+  *              - #Isometry: same as #Affine with the additional assumption that
+  *                           the linear part represents a rotation. This assumption is exploited
+  *                           to speed up some functions such as inverse() and rotation().
   * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor.
   *                  These Options are passed directly to the underlying matrix type.
   *
@@ -252,11 +255,11 @@ protected:
 public:
 
   /** Default constructor without initialization of the meaningful coefficients.
-    * If Mode==Affine, then the last row is set to [0 ... 0 1] */
+    * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */
   EIGEN_DEVICE_FUNC inline Transform()
   {
     check_template_params();
-    internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
+    internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix);
   }
 
   EIGEN_DEVICE_FUNC inline Transform(const Transform& other)
@@ -602,7 +605,9 @@ public:
   template<typename Derived>
   EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
 
-  EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const;
+  typedef typename internal::conditional<int(Mode)==Isometry,ConstLinearPart,const LinearMatrixType>::type RotationReturnType;
+  EIGEN_DEVICE_FUNC RotationReturnType rotation() const;
+
   template<typename RotationMatrixType, typename ScalingMatrixType>
   EIGEN_DEVICE_FUNC
   void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
@@ -1046,20 +1051,43 @@ EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim
 *** Special functions ***
 ************************/
 
+namespace internal {
+template<int Mode> struct transform_rotation_impl {
+  template<typename TransformType>
+  EIGEN_DEVICE_FUNC static inline
+  const typename TransformType::LinearMatrixType run(const TransformType& t)
+  {
+    typedef typename TransformType::LinearMatrixType LinearMatrixType; 
+    LinearMatrixType result;
+    t.computeRotationScaling(&result, (LinearMatrixType*)0);
+    return result;
+  }
+};
+template<> struct transform_rotation_impl<Isometry> {
+  template<typename TransformType>
+  EIGEN_DEVICE_FUNC static inline
+  typename TransformType::ConstLinearPart run(const TransformType& t)
+  {
+    return t.linear();
+  }
+};
+}
 /** \returns the rotation part of the transformation
   *
+  * If Mode==Isometry, then this method is an alias for linear(),
+  * otherwise it calls computeRotationScaling() to extract the rotation
+  * through a SVD decomposition.
   *
   * \svd_module
   *
   * \sa computeRotationScaling(), computeScalingRotation(), class SVD
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-EIGEN_DEVICE_FUNC const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
+EIGEN_DEVICE_FUNC
+typename Transform<Scalar,Dim,Mode,Options>::RotationReturnType
 Transform<Scalar,Dim,Mode,Options>::rotation() const
 {
-  LinearMatrixType result;
-  computeRotationScaling(&result, (LinearMatrixType*)0);
-  return result;
+  return internal::transform_rotation_impl<Mode>::run(*this);
 }
 
 
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index e62befcb6..9318c281f 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -156,6 +156,12 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       Side
     > TransposeReturnType;
 
+    typedef HouseholderSequence<
+      typename internal::add_const<VectorsType>::type,
+      typename internal::add_const<CoeffsType>::type,
+      Side
+    > ConstHouseholderSequence;
+
     /** \brief Constructor.
       * \param[in]  v      %Matrix containing the essential parts of the Householder vectors
       * \param[in]  h      Vector containing the Householder coefficients
@@ -244,6 +250,18 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
              .setShift(m_shift);
     }
 
+    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+     *           returns \c *this otherwise.
+     */
+    template<bool Cond>
+    EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type
+    conjugateIf() const
+    {
+      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type ReturnType;
+      return ReturnType(m_vectors.template conjugateIf<Cond>(), m_coeffs.template conjugateIf<Cond>());
+    }
+
     /** \brief Adjoint (conjugate transpose) of the Householder sequence. */
     AdjointReturnType adjoint() const
     {
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 344ec8926..ef93ec5eb 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -18,6 +18,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
 {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -62,9 +63,9 @@ template<typename _MatrixType> class FullPivLU
   public:
     typedef _MatrixType MatrixType;
     typedef SolverBase<FullPivLU> Base;
+    friend class SolverBase<FullPivLU>;
 
     EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
-    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -218,6 +219,7 @@ template<typename _MatrixType> class FullPivLU
       return internal::image_retval<FullPivLU>(*this, originalMatrix);
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \return a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the LU decomposition.
       *
@@ -237,14 +239,10 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa TriangularView::solve(), kernel(), inverse()
       */
-    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
     inline const Solve<FullPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return Solve<FullPivLU, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
         the LU decomposition.
@@ -529,8 +527,8 @@ void FullPivLU<MatrixType>::computeInPlace()
       m_nonzero_pivots = k;
       for(Index i = k; i < size; ++i)
       {
-        m_rowsTranspositions.coeffRef(i) = i;
-        m_colsTranspositions.coeffRef(i) = i;
+        m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
       }
       break;
     }
@@ -541,8 +539,8 @@ void FullPivLU<MatrixType>::computeInPlace()
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
 
-    m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner;
-    m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner;
+    m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+    m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
     if(k != row_of_biggest_in_corner) {
       m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner));
       ++number_of_transpositions;
@@ -755,7 +753,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
   const Index rows = this->rows(),
               cols = this->cols(),
               nonzero_pivots = this->rank();
-  eigen_assert(rhs.rows() == rows);
   const Index smalldim = (std::min)(rows, cols);
 
   if(nonzero_pivots == 0)
@@ -805,7 +802,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
 
   const Index rows = this->rows(), cols = this->cols(),
     nonzero_pivots = this->rank();
-   eigen_assert(rhs.rows() == cols);
   const Index smalldim = (std::min)(rows, cols);
 
   if(nonzero_pivots == 0)
@@ -819,29 +815,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
   // Step 1
   c = permutationQ().inverse() * rhs;
 
-  if (Conjugate) {
-    // Step 2
-    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
-        .template triangularView<Upper>()
-        .adjoint()
-        .solveInPlace(c.topRows(nonzero_pivots));
-    // Step 3
-    m_lu.topLeftCorner(smalldim, smalldim)
-        .template triangularView<UnitLower>()
-        .adjoint()
-        .solveInPlace(c.topRows(smalldim));
-  } else {
-    // Step 2
-    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
-        .template triangularView<Upper>()
-        .transpose()
-        .solveInPlace(c.topRows(nonzero_pivots));
-    // Step 3
-    m_lu.topLeftCorner(smalldim, smalldim)
-        .template triangularView<UnitLower>()
-        .transpose()
-        .solveInPlace(c.topRows(smalldim));
-  }
+  // Step 2
+  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(nonzero_pivots));
+
+  // Step 3
+  m_lu.topLeftCorner(smalldim, smalldim)
+      .template triangularView<UnitLower>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(smalldim));
 
   // Step 4
   PermutationPType invp = permutationP().inverse().eval();
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index bfcd2c95b..b414b5c46 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -19,6 +19,7 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
 {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   typedef traits<_MatrixType> BaseTraits;
   enum {
     Flags = BaseTraits::Flags & RowMajorBit,
@@ -79,8 +80,9 @@ template<typename _MatrixType> class PartialPivLU
 
     typedef _MatrixType MatrixType;
     typedef SolverBase<PartialPivLU> Base;
+    friend class SolverBase<PartialPivLU>;
+
     EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
-    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -152,6 +154,7 @@ template<typename _MatrixType> class PartialPivLU
       return m_p;
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method returns the solution x to the equation Ax=b, where A is the matrix of which
       * *this is the LU decomposition.
       *
@@ -169,14 +172,10 @@ template<typename _MatrixType> class PartialPivLU
       *
       * \sa TriangularView::solve(), inverse(), computeInverse()
       */
-    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
     inline const Solve<PartialPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return Solve<PartialPivLU, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
         the LU decomposition.
@@ -231,8 +230,6 @@ template<typename _MatrixType> class PartialPivLU
       * Step 3: replace c by the solution x to Ux = c.
       */
 
-      eigen_assert(rhs.rows() == m_lu.rows());
-
       // Step 1
       dst = permutationP() * rhs;
 
@@ -246,26 +243,21 @@ template<typename _MatrixType> class PartialPivLU
     template<bool Conjugate, typename RhsType, typename DstType>
     EIGEN_DEVICE_FUNC
     void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const {
-     /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+     /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P.
       * So we proceed as follows:
-      * Step 1: compute c = Pb.
-      * Step 2: replace c by the solution x to Lx = c.
-      * Step 3: replace c by the solution x to Ux = c.
+      * Step 1: compute c as the solution to L^T c = b
+      * Step 2: replace c by the solution x to U^T x = c.
+      * Step 3: update  c = P^-1 c.
       */
 
       eigen_assert(rhs.rows() == m_lu.cols());
 
-      if (Conjugate) {
-        // Step 1
-        dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs);
-        // Step 2
-        m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst);
-      } else {
-        // Step 1
-        dst = m_lu.template triangularView<Upper>().transpose().solve(rhs);
-        // Step 2
-        m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst);
-      }
+      // Step 1
+      dst = m_lu.template triangularView<Upper>().transpose()
+                .template conjugateIf<Conjugate>().solve(rhs);
+      // Step 2
+      m_lu.template triangularView<UnitLower>().transpose()
+          .template conjugateIf<Conjugate>().solveInPlace(dst);
       // Step 3
       dst = permutationP().transpose() * dst;
     }
@@ -519,7 +511,10 @@ void PartialPivLU<MatrixType>::compute()
   // the row permutation is stored as int indices, so just to be sure:
   eigen_assert(m_lu.rows()<NumTraits<int>::highest());
 
-  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+  if(m_lu.cols()>0)
+    m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+  else
+    m_l1_norm = RealScalar(0);
 
   eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
   const Index size = m_lu.rows();
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 1faa3442e..9b677e9bf 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -17,6 +17,9 @@ namespace internal {
 template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
  : traits<_MatrixType>
 {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -46,20 +49,19 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
   * \sa MatrixBase::colPivHouseholderQr()
   */
 template<typename _MatrixType> class ColPivHouseholderQR
+        : public SolverBase<ColPivHouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<ColPivHouseholderQR> Base;
+    friend class SolverBase<ColPivHouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    // FIXME should be int
-    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
     typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
@@ -156,6 +158,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
       computeInPlace();
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -172,11 +175,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
       */
     template<typename Rhs>
     inline const Solve<ColPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return Solve<ColPivHouseholderQR, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     HouseholderSequenceType householderQ() const;
     HouseholderSequenceType matrixQ() const
@@ -417,6 +417,9 @@ template<typename _MatrixType> class ColPivHouseholderQR
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -583,8 +586,6 @@ template<typename _MatrixType>
 template<typename RhsType, typename DstType>
 void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
-
   const Index nonzero_pivots = nonzeroPivots();
 
   if(nonzero_pivots == 0)
@@ -604,6 +605,31 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
   for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i);
   for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
 }
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index nonzero_pivots = nonzeroPivots();
+
+  if(nonzero_pivots == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs);
+
+  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+        .template triangularView<Upper>()
+        .transpose().template conjugateIf<Conjugate>()
+        .solveInPlace(c.topRows(nonzero_pivots));
+
+  dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots);
+  dst.bottomRows(rows()-nonzero_pivots).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf<!Conjugate>() );
+}
 #endif
 
 namespace internal {
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 03017a375..2fc3c871a 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -16,6 +16,9 @@ namespace internal {
 template <typename _MatrixType>
 struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
     : traits<_MatrixType> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -44,19 +47,21 @@ struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
   * 
   * \sa MatrixBase::completeOrthogonalDecomposition()
   */
-template <typename _MatrixType>
-class CompleteOrthogonalDecomposition {
+template <typename _MatrixType> class CompleteOrthogonalDecomposition
+          : public SolverBase<CompleteOrthogonalDecomposition<_MatrixType> >
+{
  public:
   typedef _MatrixType MatrixType;
+  typedef SolverBase<CompleteOrthogonalDecomposition> Base;
+
+  template<typename Derived>
+  friend struct internal::solve_assertion;
+
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition)
   enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
       PermutationType;
@@ -131,9 +136,9 @@ class CompleteOrthogonalDecomposition {
       m_temp(matrix.cols())
   {
     computeInPlace();
-  }
-
+  } 
 
+  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** This method computes the minimum-norm solution X to a least squares
    * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
    * which \c *this is the complete orthogonal decomposition.
@@ -145,11 +150,8 @@ class CompleteOrthogonalDecomposition {
    */
   template <typename Rhs>
   inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve(
-      const MatrixBase<Rhs>& b) const {
-    eigen_assert(m_cpqr.m_isInitialized &&
-                 "CompleteOrthogonalDecomposition is not initialized.");
-    return Solve<CompleteOrthogonalDecomposition, Rhs>(*this, b.derived());
-  }
+      const MatrixBase<Rhs>& b) const;
+  #endif
 
   HouseholderSequenceType householderQ(void) const;
   HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); }
@@ -158,8 +160,8 @@ class CompleteOrthogonalDecomposition {
    */
   MatrixType matrixZ() const {
     MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols());
-    applyZAdjointOnTheLeftInPlace(Z);
-    return Z.adjoint();
+    applyZOnTheLeftInPlace<false>(Z);
+    return Z;
   }
 
   /** \returns a reference to the matrix where the complete orthogonal
@@ -275,6 +277,7 @@ class CompleteOrthogonalDecomposition {
    */
   inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const
   {
+    eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
     return Inverse<CompleteOrthogonalDecomposition>(*this);
   }
 
@@ -368,6 +371,9 @@ class CompleteOrthogonalDecomposition {
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
   void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template<bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
 #endif
 
  protected:
@@ -375,8 +381,22 @@ class CompleteOrthogonalDecomposition {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
 
+  template<bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+      EIGEN_ONLY_USED_FOR_DEBUG(b);
+      eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
+      eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+  }
+
   void computeInPlace();
 
+  /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or
+   *  \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate 
+   *  is set to \c true.
+   */
+  template <bool Conjugate, typename Rhs>
+  void applyZOnTheLeftInPlace(Rhs& rhs) const;
+
   /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
    */
   template <typename Rhs>
@@ -465,13 +485,35 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
 }
 
 template <typename MatrixType>
+template <bool Conjugate, typename Rhs>
+void CompleteOrthogonalDecomposition<MatrixType>::applyZOnTheLeftInPlace(
+    Rhs& rhs) const {
+  const Index cols = this->cols();
+  const Index nrhs = rhs.cols();
+  const Index rank = this->rank();
+  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  for (Index k = rank-1; k >= 0; --k) {
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+    rhs.middleRows(rank - 1, cols - rank + 1)
+        .applyHouseholderOnTheLeft(
+            matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf<!Conjugate>(), zCoeffs().template conjugateIf<Conjugate>()(k),
+            &temp(0));
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+  }
+}
+
+template <typename MatrixType>
 template <typename Rhs>
 void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
     Rhs& rhs) const {
   const Index cols = this->cols();
   const Index nrhs = rhs.cols();
   const Index rank = this->rank();
-  Matrix<typename MatrixType::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
   for (Index k = 0; k < rank; ++k) {
     if (k != rank - 1) {
       rhs.row(k).swap(rhs.row(rank - 1));
@@ -491,8 +533,6 @@ template <typename _MatrixType>
 template <typename RhsType, typename DstType>
 void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
     const RhsType& rhs, DstType& dst) const {
-  eigen_assert(rhs.rows() == this->rows());
-
   const Index rank = this->rank();
   if (rank == 0) {
     dst.setZero();
@@ -520,6 +560,34 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
   // Undo permutation to get x = P^{-1} * y.
   dst = colsPermutation() * dst;
 }
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = this->rank();
+
+  if (rank == 0) {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(colsPermutation().transpose()*rhs);
+
+  if (rank < cols()) {
+    applyZOnTheLeftInPlace<!Conjugate>(c);
+  }
+
+  matrixT().topLeftCorner(rank, rank)
+           .template triangularView<Upper>()
+           .transpose().template conjugateIf<Conjugate>()
+           .solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(rows()-rank).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
+}
 #endif
 
 namespace internal {
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index c31e47cc4..d0664a1d8 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -18,6 +18,9 @@ namespace internal {
 template<typename _MatrixType> struct traits<FullPivHouseholderQR<_MatrixType> >
  : traits<_MatrixType>
 {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -55,20 +58,19 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
   * \sa MatrixBase::fullPivHouseholderQr()
   */
 template<typename _MatrixType> class FullPivHouseholderQR
+        : public SolverBase<FullPivHouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<FullPivHouseholderQR> Base;
+    friend class SolverBase<FullPivHouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    // FIXME should be int
-    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef Matrix<StorageIndex, 1,
@@ -156,6 +158,7 @@ template<typename _MatrixType> class FullPivHouseholderQR
       computeInPlace();
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * \c *this is the QR decomposition.
       *
@@ -173,11 +176,8 @@ template<typename _MatrixType> class FullPivHouseholderQR
       */
     template<typename Rhs>
     inline const Solve<FullPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return Solve<FullPivHouseholderQR, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** \returns Expression object representing the matrix Q
       */
@@ -396,6 +396,9 @@ template<typename _MatrixType> class FullPivHouseholderQR
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -498,15 +501,15 @@ void FullPivHouseholderQR<MatrixType>::computeInPlace()
       m_nonzero_pivots = k;
       for(Index i = k; i < size; i++)
       {
-        m_rows_transpositions.coeffRef(i) = i;
-        m_cols_transpositions.coeffRef(i) = i;
+        m_rows_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_cols_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
         m_hCoeffs.coeffRef(i) = Scalar(0);
       }
       break;
     }
 
-    m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner;
-    m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner;
+    m_rows_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+    m_cols_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
     if(k != row_of_biggest_in_corner) {
       m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
       ++number_of_transpositions;
@@ -540,7 +543,6 @@ template<typename _MatrixType>
 template<typename RhsType, typename DstType>
 void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
   const Index l_rank = rank();
 
   // FIXME introduce nonzeroPivots() and use it here. and more generally,
@@ -553,7 +555,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
 
   typename RhsType::PlainObject c(rhs);
 
-  Matrix<Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
+  Matrix<typename RhsType::Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
   for (Index k = 0; k < l_rank; ++k)
   {
     Index remainingSize = rows()-k;
@@ -570,6 +572,42 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
   for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i);
   for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
 }
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index l_rank = rank();
+
+  if(l_rank == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs);
+
+  m_qr.topLeftCorner(l_rank, l_rank)
+         .template triangularView<Upper>()
+         .transpose().template conjugateIf<Conjugate>()
+         .solveInPlace(c.topRows(l_rank));
+
+  dst.topRows(l_rank) = c.topRows(l_rank);
+  dst.bottomRows(rows()-l_rank).setZero();
+
+  Matrix<Scalar, 1, DstType::ColsAtCompileTime> temp(dst.cols());
+  const Index size = (std::min)(rows(), cols());
+  for (Index k = size-1; k >= 0; --k)
+  {
+    Index remainingSize = rows()-k;
+
+    dst.bottomRightCorner(remainingSize, dst.cols())
+       .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf<!Conjugate>(),
+                                  m_hCoeffs.template conjugateIf<Conjugate>().coeff(k), &temp.coeffRef(0));
+
+    dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k)));
+  }
+}
 #endif
 
 namespace internal {
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 33cb9c8ff..801739fbd 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -14,6 +14,18 @@
 
 namespace Eigen { 
 
+namespace internal {
+template<typename _MatrixType> struct traits<HouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
+
 /** \ingroup QR_Module
   *
   *
@@ -42,20 +54,19 @@ namespace Eigen {
   * \sa MatrixBase::householderQr()
   */
 template<typename _MatrixType> class HouseholderQR
+        : public SolverBase<HouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<HouseholderQR> Base;
+    friend class SolverBase<HouseholderQR>;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR)
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    // FIXME should be int
-    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
@@ -121,6 +132,7 @@ template<typename _MatrixType> class HouseholderQR
       computeInPlace();
     }
 
+    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -137,11 +149,8 @@ template<typename _MatrixType> class HouseholderQR
       */
     template<typename Rhs>
     inline const Solve<HouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
-      return Solve<HouseholderQR, Rhs>(*this, b.derived());
-    }
+    solve(const MatrixBase<Rhs>& b) const;
+    #endif
 
     /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
       *
@@ -214,6 +223,9 @@ template<typename _MatrixType> class HouseholderQR
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -349,7 +361,6 @@ template<typename RhsType, typename DstType>
 void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
   const Index rank = (std::min)(rows(), cols());
-  eigen_assert(rhs.rows() == rows());
 
   typename RhsType::PlainObject c(rhs);
 
@@ -362,6 +373,25 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
   dst.topRows(rank) = c.topRows(rank);
   dst.bottomRows(cols()-rank).setZero();
 }
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = (std::min)(rows(), cols());
+
+  typename RhsType::PlainObject c(rhs);
+
+  m_qr.topLeftCorner(rank, rank)
+      .template triangularView<Upper>()
+      .transpose().template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(rows()-rank).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
+}
 #endif
 
 /** Performs the QR factorization of the given matrix \a matrix. The result of
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 18d7bdc0a..e3fddacbc 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -39,6 +39,7 @@ namespace internal {
 
 template<typename _MatrixType> 
 struct traits<BDCSVD<_MatrixType> >
+        : traits<_MatrixType>
 {
   typedef _MatrixType MatrixType;
 };  
@@ -1006,7 +1007,7 @@ void BDCSVD<MatrixType>::perturbCol0
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
       assert((std::isfinite)(tmp));
 #endif
-      zhat(k) = col0(k) > Literal(0) ? tmp : -tmp;
+      zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
     }
   }
 }
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 1c7c80376..2b6891105 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -425,6 +425,7 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
 
 template<typename _MatrixType, int QRPreconditioner> 
 struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
+        : traits<_MatrixType>
 {
   typedef _MatrixType MatrixType;
 };
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index 851ad6836..68df48921 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -17,6 +17,18 @@
 #define EIGEN_SVDBASE_H
 
 namespace Eigen {
+
+namespace internal {
+template<typename Derived> struct traits<SVDBase<Derived> >
+ : traits<Derived>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+}
+
 /** \ingroup SVD_Module
  *
  *
@@ -44,15 +56,18 @@ namespace Eigen {
  * terminate in finite (and reasonable) time.
  * \sa class BDCSVD, class JacobiSVD
  */
-template<typename Derived>
-class SVDBase
+template<typename Derived> class SVDBase
+ : public SolverBase<SVDBase<Derived> >
 {
+public: 
+   
+  template<typename Derived_>
+  friend struct internal::solve_assertion;
 
-public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef typename Eigen::internal::traits<SVDBase>::StorageIndex StorageIndex;
   typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
@@ -194,6 +209,7 @@ public:
   inline Index rows() const { return m_rows; }
   inline Index cols() const { return m_cols; }
   
+  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
     *
     * \param b the right-hand-side of the equation to solve.
@@ -205,16 +221,15 @@ public:
     */
   template<typename Rhs>
   inline const Solve<Derived, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
-  {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
-    eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-    return Solve<Derived, Rhs>(derived(), b.derived());
-  }
-  
+  solve(const MatrixBase<Rhs>& b) const;
+  #endif
+
   #ifndef EIGEN_PARSED_BY_DOXYGEN
   template<typename RhsType, typename DstType>
   void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+  template<bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
   #endif
 
 protected:
@@ -223,6 +238,14 @@ protected:
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
+
+  template<bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+      EIGEN_ONLY_USED_FOR_DEBUG(b);
+      eigen_assert(m_isInitialized && "SVD is not initialized.");
+      eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice).");
+      eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b");
+  }
   
   // return true if already allocated
   bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
@@ -263,17 +286,30 @@ template<typename Derived>
 template<typename RhsType, typename DstType>
 void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  eigen_assert(rhs.rows() == rows());
-
   // A = U S V^*
   // So A^{-1} = V S^{-1} U^*
 
-  Matrix<Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
   Index l_rank = rank();
   tmp.noalias() =  m_matrixU.leftCols(l_rank).adjoint() * rhs;
   tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
   dst = m_matrixV.leftCols(l_rank) * tmp;
 }
+
+template<typename Derived>
+template<bool Conjugate, typename RhsType, typename DstType>
+void SVDBase<Derived>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  // A = U S V^*
+  // So  A^{-*} = U S^{-1} V^*
+  // And A^{-T} = U_conj S^{-1} V^T
+  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Index l_rank = rank();
+
+  tmp.noalias() =  m_matrixV.leftCols(l_rank).transpose().template conjugateIf<Conjugate>() * rhs;
+  tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
+  dst = m_matrixU.template conjugateIf<!Conjugate>().leftCols(l_rank) * tmp;
+}
 #endif
 
 template<typename MatrixType>
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index d89fa0dae..acd986fab 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -207,6 +207,22 @@ class CompressedStorage
       return m_values[id];
     }
 
+    void moveChunk(Index from, Index to, Index chunkSize)
+    {
+      eigen_internal_assert(to+chunkSize <= m_size);
+      if(to>from && from+chunkSize>to)
+      {
+        // move backward
+        internal::smart_memmove(m_values+from,  m_values+from+chunkSize,  m_values+to);
+        internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+      }
+      else
+      {
+        internal::smart_copy(m_values+from,  m_values+from+chunkSize,  m_values+to);
+        internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+      }
+    }
+
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       Index k = 0;
diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h
index 71452e75e..905485c88 100644
--- a/Eigen/src/SparseCore/SparseAssign.h
+++ b/Eigen/src/SparseCore/SparseAssign.h
@@ -246,35 +246,22 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
 {
   typedef typename DstXprType::StorageIndex StorageIndex;
   typedef typename DstXprType::Scalar Scalar;
-  typedef Array<StorageIndex,Dynamic,1> ArrayXI;
-  typedef Array<Scalar,Dynamic,1> ArrayXS;
-  template<int Options>
-  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  {
-    Index dstRows = src.rows();
-    Index dstCols = src.cols();
-    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
-      dst.resize(dstRows, dstCols);
 
-    Index size = src.diagonal().size();
-    dst.makeCompressed();
-    dst.resizeNonZeros(size);
-    Map<ArrayXI>(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1);
-    Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size));
-    Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal();
-  }
+  template<int Options, typename AssignFunc>
+  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const AssignFunc &func)
+  { dst.assignDiagonal(src.diagonal(), func); }
   
   template<typename DstDerived>
   static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  {
-    dst.diagonal() = src.diagonal();
-  }
+  { dst.derived().diagonal() = src.diagonal(); }
   
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  { dst.diagonal() += src.diagonal(); }
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.derived().diagonal() += src.diagonal(); }
   
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  { dst.diagonal() -= src.diagonal(); }
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.derived().diagonal() -= src.diagonal(); }
 };
 } // end namespace internal
 
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index e0b3c22b6..6a2c7a8ce 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -128,6 +128,28 @@ class SparseCompressedBase
   protected:
     /** Default constructor. Do nothing. */
     SparseCompressedBase() {}
+
+    /** \internal return the index of the coeff at (row,col) or just before if it does not exist.
+      * This is an analogue of std::lower_bound.
+      */
+    internal::LowerBoundIndex lower_bound(Index row, Index col) const
+    {
+      eigen_internal_assert(row>=0 && row<this->rows() && col>=0 && col<this->cols());
+
+      const Index outer = Derived::IsRowMajor ? row : col;
+      const Index inner = Derived::IsRowMajor ? col : row;
+
+      Index start = this->outerIndexPtr()[outer];
+      Index end = this->isCompressed() ? this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer];
+      eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
+      internal::LowerBoundIndex p;
+      p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr();
+      p.found = (p.value<end) && (this->innerIndexPtr()[p.value]==inner);
+      return p;
+    }
+
+    friend struct internal::evaluator<SparseCompressedBase<Derived> >;
+
   private:
     template<typename OtherDerived> explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&);
 };
@@ -333,17 +355,8 @@ protected:
 
   Index find(Index row, Index col) const
   {
-    eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
-
-    const Index outer = Derived::IsRowMajor ? row : col;
-    const Index inner = Derived::IsRowMajor ? col : row;
-
-    Index start = m_matrix->outerIndexPtr()[outer];
-    Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
-    eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
-    const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr();
-
-    return ((p<end) && (m_matrix->innerIndexPtr()[p]==inner)) ? p : Dynamic;
+    internal::LowerBoundIndex p = m_matrix->lower_bound(row,col);
+    return p.found ? p.value : Dynamic;
   }
 
   const Derived *m_matrix;
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index eedae47e8..63dd1cc32 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -99,6 +99,8 @@ class SparseMatrix
     typedef SparseCompressedBase<SparseMatrix> Base;
     using Base::convert_index;
     friend class SparseVector<_Scalar,0,_StorageIndex>;
+    template<typename, typename, typename, typename, typename>
+    friend struct internal::Assignment;
   public:
     using Base::isCompressed;
     using Base::nonZeros;
@@ -502,7 +504,7 @@ class SparseMatrix
         m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
       }
     }
-    
+
     /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
@@ -895,6 +897,113 @@ public:
       m_data.index(p) = convert_index(inner);
       return (m_data.value(p) = Scalar(0));
     }
+protected:
+    struct IndexPosPair {
+      IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {}
+      Index i;
+      Index p;
+    };
+
+    /** \internal assign \a diagXpr to the diagonal of \c *this
+      * There are different strategies:
+      *   1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression.
+      *   2 - otherwise, for each diagonal coeff,
+      *     2.a - if it already exists, then we update it,
+      *     2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion.
+      *     2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector.
+      *   3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements.
+      * 
+      * TODO: some piece of code could be isolated and reused for a general in-place update strategy.
+      * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once),
+      *       then it *might* be better to disable case 2.b since they will have to be copied anyway.
+      */
+    template<typename DiagXpr, typename Func>
+    void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc)
+    {
+      Index n = diagXpr.size();
+
+      const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
+      if(overwrite)
+      {
+        if((this->rows()!=n) || (this->cols()!=n))
+          this->resize(n, n);
+      }
+
+      if(m_data.size()==0 || overwrite)
+      {
+        typedef Array<StorageIndex,Dynamic,1> ArrayXI;  
+        this->makeCompressed();
+        this->resizeNonZeros(n);
+        Eigen::Map<ArrayXI>(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1);
+        Eigen::Map<ArrayXI>(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n));
+        Eigen::Map<Array<Scalar,Dynamic,1> > values = this->coeffs();
+        values.setZero();
+        internal::call_assignment_no_alias(values, diagXpr, assignFunc);
+      }
+      else
+      {
+        bool isComp = isCompressed();
+        internal::evaluator<DiagXpr> diaEval(diagXpr);
+        std::vector<IndexPosPair> newEntries;
+
+        // 1 - try in-place update and record insertion failures
+        for(Index i = 0; i<n; ++i)
+        {
+          internal::LowerBoundIndex lb = this->lower_bound(i,i);
+          Index p = lb.value;
+          if(lb.found)
+          {
+            // the coeff already exists
+            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+          }
+          else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i]))
+          {
+            // non compressed mode with local room for inserting one element
+            m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p);
+            m_innerNonZeros[i]++;
+            m_data.value(p) = Scalar(0);
+            m_data.index(p) = StorageIndex(i);
+            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+          }
+          else
+          {
+            // defer insertion
+            newEntries.push_back(IndexPosPair(i,p));
+          }
+        }
+        // 2 - insert deferred entries
+        Index n_entries = Index(newEntries.size());
+        if(n_entries>0)
+        {
+          Storage newData(m_data.size()+n_entries);
+          Index prev_p = 0;
+          Index prev_i = 0;
+          for(Index k=0; k<n_entries;++k)
+          {
+            Index i = newEntries[k].i;
+            Index p = newEntries[k].p;
+            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+p, newData.valuePtr()+prev_p+k);
+            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+p, newData.indexPtr()+prev_p+k);
+            for(Index j=prev_i;j<i;++j)
+              m_outerIndex[j+1] += k;
+            if(!isComp)
+              m_innerNonZeros[i]++;
+            prev_p = p;
+            prev_i = i;
+            newData.value(p+k) = Scalar(0);
+            newData.index(p+k) = StorageIndex(i);
+            assignFunc.assignCoeff(newData.value(p+k), diaEval.coeff(i));
+          }
+          {
+            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+m_data.size(), newData.valuePtr()+prev_p+n_entries);
+            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+m_data.size(), newData.indexPtr()+prev_p+n_entries);
+            for(Index j=prev_i+1;j<=m_outerSize;++j)
+              m_outerIndex[j] += n_entries;
+          }
+          m_data.swap(newData);
+        }
+      }
+    }
 
 private:
   static void check_template_parameters()
diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h
index 74df0d496..ceb936887 100644
--- a/Eigen/src/SparseCore/SparseUtil.h
+++ b/Eigen/src/SparseCore/SparseUtil.h
@@ -140,6 +140,14 @@ struct SparseSelfAdjointShape { static std::string debugName() { return "SparseS
 template<> struct glue_shapes<SparseShape,SelfAdjointShape> { typedef SparseSelfAdjointShape type;  };
 template<> struct glue_shapes<SparseShape,TriangularShape > { typedef SparseTriangularShape  type;  };
 
+// return type of SparseCompressedBase::lower_bound;
+struct LowerBoundIndex {
+  LowerBoundIndex() : value(-1), found(false) {}
+  LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {}
+  Index value;
+  bool found;
+};
+
 } // end namespace internal
 
 /** \ingroup SparseCore_Module
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index e928db467..2f99ee0b2 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -23,6 +23,11 @@ typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturn
 typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
 typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
 typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
+#if EIGEN_HAS_CXX11_MATH
+typedef CwiseUnaryOp<internal::scalar_atanh_op<Scalar>, const Derived> AtanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_asinh_op<Scalar>, const Derived> AsinhReturnType;
+typedef CwiseUnaryOp<internal::scalar_acosh_op<Scalar>, const Derived> AcoshReturnType;
+#endif
 typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
 typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
 typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
@@ -327,7 +332,7 @@ sinh() const
   * Example: \include Cwise_cosh.cpp
   * Output: \verbinclude Cwise_cosh.out
   *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tan(), sinh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh()
   */
 EIGEN_DEVICE_FUNC
 inline const CoshReturnType
@@ -336,6 +341,41 @@ cosh() const
   return CoshReturnType(derived());
 }
 
+#if EIGEN_HAS_CXX11_MATH
+/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atanh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AtanhReturnType
+atanh() const
+{
+  return AtanhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asinh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AsinhReturnType
+asinh() const
+{
+  return AsinhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this.
+  *
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acosh">Math functions</a>, atanh(), asinh(), acosh()
+  */
+EIGEN_DEVICE_FUNC
+inline const AcoshReturnType
+acosh() const
+{
+  return AcoshReturnType(derived());
+}
+#endif
+
 /** \returns an expression of the coefficient-wise logistic of *this.
   */
 EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 89f4faaac..5418dc415 100644
--- a/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -76,6 +76,20 @@ conjugate() const
   return ConjugateReturnType(derived());
 }
 
+/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa conjugate()
+template<bool Cond>
+EIGEN_DEVICE_FUNC
+inline typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type
+conjugateIf() const
+{
+  typedef typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type ReturnType;
+  return ReturnType(derived());
+}
+
 /// \returns a read-only expression of the real part of \c *this.
 ///
 EIGEN_DOC_UNARY_ADDONS(real,real part function)