author    Gael Guennebaud <g.gael@free.fr>    2018-09-21 11:51:49 +0200
committer Gael Guennebaud <g.gael@free.fr>    2018-09-21 11:51:49 +0200
commit    a488d59787d25337fd71bcc994d1c2c6ab011731 (patch)
tree      2ac09b50a43bf7586f8df8ac0c069902e326773b /Eigen
parent    47720e797012f6ef2f50da1f0e79fdc4f16e60a1 (diff)
parent    3ec29859146f33e323ebd6d41e46acda63dffcef (diff)
merge with default Eigen
Diffstat (limited to 'Eigen')
-rw-r--r--  Eigen/Cholesky | 5
-rw-r--r--  Eigen/Core | 289
-rw-r--r--  Eigen/Eigenvalues | 8
-rw-r--r--  Eigen/Geometry | 4
-rw-r--r--  Eigen/KLUSupport | 41
-rw-r--r--  Eigen/LU | 4
-rw-r--r--  Eigen/PaStiXSupport | 1
-rw-r--r-- [-rwxr-xr-x]  Eigen/PardisoSupport | 0
-rw-r--r--  Eigen/QR | 8
-rw-r--r--  Eigen/QtAlignedMalloc | 2
-rw-r--r--  Eigen/SVD | 4
-rw-r--r--  Eigen/SparseLU | 4
-rw-r--r--  Eigen/SparseQR | 1
-rw-r--r--  Eigen/src/Cholesky/LDLT.h | 14
-rw-r--r--  Eigen/src/Cholesky/LLT.h | 35
-rw-r--r--  Eigen/src/CholmodSupport/CholmodSupport.h | 86
-rw-r--r--  Eigen/src/Core/ArithmeticSequence.h | 143
-rw-r--r--  Eigen/src/Core/Array.h | 8
-rw-r--r--  Eigen/src/Core/ArrayBase.h | 8
-rw-r--r--  Eigen/src/Core/ArrayWrapper.h | 8
-rw-r--r--  Eigen/src/Core/Assign.h | 2
-rw-r--r--  Eigen/src/Core/AssignEvaluator.h | 10
-rwxr-xr-x  Eigen/src/Core/Assign_MKL.h | 6
-rw-r--r--  Eigen/src/Core/BooleanRedux.h | 6
-rw-r--r--  Eigen/src/Core/CommaInitializer.h | 4
-rw-r--r--  Eigen/src/Core/CoreEvaluators.h | 51
-rw-r--r--  Eigen/src/Core/CwiseBinaryOp.h | 5
-rw-r--r--  Eigen/src/Core/CwiseNullaryOp.h | 120
-rw-r--r--  Eigen/src/Core/DenseBase.h | 15
-rw-r--r--  Eigen/src/Core/DenseStorage.h | 60
-rw-r--r--  Eigen/src/Core/Diagonal.h | 15
-rw-r--r--  Eigen/src/Core/DiagonalMatrix.h | 4
-rw-r--r--  Eigen/src/Core/DiagonalProduct.h | 2
-rw-r--r--  Eigen/src/Core/Dot.h | 23
-rw-r--r--  Eigen/src/Core/EigenBase.h | 4
-rw-r--r--  Eigen/src/Core/Fuzzy.h | 6
-rw-r--r--  Eigen/src/Core/GeneralProduct.h | 38
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 20
-rw-r--r--  Eigen/src/Core/GlobalFunctions.h | 60
-rw-r--r--  Eigen/src/Core/Map.h | 15
-rw-r--r--  Eigen/src/Core/MapBase.h | 6
-rw-r--r--  Eigen/src/Core/MathFunctions.h | 470
-rw-r--r--  Eigen/src/Core/MathFunctionsImpl.h | 24
-rw-r--r--  Eigen/src/Core/MatrixBase.h | 27
-rw-r--r--  Eigen/src/Core/NestByValue.h | 10
-rw-r--r--  Eigen/src/Core/NoAlias.h | 7
-rw-r--r--  Eigen/src/Core/NumTraits.h | 42
-rw-r--r--  Eigen/src/Core/PermutationMatrix.h | 6
-rw-r--r--  Eigen/src/Core/PlainObjectBase.h | 35
-rw-r--r--  Eigen/src/Core/Product.h | 12
-rw-r--r--  Eigen/src/Core/ProductEvaluators.h | 118
-rw-r--r--  Eigen/src/Core/Random.h | 2
-rw-r--r--  Eigen/src/Core/Redux.h | 290
-rw-r--r--  Eigen/src/Core/Ref.h | 2
-rw-r--r--  Eigen/src/Core/Replicate.h | 4
-rw-r--r--  Eigen/src/Core/ReturnByValue.h | 2
-rw-r--r--  Eigen/src/Core/Reverse.h | 6
-rw-r--r--  Eigen/src/Core/SelfAdjointView.h | 10
-rw-r--r--  Eigen/src/Core/SelfCwiseBinaryOp.h | 12
-rw-r--r--  Eigen/src/Core/Solve.h | 6
-rw-r--r--  Eigen/src/Core/SolveTriangular.h | 2
-rw-r--r--  Eigen/src/Core/SolverBase.h | 3
-rw-r--r--  Eigen/src/Core/StableNorm.h | 116
-rw-r--r--  Eigen/src/Core/Transpose.h | 11
-rw-r--r--  Eigen/src/Core/Transpositions.h | 4
-rw-r--r--  Eigen/src/Core/TriangularMatrix.h | 21
-rw-r--r--  Eigen/src/Core/VectorBlock.h | 2
-rw-r--r--  Eigen/src/Core/VectorwiseOp.h | 4
-rw-r--r--  Eigen/src/Core/arch/AVX/Complex.h | 36
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h | 13
-rw-r--r--  Eigen/src/Core/arch/AVX512/MathFunctions.h | 76
-rw-r--r--  Eigen/src/Core/arch/AVX512/PacketMath.h | 80
-rw-r--r--  Eigen/src/Core/arch/AltiVec/Complex.h | 35
-rwxr-xr-x  Eigen/src/Core/arch/AltiVec/PacketMath.h | 52
-rw-r--r--  Eigen/src/Core/arch/CUDA/Complex.h | 6
-rw-r--r--  Eigen/src/Core/arch/Default/ConjHelper.h | 29
-rw-r--r--  Eigen/src/Core/arch/GPU/Half.h (renamed from Eigen/src/Core/arch/CUDA/Half.h) | 297
-rw-r--r--  Eigen/src/Core/arch/GPU/MathFunctions.h (renamed from Eigen/src/Core/arch/CUDA/MathFunctions.h) | 8
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMath.h (renamed from Eigen/src/Core/arch/CUDA/PacketMath.h) | 30
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMathHalf.h (renamed from Eigen/src/Core/arch/CUDA/PacketMathHalf.h) | 453
-rw-r--r--  Eigen/src/Core/arch/GPU/TypeCasting.h (renamed from Eigen/src/Core/arch/CUDA/TypeCasting.h) | 18
-rw-r--r--  Eigen/src/Core/arch/HIP/hcc/math_constants.h | 23
-rw-r--r--  Eigen/src/Core/arch/MSA/Complex.h | 759
-rw-r--r--  Eigen/src/Core/arch/MSA/MathFunctions.h | 387
-rw-r--r--  Eigen/src/Core/arch/MSA/PacketMath.h | 1317
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h | 12
-rw-r--r--  Eigen/src/Core/arch/NEON/MathFunctions.h | 92
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h | 48
-rw-r--r--  Eigen/src/Core/arch/NEON/TypeCasting.h | 48
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h | 40
-rw-r--r--  Eigen/src/Core/arch/SSE/MathFunctions.h | 2
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h | 24
-rw-r--r--  Eigen/src/Core/arch/SSE/TypeCasting.h | 28
-rw-r--r--  Eigen/src/Core/arch/SYCL/InteropHeaders.h | 104
-rw-r--r--  Eigen/src/Core/arch/SYCL/MathFunctions.h | 221
-rw-r--r--  Eigen/src/Core/arch/SYCL/PacketMath.h | 458
-rw-r--r--  Eigen/src/Core/arch/SYCL/TypeCasting.h | 89
-rw-r--r--  Eigen/src/Core/arch/ZVector/Complex.h | 407
-rw-r--r--  Eigen/src/Core/arch/ZVector/MathFunctions.h | 105
-rwxr-xr-x  Eigen/src/Core/arch/ZVector/PacketMath.h | 840
-rw-r--r--  Eigen/src/Core/functors/AssignmentFunctors.h | 2
-rw-r--r--  Eigen/src/Core/functors/BinaryFunctors.h | 29
-rw-r--r--  Eigen/src/Core/functors/NullaryFunctors.h | 11
-rw-r--r--  Eigen/src/Core/functors/StlFunctors.h | 4
-rw-r--r--  Eigen/src/Core/functors/UnaryFunctors.h | 32
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 126
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h | 24
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 25
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h | 12
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h | 19
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector.h | 10
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector_BLAS.h | 19
-rw-r--r--  Eigen/src/Core/products/Parallelizer.h | 22
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 6
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h | 48
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixVector.h | 14
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h | 9
-rw-r--r--  Eigen/src/Core/products/SelfadjointProduct.h | 2
-rw-r--r--  Eigen/src/Core/products/SelfadjointRank2Update.h | 5
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix.h | 41
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h | 39
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixVector.h | 22
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixVector_BLAS.h | 46
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix.h | 4
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h | 40
-rw-r--r--  Eigen/src/Core/products/TriangularSolverVector.h | 23
-rwxr-xr-x  Eigen/src/Core/util/BlasUtil.h | 53
-rw-r--r--  Eigen/src/Core/util/ConfigureVectorization.h | 436
-rw-r--r--  Eigen/src/Core/util/Constants.h | 3
-rwxr-xr-x  Eigen/src/Core/util/DisableStupidWarnings.h | 22
-rw-r--r--  Eigen/src/Core/util/IndexedViewHelper.h | 49
-rw-r--r--  Eigen/src/Core/util/IntegralConstant.h | 4
-rwxr-xr-x  Eigen/src/Core/util/MKL_support.h | 19
-rw-r--r--  Eigen/src/Core/util/Macros.h | 537
-rw-r--r--  Eigen/src/Core/util/Memory.h | 160
-rwxr-xr-x  Eigen/src/Core/util/Meta.h | 135
-rw-r--r--  Eigen/src/Core/util/ReenableStupidWarnings.h | 8
-rw-r--r--  Eigen/src/Core/util/StaticAssert.h | 120
-rw-r--r--  Eigen/src/Core/util/SymbolicIndex.h | 23
-rw-r--r--  Eigen/src/Core/util/XprHelper.h | 43
-rw-r--r--  Eigen/src/Eigenvalues/ComplexEigenSolver.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/ComplexSchur.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/EigenSolver.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 5
-rw-r--r--  Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/HessenbergDecomposition.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h | 6
-rw-r--r--  Eigen/src/Eigenvalues/RealQZ.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/RealSchur.h | 19
-rw-r--r--  Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 26
-rw-r--r--  Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h | 23
-rw-r--r--  Eigen/src/Eigenvalues/Tridiagonalization.h | 9
-rw-r--r--  Eigen/src/Geometry/AngleAxis.h | 2
-rw-r--r--  Eigen/src/Geometry/Hyperplane.h | 2
-rw-r--r--  Eigen/src/Geometry/Quaternion.h | 62
-rwxr-xr-x  Eigen/src/Geometry/Scaling.h | 26
-rw-r--r--  Eigen/src/Geometry/Transform.h | 2
-rw-r--r--  Eigen/src/Geometry/Translation.h | 12
-rw-r--r--  Eigen/src/Geometry/arch/Geometry_SSE.h | 72
-rw-r--r--  Eigen/src/Householder/BlockHouseholder.h | 11
-rw-r--r--  Eigen/src/Householder/Householder.h | 12
-rw-r--r--  Eigen/src/Householder/HouseholderSequence.h | 129
-rw-r--r--  Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h | 27
-rw-r--r--  Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 5
-rw-r--r--  Eigen/src/IterativeLinearSolvers/IncompleteLUT.h | 4
-rw-r--r--  Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h | 2
-rw-r--r--  Eigen/src/IterativeLinearSolvers/SolveWithGuess.h | 2
-rw-r--r--  Eigen/src/Jacobi/Jacobi.h | 278
-rw-r--r--  Eigen/src/KLUSupport/KLUSupport.h | 358
-rw-r--r--  Eigen/src/LU/Determinant.h | 15
-rw-r--r--  Eigen/src/LU/FullPivLU.h | 8
-rw-r--r--  Eigen/src/LU/InverseImpl.h | 4
-rw-r--r--  Eigen/src/LU/PartialPivLU.h | 4
-rw-r--r--  Eigen/src/OrderingMethods/Eigen_Colamd.h | 8
-rw-r--r--  Eigen/src/OrderingMethods/Ordering.h | 2
-rw-r--r--  Eigen/src/PaStiXSupport/PaStiXSupport.h | 10
-rw-r--r--  Eigen/src/PardisoSupport/PardisoSupport.h | 2
-rw-r--r--  Eigen/src/QR/ColPivHouseholderQR.h | 21
-rw-r--r--  Eigen/src/QR/CompleteOrthogonalDecomposition.h | 11
-rw-r--r--  Eigen/src/QR/FullPivHouseholderQR.h | 9
-rw-r--r--  Eigen/src/QR/HouseholderQR.h | 17
-rw-r--r--  Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 2
-rw-r--r--  Eigen/src/SVD/BDCSVD.h | 265
-rw-r--r--  Eigen/src/SVD/JacobiSVD.h | 2
-rw-r--r--  Eigen/src/SVD/JacobiSVD_LAPACKE.h | 5
-rw-r--r--  Eigen/src/SVD/SVDBase.h | 1
-rw-r--r--  Eigen/src/SVD/UpperBidiagonalization.h | 10
-rw-r--r--  Eigen/src/SparseCholesky/SimplicialCholesky.h | 2
-rw-r--r--  Eigen/src/SparseCholesky/SimplicialCholesky_impl.h | 8
-rw-r--r--  Eigen/src/SparseCore/AmbiVector.h | 2
-rw-r--r--  Eigen/src/SparseCore/ConservativeSparseSparseProduct.h | 67
-rw-r--r--  Eigen/src/SparseCore/SparseAssign.h | 4
-rw-r--r--  Eigen/src/SparseCore/SparseBlock.h | 40
-rw-r--r--  Eigen/src/SparseCore/SparseDenseProduct.h | 38
-rw-r--r--  Eigen/src/SparseCore/SparseMatrix.h | 10
-rw-r--r--  Eigen/src/SparseCore/SparseMatrixBase.h | 17
-rw-r--r--  Eigen/src/SparseCore/SparseProduct.h | 2
-rw-r--r--  Eigen/src/SparseCore/SparseSelfAdjointView.h | 11
-rw-r--r--  Eigen/src/SparseCore/SparseSparseProductWithPruning.h | 22
-rw-r--r--  Eigen/src/SparseCore/SparseVector.h | 2
-rw-r--r--  Eigen/src/SparseLU/SparseLU.h | 16
-rw-r--r--  Eigen/src/SparseLU/SparseLU_Memory.h | 2
-rw-r--r--  Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h | 4
-rw-r--r--  Eigen/src/SparseLU/SparseLU_column_dfs.h | 4
-rw-r--r--  Eigen/src/SparseLU/SparseLU_gemm_kernel.h | 2
-rw-r--r--  Eigen/src/SparseLU/SparseLU_panel_bmod.h | 2
-rw-r--r--  Eigen/src/SparseQR/SparseQR.h | 29
-rw-r--r--  Eigen/src/StlSupport/StdDeque.h | 4
-rw-r--r--  Eigen/src/StlSupport/StdList.h | 4
-rw-r--r--  Eigen/src/StlSupport/StdVector.h | 4
-rw-r--r--  Eigen/src/SuperLUSupport/SuperLUSupport.h | 6
-rw-r--r--  Eigen/src/UmfPackSupport/UmfPackSupport.h | 208
-rwxr-xr-x  Eigen/src/misc/lapacke.h | 9
-rw-r--r--  Eigen/src/plugins/ArrayCwiseUnaryOps.h | 10
-rw-r--r--  Eigen/src/plugins/BlockMethods.h | 47
-rw-r--r--  Eigen/src/plugins/IndexedViewMethods.h | 25
-rw-r--r--  Eigen/src/plugins/ReshapedMethods.h | 4
217 files changed, 9761 insertions, 3122 deletions
diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index 369d1f5ec..1332b540d 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -9,6 +9,7 @@
#define EIGEN_CHOLESKY_MODULE_H
#include "Core"
+#include "Jacobi"
#include "src/Core/util/DisableStupidWarnings.h"
@@ -31,7 +32,11 @@
#include "src/Cholesky/LLT.h"
#include "src/Cholesky/LDLT.h"
#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/Cholesky/LLT_LAPACKE.h"
#endif
diff --git a/Eigen/Core b/Eigen/Core
index 884546a2b..7347a2480 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,61 +14,26 @@
// first thing Eigen does: stop the compiler from committing suicide
#include "src/Core/util/DisableStupidWarnings.h"
-// Handle NVCC/CUDA/SYCL
-#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
- // Do not try asserts on CUDA and SYCL!
- #ifndef EIGEN_NO_DEBUG
- #define EIGEN_NO_DEBUG
- #endif
-
- #ifdef EIGEN_INTERNAL_DEBUGGING
- #undef EIGEN_INTERNAL_DEBUGGING
- #endif
-
- #ifdef EIGEN_EXCEPTIONS
- #undef EIGEN_EXCEPTIONS
- #endif
-
- // All functions callable from CUDA code must be qualified with __device__
- #ifdef __CUDACC__
- // Do not try to vectorize on CUDA and SYCL!
- #ifndef EIGEN_DONT_VECTORIZE
- #define EIGEN_DONT_VECTORIZE
- #endif
-
- #define EIGEN_DEVICE_FUNC __host__ __device__
- // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
- // works properly on the device side
- #include <math_functions.hpp>
- #else
- #define EIGEN_DEVICE_FUNC
- #endif
-#else
- #define EIGEN_DEVICE_FUNC
-#endif
+// then include this file where all our macros are defined. It's really important to do it first because
+// it's where we do all the compiler/OS/arch detections and define most defaults.
+#include "src/Core/util/Macros.h"
-// When compiling CUDA device code with NVCC, pull in math functions from the
-// global namespace. In host mode, and when device doee with clang, use the
-// std versions.
-#if defined(__CUDA_ARCH__) && defined(__NVCC__)
- #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
-#else
- #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
-#endif
+// This detects SSE/AVX/NEON/etc. and configures alignment settings
+#include "src/Core/util/ConfigureVectorization.h"
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
- #define EIGEN_EXCEPTIONS
+// We need cuda_runtime.h/hip_runtime.h to ensure that
+// the EIGEN_USING_STD_MATH macro works properly on the device side
+#if defined(EIGEN_CUDACC)
+ #include <cuda_runtime.h>
+#elif defined(EIGEN_HIPCC)
+ #include <hip/hip_runtime.h>
#endif
+
#ifdef EIGEN_EXCEPTIONS
#include <new>
#endif
-// then include this file where all our macros are defined. It's really important to do it first because
-// it's where we do all the alignment settings (platform detection and honoring the user's will if he
-// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
-#include "src/Core/util/Macros.h"
-
// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
@@ -81,169 +46,9 @@
// and inclusion of their respective header files
#include "src/Core/util/MKL_support.h"
-// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
-// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
-#if EIGEN_MAX_ALIGN_BYTES==0
- #ifndef EIGEN_DONT_VECTORIZE
- #define EIGEN_DONT_VECTORIZE
- #endif
-#endif
-
-#if EIGEN_COMP_MSVC
- #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
- #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
- // Remember that usage of defined() in a #define is undefined by the standard.
- // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
- #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
- #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
- #endif
- #endif
-#else
- // Remember that usage of defined() in a #define is undefined by the standard
- #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
- #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
- #endif
-#endif
-
-#ifndef EIGEN_DONT_VECTORIZE
-
- #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
-
- // Defines symbols for compile-time detection of which instructions are
- // used.
- // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_SSE
- #define EIGEN_VECTORIZE_SSE2
-
- // Detect sse3/ssse3/sse4:
- // gcc and icc defines __SSE3__, ...
- // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
- // want to force the use of those instructions with msvc.
- #ifdef __SSE3__
- #define EIGEN_VECTORIZE_SSE3
- #endif
- #ifdef __SSSE3__
- #define EIGEN_VECTORIZE_SSSE3
- #endif
- #ifdef __SSE4_1__
- #define EIGEN_VECTORIZE_SSE4_1
- #endif
- #ifdef __SSE4_2__
- #define EIGEN_VECTORIZE_SSE4_2
- #endif
- #ifdef __AVX__
- #define EIGEN_VECTORIZE_AVX
- #define EIGEN_VECTORIZE_SSE3
- #define EIGEN_VECTORIZE_SSSE3
- #define EIGEN_VECTORIZE_SSE4_1
- #define EIGEN_VECTORIZE_SSE4_2
- #endif
- #ifdef __AVX2__
- #define EIGEN_VECTORIZE_AVX2
- #define EIGEN_VECTORIZE_AVX
- #define EIGEN_VECTORIZE_SSE3
- #define EIGEN_VECTORIZE_SSSE3
- #define EIGEN_VECTORIZE_SSE4_1
- #define EIGEN_VECTORIZE_SSE4_2
- #endif
- #ifdef __FMA__
- #define EIGEN_VECTORIZE_FMA
- #endif
- #if defined(__AVX512F__)
- #define EIGEN_VECTORIZE_AVX512
- #define EIGEN_VECTORIZE_AVX2
- #define EIGEN_VECTORIZE_AVX
- #define EIGEN_VECTORIZE_FMA
- #define EIGEN_VECTORIZE_SSE3
- #define EIGEN_VECTORIZE_SSSE3
- #define EIGEN_VECTORIZE_SSE4_1
- #define EIGEN_VECTORIZE_SSE4_2
- #ifdef __AVX512DQ__
- #define EIGEN_VECTORIZE_AVX512DQ
- #endif
- #endif
-
- // include files
-
- // This extern "C" works around a MINGW-w64 compilation issue
- // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
- // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
- // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
- // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
- // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
- // notice that since these are C headers, the extern "C" is theoretically needed anyways.
- extern "C" {
- // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
- // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
- #if EIGEN_COMP_ICC >= 1110
- #include <immintrin.h>
- #else
- #include <mmintrin.h>
- #include <emmintrin.h>
- #include <xmmintrin.h>
- #ifdef EIGEN_VECTORIZE_SSE3
- #include <pmmintrin.h>
- #endif
- #ifdef EIGEN_VECTORIZE_SSSE3
- #include <tmmintrin.h>
- #endif
- #ifdef EIGEN_VECTORIZE_SSE4_1
- #include <smmintrin.h>
- #endif
- #ifdef EIGEN_VECTORIZE_SSE4_2
- #include <nmmintrin.h>
- #endif
- #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
- #include <immintrin.h>
- #endif
- #endif
- } // end extern "C"
- #elif defined __VSX__
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_VSX
- #include <altivec.h>
- // We need to #undef all these ugly tokens defined in <altivec.h>
- // => use __vector instead of vector
- #undef bool
- #undef vector
- #undef pixel
- #elif defined __ALTIVEC__
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_ALTIVEC
- #include <altivec.h>
- // We need to #undef all these ugly tokens defined in <altivec.h>
- // => use __vector instead of vector
- #undef bool
- #undef vector
- #undef pixel
- #elif (defined __ARM_NEON) || (defined __ARM_NEON__)
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_NEON
- #include <arm_neon.h>
- #elif (defined __s390x__ && defined __VEC__)
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_ZVECTOR
- #include <vecintrin.h>
- #endif
-#endif
-
-#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
- // We can use the optimized fp16 to float and float to fp16 conversion routines
- #define EIGEN_HAS_FP16_C
-#endif
-
-#if defined __CUDACC__
- #define EIGEN_VECTORIZE_CUDA
- #include <vector_types.h>
- #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
- #define EIGEN_HAS_CUDA_FP16
- #endif
-#endif
-#if defined EIGEN_HAS_CUDA_FP16
- #include <host_defines.h>
- #include <cuda_fp16.h>
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+ #define EIGEN_HAS_GPU_FP16
#endif
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@@ -275,6 +80,10 @@
// for min/max:
#include <algorithm>
+#if EIGEN_HAS_CXX11
+#include <array>
+#endif
+
// for std::is_nothrow_move_assignable
#ifdef EIGEN_INCLUDE_TYPE_TRAITS
#include <type_traits>
@@ -299,38 +108,6 @@
#include <SYCL/sycl.hpp>
#endif
-/** \brief Namespace containing all symbols from the %Eigen library. */
-namespace Eigen {
-
-inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_AVX512)
- return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_AVX)
- return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_2)
- return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_1)
- return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
-#elif defined(EIGEN_VECTORIZE_SSSE3)
- return "SSE, SSE2, SSE3, SSSE3";
-#elif defined(EIGEN_VECTORIZE_SSE3)
- return "SSE, SSE2, SSE3";
-#elif defined(EIGEN_VECTORIZE_SSE2)
- return "SSE, SSE2";
-#elif defined(EIGEN_VECTORIZE_ALTIVEC)
- return "AltiVec";
-#elif defined(EIGEN_VECTORIZE_VSX)
- return "VSX";
-#elif defined(EIGEN_VECTORIZE_NEON)
- return "ARM NEON";
-#elif defined(EIGEN_VECTORIZE_ZVECTOR)
- return "S390X ZVECTOR";
-#else
- return "None";
-#endif
-}
-
-} // end namespace Eigen
#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
// This will generate an error message:
@@ -339,7 +116,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
namespace Eigen {
-// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
// ensure QNX/QCC support
using std::size_t;
// gcc 4.6.0 wants std:: for ptrdiff_t
@@ -366,11 +143,11 @@ using std::ptrdiff_t;
#include "src/Core/util/IntegralConstant.h"
#include "src/Core/util/SymbolicIndex.h"
-
#include "src/Core/NumTraits.h"
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
#include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"
#if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/SSE/PacketMath.h"
@@ -388,6 +165,7 @@ using std::ptrdiff_t;
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX/TypeCasting.h"
+ #include "src/Core/arch/SSE/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
@@ -401,22 +179,33 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/PacketMath.h"
#include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h"
+ #include "src/Core/arch/NEON/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_ZVECTOR
#include "src/Core/arch/ZVector/PacketMath.h"
#include "src/Core/arch/ZVector/MathFunctions.h"
#include "src/Core/arch/ZVector/Complex.h"
+#elif defined EIGEN_VECTORIZE_MSA
+ #include "src/Core/arch/MSA/PacketMath.h"
+ #include "src/Core/arch/MSA/MathFunctions.h"
+ #include "src/Core/arch/MSA/Complex.h"
#endif
// Half float support
-#include "src/Core/arch/CUDA/Half.h"
-#include "src/Core/arch/CUDA/PacketMathHalf.h"
-#include "src/Core/arch/CUDA/TypeCasting.h"
+#include "src/Core/arch/GPU/Half.h"
+#include "src/Core/arch/GPU/PacketMathHalf.h"
+#include "src/Core/arch/GPU/TypeCasting.h"
-#if defined EIGEN_VECTORIZE_CUDA
- #include "src/Core/arch/CUDA/PacketMath.h"
- #include "src/Core/arch/CUDA/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_GPU
+ #include "src/Core/arch/GPU/PacketMath.h"
+ #include "src/Core/arch/GPU/MathFunctions.h"
#endif
+#if defined EIGEN_VECTORIZE_SYCL
+ #include "src/Core/arch/SYCL/InteropHeaders.h"
+ #include "src/Core/arch/SYCL/PacketMath.h"
+ #include "src/Core/arch/SYCL/MathFunctions.h"
+ #include "src/Core/arch/SYCL/TypeCasting.h"
+#endif
#include "src/Core/arch/Default/Settings.h"
#include "src/Core/functors/TernaryFunctors.h"
@@ -428,7 +217,9 @@ using std::ptrdiff_t;
// Specialized functors to enable the processing of complex numbers
// on CUDA devices
+#ifdef EIGEN_CUDACC
#include "src/Core/arch/CUDA/Complex.h"
+#endif
#include "src/Core/util/IndexedViewHelper.h"
#include "src/Core/util/ReshapedHelper.h"
diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index 009e529e1..7d6ac787b 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -10,14 +10,14 @@
#include "Core"
-#include "src/Core/util/DisableStupidWarnings.h"
-
#include "Cholesky"
#include "Jacobi"
#include "Householder"
#include "LU"
#include "Geometry"
+#include "src/Core/util/DisableStupidWarnings.h"
+
/** \defgroup Eigenvalues_Module Eigenvalues module
*
*
@@ -45,7 +45,11 @@
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
diff --git a/Eigen/Geometry b/Eigen/Geometry
index 131a4edfc..04aa316cb 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -10,12 +10,12 @@
#include "Core"
-#include "src/Core/util/DisableStupidWarnings.h"
-
#include "SVD"
#include "LU"
#include <limits>
+#include "src/Core/util/DisableStupidWarnings.h"
+
/** \defgroup Geometry_Module Geometry module
*
* This module provides support for:
diff --git a/Eigen/KLUSupport b/Eigen/KLUSupport
new file mode 100644
index 000000000..b23d90535
--- /dev/null
+++ b/Eigen/KLUSupport
@@ -0,0 +1,41 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_MODULE_H
+#define EIGEN_KLUSUPPORT_MODULE_H
+
+#include <Eigen/SparseCore>
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+extern "C" {
+#include <btf.h>
+#include <klu.h>
+ }
+
+/** \ingroup Support_modules
+ * \defgroup KLUSupport_Module KLUSupport module
+ *
+ * This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
+ * It provides the following factorization class:
+ * - class KLU: a sparse LU factorization, well-suited for circuit simulation.
+ *
+ * \code
+ * #include <Eigen/KLUSupport>
+ * \endcode
+ *
+ * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
+ * The dependencies depend on how KLU has been compiled.
+ * For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
+ *
+ */
+
+#include "src/KLUSupport/KLUSupport.h"
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_KLUSUPPORT_MODULE_H
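For context, a minimal usage sketch of the new KLU wrapper through Eigen's standard sparse-solver interface (a sketch only, assuming KLU/BTF are installed and linked; the matrix A and right-hand side b are hypothetical):

    #include <Eigen/Sparse>
    #include <Eigen/KLUSupport>

    // Solve A x = b with KLU's sparse LU factorization.
    Eigen::VectorXd solve_with_klu(const Eigen::SparseMatrix<double> &A,
                                   const Eigen::VectorXd &b)
    {
      Eigen::KLU<Eigen::SparseMatrix<double> > solver;
      solver.compute(A);                   // analyzePattern() followed by factorize()
      if (solver.info() != Eigen::Success)
        return Eigen::VectorXd();          // factorization failed
      return solver.solve(b);              // well suited to circuit-simulation matrices
    }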
diff --git a/Eigen/LU b/Eigen/LU
index 6f6c55629..6418a86e1 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -28,7 +28,11 @@
#include "src/LU/FullPivLU.h"
#include "src/LU/PartialPivLU.h"
#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/LU/PartialPivLU_LAPACKE.h"
#endif
#include "src/LU/Determinant.h"
diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport
index de3a63b4d..234619acc 100644
--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport
@@ -36,6 +36,7 @@ extern "C" {
* \endcode
*
* In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
+ * This wrapper requires PaStiX version 5.x compiled without MPI support.
* The dependencies depend on how PaSTiX has been compiled.
* For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
*
diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
index 340edf51f..340edf51f 100755..100644
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport
diff --git a/Eigen/QR b/Eigen/QR
index 80838e3bd..1be1863a1 100644
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -10,12 +10,12 @@
#include "Core"
-#include "src/Core/util/DisableStupidWarnings.h"
-
#include "Cholesky"
#include "Jacobi"
#include "Householder"
+#include "src/Core/util/DisableStupidWarnings.h"
+
/** \defgroup QR_Module QR module
*
*
@@ -36,7 +36,11 @@
#include "src/QR/ColPivHouseholderQR.h"
#include "src/QR/CompleteOrthogonalDecomposition.h"
#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/QR/HouseholderQR_LAPACKE.h"
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
#endif
diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc
index c6571f129..4f07df02a 100644
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -27,7 +27,7 @@ void qFree(void *ptr)
void *qRealloc(void *ptr, std::size_t size)
{
void* newPtr = Eigen::internal::aligned_malloc(size);
- memcpy(newPtr, ptr, size);
+ std::memcpy(newPtr, ptr, size);
Eigen::internal::aligned_free(ptr);
return newPtr;
}
diff --git a/Eigen/SVD b/Eigen/SVD
index 86143c23d..5d0e75f7f 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -37,7 +37,11 @@
#include "src/SVD/JacobiSVD.h"
#include "src/SVD/BDCSVD.h"
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
#include "src/misc/lapacke.h"
+#endif
#include "src/SVD/JacobiSVD_LAPACKE.h"
#endif
diff --git a/Eigen/SparseLU b/Eigen/SparseLU
index 38b38b531..37c4a5c5a 100644
--- a/Eigen/SparseLU
+++ b/Eigen/SparseLU
@@ -23,6 +23,8 @@
// Ordering interface
#include "OrderingMethods"
+#include "src/Core/util/DisableStupidWarnings.h"
+
#include "src/SparseLU/SparseLU_gemm_kernel.h"
#include "src/SparseLU/SparseLU_Structs.h"
@@ -43,4 +45,6 @@
#include "src/SparseLU/SparseLU_Utils.h"
#include "src/SparseLU/SparseLU.h"
+#include "src/Core/util/ReenableStupidWarnings.h"
+
#endif // EIGEN_SPARSELU_MODULE_H
diff --git a/Eigen/SparseQR b/Eigen/SparseQR
index a6f3b7f7d..f5fc5fa7f 100644
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@@ -28,7 +28,6 @@
*
*/
-#include "OrderingMethods"
#include "src/SparseCore/SparseColEtree.h"
#include "src/SparseQR/SparseQR.h"
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index fcee7b2e3..2dfeac333 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -247,8 +247,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
- * \c NumericalIssue if the matrix.appears to be negative.
+ * \returns \c Success if computation was successful,
+ * \c NumericalIssue if the factorization failed because of a zero pivot.
*/
ComputationInfo info() const
{
@@ -258,7 +258,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
@@ -376,6 +375,8 @@ template<> struct ldlt_inplace<Lower>
if((rs>0) && pivot_is_valid)
A21 /= realAkk;
+ else if(rs>0)
+ ret = ret && (A21.array()==Scalar(0)).all();
if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
else if(!pivot_is_valid) found_zero_pivot = true;
@@ -568,13 +569,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
// more precisely, use pseudo-inverse of D (see bug 241)
using std::abs;
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
- // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
- // as motivated by LAPACK's xGELSS:
+ // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+ // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
// RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
// However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
// diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
- RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+ // Using numeric_limits::min() gives us more robustness to denormals.
+ RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
for (Index i = 0; i < vecD.size(); ++i)
{
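A small sketch of the behavior documented above, under the assumption of a hypothetical positive semidefinite matrix: a zero pivot whose trailing column is also zero is still reported as Success, and solve() applies the pseudo-inverse of D with the numeric_limits::min() tolerance:

    #include <Eigen/Dense>

    void ldlt_semidefinite_demo()
    {
      Eigen::Matrix2d A;
      A << 1, 0,
           0, 0;                                  // positive semidefinite, rank deficient
      Eigen::LDLT<Eigen::Matrix2d> ldlt(A);
      bool ok = (ldlt.info() == Eigen::Success);  // accepted: the zero pivot has a zero column below it
      Eigen::Vector2d x = ldlt.solve(Eigen::Vector2d(2, 0));  // pseudo-inverse of D gives x = (2, 0)
      (void)ok; (void)x;
    }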
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 87ca8d423..868766365 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
*
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
* \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
- * The other triangular part won't be read.
+ * The other triangular part won't be read.
*
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
* matrix A such that A = LL^* = U^*U, where L is lower triangular.
@@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out
*
+ * \b Performance: for best performance, it is recommended to use a column-major storage format
+ * with the Lower triangular part (the default), or, equivalently, a row-major storage format
+ * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
+ * step, and rank-updates can be up to 3 times slower.
+ *
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
*
+ * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
+ * Therefore, the strict lower part does not have to store correct values.
+ *
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/
- /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
- * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
- * the strict lower part does not have to store correct values.
- */
template<typename _MatrixType, int _UpLo> class LLT
{
public:
@@ -96,7 +100,7 @@ template<typename _MatrixType, int _UpLo> class LLT
compute(matrix.derived());
}
- /** \brief Constructs a LDLT factorization from a given matrix
+ /** \brief Constructs a LLT factorization from a given matrix
*
* This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
* \c MatrixType is a Eigen::Ref.
@@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
}
template<typename Derived>
- void solveInPlace(MatrixBase<Derived> &bAndX) const;
+ void solveInPlace(const MatrixBase<Derived> &bAndX) const;
template<typename InputType>
LLT& compute(const EigenBase<InputType>& matrix);
@@ -176,8 +180,8 @@ template<typename _MatrixType, int _UpLo> class LLT
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
- * \c NumericalIssue if the matrix.appears to be negative.
+ * \returns \c Success if computation was successful,
+ * \c NumericalIssue if the matrix appears not to be positive definite.
*/
ComputationInfo info() const
{
@@ -196,11 +200,10 @@ template<typename _MatrixType, int _UpLo> class LLT
inline Index cols() const { return m_matrix.cols(); }
template<typename VectorType>
- LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+ LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
@@ -425,7 +428,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix.resize(size, size);
- m_matrix = a.derived();
+ if (!internal::is_same_dense(m_matrix, a.derived()))
+ m_matrix = a.derived();
// Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0);
@@ -454,7 +458,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
*/
template<typename _MatrixType, int _UpLo>
template<typename VectorType>
-LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
+LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
eigen_assert(v.size()==m_matrix.cols());
@@ -485,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
*
* This version avoids a copy when the right hand side matrix b is not needed anymore.
*
+ * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+ * This function will const_cast it, so constness isn't honored here.
+ *
* \sa LLT::solve(), MatrixBase::llt()
*/
template<typename MatrixType, int _UpLo>
template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(m_matrix.rows()==bAndX.rows());
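A brief sketch of the two API changes above, with hypothetical data: rankUpdate() now returns a reference so updates can be chained, and solveInPlace() takes a const reference so block expressions and other temporaries can be passed:

    #include <Eigen/Dense>

    void llt_demo()
    {
      Eigen::MatrixXd M = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd A = M * M.transpose() + 4.0 * Eigen::MatrixXd::Identity(4, 4);  // positive definite

      Eigen::LLT<Eigen::MatrixXd> llt(A);
      Eigen::VectorXd v = Eigen::VectorXd::Random(4);
      llt.rankUpdate(v, 1.0).rankUpdate(v, 0.5);  // chaining enabled by the LLT& return type

      Eigen::MatrixXd B = Eigen::MatrixXd::Random(4, 2);
      llt.solveInPlace(B.leftCols(1));            // a block expression now binds to the const parameter
    }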
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index 61faf43ba..adaf52858 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_CHOLMODSUPPORT_H
#define EIGEN_CHOLMODSUPPORT_H
-namespace Eigen {
+namespace Eigen {
namespace internal {
@@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
res.dtype = 0;
res.stype = -1;
-
+
if (internal::is_same<_StorageIndex,int>::value)
{
res.itype = CHOLMOD_INT;
}
- else if (internal::is_same<_StorageIndex,long>::value)
+ else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
{
res.itype = CHOLMOD_LONG;
}
@@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
// setup res.xtype
internal::cholmod_configure_matrix<_Scalar>::run(res);
-
+
res.stype = 0;
-
+
return res;
}
@@ -121,7 +121,7 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
{
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
-
+
if(UpLo==Upper) res.stype = 1;
if(UpLo==Lower) res.stype = -1;
// swap stype for rowmajor matrices (only works for real matrices)
@@ -167,12 +167,12 @@ namespace internal {
// template specializations for int and long that call the correct cholmod method
#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
- template<typename _StorageIndex> ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
- template<> ret cm_ ## name<long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
+ template<typename _StorageIndex> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
+ template<> inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
- template<typename _StorageIndex> ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
- template<> ret cm_ ## name<long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
+ template<typename _StorageIndex> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
+ template<> inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
EIGEN_CHOLMOD_SPECIALIZE0(int, start)
EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
@@ -183,16 +183,16 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
-template<typename _StorageIndex> cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
-template<> cholmod_dense* cm_solve<long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
+template<typename _StorageIndex> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
+template<> inline cholmod_dense* cm_solve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
-template<typename _StorageIndex> cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
-template<> cholmod_sparse* cm_spsolve<long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
+template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
+template<> inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
template<typename _StorageIndex>
-int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
+inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
template<>
-int cm_factorize_p<long> (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
+inline int cm_factorize_p<SuiteSparse_long> (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
#undef EIGEN_CHOLMOD_SPECIALIZE0
#undef EIGEN_CHOLMOD_SPECIALIZE1
@@ -254,10 +254,10 @@ class CholmodBase : public SparseSolverBase<Derived>
internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
internal::cm_finish<StorageIndex>(m_cholmod);
}
-
+
inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
-
+
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful,
@@ -276,11 +276,11 @@ class CholmodBase : public SparseSolverBase<Derived>
factorize(matrix);
return derived();
}
-
+
/** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
*
* This function is particularly useful when solving for several problems having the same structure.
- *
+ *
* \sa factorize()
*/
void analyzePattern(const MatrixType& matrix)
@@ -292,13 +292,13 @@ class CholmodBase : public SparseSolverBase<Derived>
}
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
-
+
this->m_isInitialized = true;
this->m_info = Success;
m_analysisIsOk = true;
m_factorizationIsOk = false;
}
-
+
/** Performs a numeric decomposition of \a matrix
*
* The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
@@ -315,11 +315,11 @@ class CholmodBase : public SparseSolverBase<Derived>
this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
m_factorizationIsOk = true;
}
-
+
/** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
* See the Cholmod user guide for details. */
cholmod_common& cholmod() { return m_cholmod; }
-
+
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal */
template<typename Rhs,typename Dest>
@@ -329,7 +329,7 @@ class CholmodBase : public SparseSolverBase<Derived>
const Index size = m_cholmodFactor->n;
EIGEN_UNUSED_VARIABLE(size);
eigen_assert(size==b.rows());
-
+
// Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
@@ -345,7 +345,7 @@ class CholmodBase : public SparseSolverBase<Derived>
dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
}
-
+
/** \internal */
template<typename RhsDerived, typename DestDerived>
void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
@@ -370,8 +370,8 @@ class CholmodBase : public SparseSolverBase<Derived>
internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
}
#endif // EIGEN_PARSED_BY_DOXYGEN
-
-
+
+
/** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
*
* During the numerical factorization, an offset term is added to the diagonal coefficients:\n
@@ -386,7 +386,7 @@ class CholmodBase : public SparseSolverBase<Derived>
m_shiftOffset[0] = double(offset);
return derived();
}
-
+
/** \returns the determinant of the underlying matrix from the current factorization */
Scalar determinant() const
{
@@ -441,7 +441,7 @@ class CholmodBase : public SparseSolverBase<Derived>
template<typename Stream>
void dumpMemory(Stream& /*s*/)
{}
-
+
protected:
mutable cholmod_common m_cholmod;
cholmod_factor* m_cholmodFactor;
@@ -478,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
using Base::m_cholmod;
-
+
public:
-
+
typedef _MatrixType MatrixType;
-
+
CholmodSimplicialLLT() : Base() { init(); }
CholmodSimplicialLLT(const MatrixType& matrix) : Base()
@@ -529,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
using Base::m_cholmod;
-
+
public:
-
+
typedef _MatrixType MatrixType;
-
+
CholmodSimplicialLDLT() : Base() { init(); }
CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
@@ -578,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
using Base::m_cholmod;
-
+
public:
-
+
typedef _MatrixType MatrixType;
-
+
CholmodSupernodalLLT() : Base() { init(); }
CholmodSupernodalLLT(const MatrixType& matrix) : Base()
@@ -629,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
using Base::m_cholmod;
-
+
public:
-
+
typedef _MatrixType MatrixType;
-
+
CholmodDecomposition() : Base() { init(); }
CholmodDecomposition(const MatrixType& matrix) : Base()
@@ -643,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
}
~CholmodDecomposition() {}
-
+
void setMode(CholmodMode mode)
{
switch(mode)
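For reference, a minimal sketch of the solver classes touched above (assuming CHOLMOD is installed and linked; the matrix data is hypothetical). Whether StorageIndex is int or SuiteSparse_long, the specializations above dispatch to the matching cholmod_* or cholmod_l_* entry points:

    #include <Eigen/Sparse>
    #include <Eigen/CholmodSupport>

    // Solve a symmetric positive definite sparse system with CHOLMOD.
    Eigen::VectorXd solve_spd(const Eigen::SparseMatrix<double> &A,
                              const Eigen::VectorXd &b)
    {
      Eigen::CholmodSupernodalLLT<Eigen::SparseMatrix<double>, Eigen::Lower> solver;
      solver.analyzePattern(A);  // symbolic step, reusable across matrices with the same sparsity pattern
      solver.factorize(A);       // numeric factorization
      if (solver.info() != Eigen::Success)
        return Eigen::VectorXd();
      return solver.solve(b);
    }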
diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h
index ada1571f1..db6da0001 100644
--- a/Eigen/src/Core/ArithmeticSequence.h
+++ b/Eigen/src/Core/ArithmeticSequence.h
@@ -29,17 +29,17 @@ template<int N> struct aseq_negate<FixedInt<N> > {
template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
template<typename FirstType,typename SizeType,typename IncrType,
- bool FirstIsSymbolic=Symbolic::is_symbolic<FirstType>::value,
- bool SizeIsSymbolic =Symbolic::is_symbolic<SizeType>::value>
+ bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
+ bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
struct aseq_reverse_first_type {
typedef Index type;
};
template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
- typedef Symbolic::AddExpr<FirstType,
- Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >,
- Symbolic::ValueExpr<IncrType> >
+ typedef symbolic::AddExpr<FirstType,
+ symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
+ symbolic::ValueExpr<IncrType> >
> type;
};
@@ -56,14 +56,14 @@ struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_i
template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
- typedef Symbolic::AddExpr<FirstType,Symbolic::ValueExpr<Aux> > type;
+ typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
};
template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
- typedef Symbolic::AddExpr<Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >,
- Symbolic::ValueExpr<IncrType> >,
- Symbolic::ValueExpr<> > type;
+ typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
+ symbolic::ValueExpr<IncrType> >,
+ symbolic::ValueExpr<> > type;
};
#endif
@@ -225,10 +225,11 @@ auto seq(FirstType f, LastType l, IncrType incr)
-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
CleanedIncrType(incr));
}
-#else
+
+#else // EIGEN_HAS_CXX11
template<typename FirstType,typename LastType>
-typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
seq(FirstType f, LastType l)
{
@@ -237,35 +238,35 @@ seq(FirstType f, LastType l)
}
template<typename FirstTypeDerived,typename LastType>
-typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value,
- ArithmeticSequence<FirstTypeDerived, Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,Symbolic::ValueExpr<> >,
- Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
-seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
+ ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
+ symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
{
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
}
template<typename FirstType,typename LastTypeDerived>
-typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value,
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
- Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >,
- Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
-seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l)
+ symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
+ symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
{
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
}
template<typename FirstTypeDerived,typename LastTypeDerived>
ArithmeticSequence<FirstTypeDerived,
- Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::NegateExpr<FirstTypeDerived> >,Symbolic::ValueExpr<internal::FixedInt<1> > > >
-seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l)
+ symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
{
return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
}
template<typename FirstType,typename LastType, typename IncrType>
-typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
seq(FirstType f, LastType l, IncrType incr)
{
@@ -275,27 +276,27 @@ seq(FirstType f, LastType l, IncrType incr)
}
template<typename FirstTypeDerived,typename LastType, typename IncrType>
-typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value,
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
ArithmeticSequence<FirstTypeDerived,
- Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,
- Symbolic::ValueExpr<> >,
- Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
- Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+ symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
+ symbolic::ValueExpr<> >,
+ symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+ symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type> >::type
-seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
{
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
}
template<typename FirstType,typename LastTypeDerived, typename IncrType>
-typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value,
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
- Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >,
- Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
- Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+ symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
+ symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+ symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type> >::type
-seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
{
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
@@ -304,26 +305,55 @@ seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
ArithmeticSequence<FirstTypeDerived,
- Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,
- Symbolic::NegateExpr<FirstTypeDerived> >,
- Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
- Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+ symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
+ symbolic::NegateExpr<FirstTypeDerived> >,
+ symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+ symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type>
-seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
{
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
}
-#endif
+#endif // EIGEN_HAS_CXX11
#endif // EIGEN_PARSED_BY_DOXYGEN
+
+#if EIGEN_HAS_CXX11
+/** \cpp11
+ * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
+ *
+ * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
+ *
+ * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename SizeType,typename IncrType>
+auto lastN(SizeType size, IncrType incr)
+-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
+{
+ return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
+}
+
+/** \cpp11
+ * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
+ *
+ * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
+ *
+ * \sa lastN(SizeType,IncrType), seqN(FirstType,SizeType), seq(FirstType,LastType) */
+template<typename SizeType>
+auto lastN(SizeType size)
+-> decltype(seqN(Eigen::last+fix<1>()-size, size))
+{
+ return seqN(Eigen::last+fix<1>()-size, size);
+}
+#endif
+
namespace internal {
// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
template<typename T>
struct make_size_type {
- typedef typename internal::conditional<Symbolic::is_symbolic<T>::value, Index, T>::type type;
+ typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
};
template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
@@ -345,6 +375,39 @@ struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
} // end namespace internal
+/** \namespace Eigen::indexing
+ * \ingroup Core_Module
+ *
+ * The sole purpose of this namespace is to be able to import all functions
+ * and symbols that are expected to be used within operator() for indexing
+ * and slicing. If you already imported the whole Eigen namespace:
+ * \code using namespace Eigen; \endcode
+ * then you are already all set. Otherwise, if you do not want to or cannot import
+ * the whole Eigen namespace, the following line:
+ * \code using namespace Eigen::indexing; \endcode
+ * is equivalent to:
+ * \code
+ using Eigen::all;
+ using Eigen::seq;
+ using Eigen::seqN;
+ using Eigen::lastN; // c++11 only
+ using Eigen::last;
+ using Eigen::lastp1;
+ using Eigen::fix;
+ \endcode
+ */
+namespace indexing {
+ using Eigen::all;
+ using Eigen::seq;
+ using Eigen::seqN;
+ #if EIGEN_HAS_CXX11
+ using Eigen::lastN;
+ #endif
+ using Eigen::last;
+ using Eigen::lastp1;
+ using Eigen::fix;
+}
+
} // end namespace Eigen
#endif // EIGEN_ARITHMETIC_SEQUENCE_H
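A minimal usage sketch of the seq/seqN/lastN helpers and the Eigen::indexing namespace introduced above (assuming the matching operator() slicing support from the same development branch; lastN requires C++11):

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      using namespace Eigen::indexing;               // seq, seqN, lastN, last, fix, ...

      Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10, 0.0, 9.0);

      std::cout << v(seq(2, 6)).transpose()     << "\n";  // elements 2..6
      std::cout << v(seqN(1, 3, 2)).transpose() << "\n";  // 3 elements starting at 1, stride 2
      std::cout << v(lastN(4)).transpose()      << "\n";  // the last 4 elements
    }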
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 0d34269fd..e10020d4f 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -231,10 +231,16 @@ class Array
: Base(other)
{ }
+ private:
+ struct PrivateType {};
+ public:
+
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
+ EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
+ typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
+ PrivateType>::type = PrivateType())
: Base(other.derived())
{ }
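The guarded constructor above relies on a private tag type plus an enable_if'd defaulted argument, so the overload disappears from overload resolution when the scalar types are not convertible. A standalone sketch of that SFINAE pattern (names are illustrative, not Eigen's):

    #include <type_traits>

    template<typename Scalar>
    struct MyArray {
     private:
      struct PrivateType {};
     public:
      template<typename OtherScalar>
      MyArray(const OtherScalar&,
              typename std::enable_if<std::is_convertible<OtherScalar, Scalar>::value,
                                      PrivateType>::type = PrivateType()) {}
    };

    int main() {
      MyArray<double> a(3.0f);        // OK: float is convertible to double
      // MyArray<double> b("text");   // would not compile: constructor is SFINAE'd out
    }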
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index af5fb2566..9da960f08 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
*/
template<typename Derived>
template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
*/
template<typename Derived>
template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
*/
template<typename Derived>
template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
*/
template<typename Derived>
template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index a04521a16..757b31825 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
// Let's remove NestByRefBit
enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
- Flags = Flags0 & ~NestByRefBit
+ LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+ Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
};
};
}
@@ -89,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline void evalTo(Dest& dst) const { dst = m_expression; }
- const typename internal::remove_all<NestedExpressionType>::type&
EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<NestedExpressionType>::type&
nestedExpression() const
{
return m_expression;
@@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
// Let's remove NestByRefBit
enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
- Flags = Flags0 & ~NestByRefBit
+ LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+ Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
};
};
}
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 53806ba33..655412efd 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -16,7 +16,7 @@ namespace Eigen {
template<typename Derived>
template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
::lazyAssign(const DenseBase<OtherDerived>& other)
{
enum{
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index b0ec7b7ca..362d905d2 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -39,7 +39,7 @@ public:
enum {
DstAlignment = DstEvaluator::Alignment,
SrcAlignment = SrcEvaluator::Alignment,
- DstHasDirectAccess = DstFlags & DirectAccessBit,
+ DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
};
@@ -83,7 +83,7 @@ private:
&& int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
&& (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
- MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
+ MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
&& (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */
@@ -97,7 +97,7 @@ private:
public:
enum {
- Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
+ Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
@@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
// AssignmentKind must define a Kind typedef.
template<typename DstShape, typename SrcShape> struct AssignmentKind;
-// Assignement kind defined in this file:
+// Assignment kind defined in this file:
struct Dense2Dense {};
struct EigenBase2EigenBase {};
@@ -899,7 +899,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
src.evalTo(dst);
}
- // NOTE The following two functions are templated to avoid their instanciation if not needed
+ // NOTE The following two functions are templated to avoid their instantiation if not needed
// This is needed because some expressions support evalTo only and/or have 'void' as scalar type.
template<typename SrcScalarType>
EIGEN_DEVICE_FUNC
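One motivation for turning DstHasDirectAccess into an explicit boolean above: masking a flag field with a single bit yields the bit's numeric value, not 0 or 1, which is error prone once the result is combined with other boolean conditions. A tiny standalone illustration (generic C++, not Eigen's actual flag machinery):

    #include <iostream>

    enum { DirectAccessBit = 0x40 };

    int main() {
      unsigned flags = 0x83 | DirectAccessBit;
      std::cout << (flags & DirectAccessBit) << "\n";                       // 64: truthy, but not a 0/1 value
      std::cout << ((flags & DirectAccessBit) == DirectAccessBit) << "\n";  // 1
    }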
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 6c2ab9264..6866095bf 100755
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -84,7 +84,8 @@ class vml_assign_traits
struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
- static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
+ static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
+ resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
@@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \
const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \
- static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
+ static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
+ resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \
diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h
index ed607d5d8..ccf519067 100644
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -76,7 +76,7 @@ struct any_unroller<Derived, Dynamic, Rows>
* \sa any(), Cwise::operator<()
*/
template<typename Derived>
-inline bool DenseBase<Derived>::all() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
{
typedef internal::evaluator<Derived> Evaluator;
enum {
@@ -100,7 +100,7 @@ inline bool DenseBase<Derived>::all() const
* \sa all()
*/
template<typename Derived>
-inline bool DenseBase<Derived>::any() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
{
typedef internal::evaluator<Derived> Evaluator;
enum {
@@ -124,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
* \sa all(), any()
*/
template<typename Derived>
-inline Eigen::Index DenseBase<Derived>::count() const
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
{
return derived().template cast<bool>().template cast<Index>().sum();
}
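A short usage sketch of the reductions whose definitions gain EIGEN_DEVICE_FUNC above:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::ArrayXd x(5);
      x << 1, 2, 3, 4, 5;
      std::cout << (x > 0).all()   << "\n";  // 1: every coefficient is positive
      std::cout << (x > 4).any()   << "\n";  // 1: at least one coefficient exceeds 4
      std::cout << (x > 2).count() << "\n";  // 3: number of coefficients above 2
    }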
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index d218e9814..35fdbb819 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -141,7 +141,7 @@ struct CommaInitializer
* \sa CommaInitializer::finished(), class CommaInitializer
*/
template<typename Derived>
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
{
return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
}
@@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
/** \sa operator<<(const Scalar&) */
template<typename Derived>
template<typename OtherDerived>
-inline CommaInitializer<Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
{
return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
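For reference, a small sketch of the comma initializer these operators build:

    #include <Eigen/Dense>

    int main() {
      Eigen::Matrix3d m;
      m << 1, 2, 3,
           4, 5, 6,
           7, 8, 9;                    // filled row by row
      Eigen::Vector3d v;
      v << m.col(0).head<2>(), 10.0;   // expressions and scalars can be mixed
    }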
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 412f5a661..264446f65 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -134,19 +134,21 @@ private:
// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
public:
- plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
+ EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
{
- EIGEN_ONLY_USED_FOR_DEBUG(outerStride);
+#ifndef EIGEN_INTERNAL_DEBUGGING
+ EIGEN_UNUSED_VARIABLE(outerStride);
+#endif
eigen_internal_assert(outerStride==OuterStride);
}
- Index outerStride() const { return OuterStride; }
+ EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; }
const Scalar *data;
};
template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
public:
- plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
- Index outerStride() const { return m_outerStride; }
+ EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
+ EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; }
const Scalar *data;
protected:
Index m_outerStride;
@@ -1034,7 +1036,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
? int(outer_stride_at_compile_time<ArgType>::ret)
: int(inner_stride_at_compile_time<ArgType>::ret),
- MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
+ MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@@ -1044,7 +1046,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
PacketAlignment = unpacket_traits<PacketScalar>::alignment,
- Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+ Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
+ && (OuterStrideAtCompileTime!=0)
+ && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
};
typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
@@ -1075,14 +1079,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
: m_argImpl(block.nestedExpression()),
m_startRow(block.startRow()),
- m_startCol(block.startCol())
+ m_startCol(block.startCol()),
+ m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
{ }
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
enum {
- RowsAtCompileTime = XprType::RowsAtCompileTime
+ RowsAtCompileTime = XprType::RowsAtCompileTime,
+ ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1094,7 +1100,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeff(Index index) const
{
- return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+ if (ForwardLinearAccess)
+ return m_argImpl.coeff(m_linear_offset.value() + index);
+ else
+ return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1106,7 +1115,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Scalar& coeffRef(Index index)
{
- return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+ if (ForwardLinearAccess)
+ return m_argImpl.coeffRef(m_linear_offset.value() + index);
+ else
+ return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
}
template<int LoadMode, typename PacketType>
@@ -1120,8 +1132,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
- return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
- RowsAtCompileTime == 1 ? index : 0);
+ if (ForwardLinearAccess)
+ return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
+ else
+ return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+ RowsAtCompileTime == 1 ? index : 0);
}
template<int StoreMode, typename PacketType>
@@ -1135,15 +1150,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
- return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
- RowsAtCompileTime == 1 ? index : 0,
- x);
+ if (ForwardLinearAccess)
+ return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
+ else
+ return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+ RowsAtCompileTime == 1 ? index : 0,
+ x);
}
protected:
evaluator<ArgType> m_argImpl;
const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
+ const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
};
// TODO: This evaluator does not actually use the child evaluator;
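The m_linear_offset member added above caches the position of the block's first coefficient in the parent expression's linear storage, so linear coeff/packet accesses can be forwarded directly. A standalone sketch of that offset computation (illustrative only):

    #include <cassert>

    long linear_offset(bool parent_is_row_major,
                       long start_row, long start_col,
                       long parent_rows, long parent_cols)
    {
      return parent_is_row_major ? start_row * parent_cols + start_col
                                 : start_col * parent_rows + start_row;
    }

    int main() {
      // 4x5 column-major parent, block starting at (1,2): offset = 2*4 + 1 = 9
      assert(linear_offset(false, 1, 2, 4, 5) == 9);
    }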
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index a36765e39..bf2632d9e 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -158,7 +158,7 @@ public:
*/
template<typename Derived>
template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
*/
template<typename Derived>
template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -181,4 +181,3 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
} // end namespace Eigen
#endif // EIGEN_CWISE_BINARY_OP_H
-
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index dd498f758..b1923da0f 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
*/
template<typename Derived>
template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
{
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -131,7 +131,7 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
*/
template<typename Derived>
template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
*/
template<typename Derived>
template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
{
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
* \sa class CwiseNullaryOp
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
{
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
* \sa class CwiseNullaryOp
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(Index size, const Scalar& value)
{
return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
* \sa class CwiseNullaryOp
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(const Scalar& value)
{
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
* \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
* \sa LinSpaced(Scalar,Scalar)
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
* \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -276,7 +276,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
* Special version for fixed size types which does not require the size parameter.
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
template<typename Derived>
-bool DenseBase<Derived>::isApproxToConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
(const Scalar& val, const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
@@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
*
* \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
template<typename Derived>
-bool DenseBase<Derived>::isConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
(const Scalar& val, const RealScalar& prec) const
{
return isApproxToConstant(val, prec);
@@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
* \sa setConstant(), Constant(), class CwiseNullaryOp
*/
template<typename Derived>
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
{
setConstant(val);
}
@@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
* \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
{
return derived() = Constant(rows(), cols(), val);
}
@@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
{
resize(size);
@@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
{
resize(rows, cols);
@@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
* \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
@@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
* \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return setLinSpaced(size(), low, high);
@@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
* \sa Zero(), Zero(Index)
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero(Index rows, Index cols)
{
return Constant(rows, cols, Scalar(0));
@@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
* \sa Zero(), Zero(Index,Index)
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero(Index size)
{
return Constant(size, Scalar(0));
@@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
* \sa Zero(Index), Zero(Index,Index)
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero()
{
return Constant(Scalar(0));
@@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
* \sa class CwiseNullaryOp, Zero()
*/
template<typename Derived>
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
@@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
* \sa class CwiseNullaryOp, Zero()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
{
return setConstant(Scalar(0));
}
@@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
* \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setZero(Index newSize)
{
resize(newSize);
@@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
* \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setZero(Index rows, Index cols)
{
resize(rows, cols);
@@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
* \sa Ones(), Ones(Index), isOnes(), class Ones
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones(Index rows, Index cols)
{
return Constant(rows, cols, Scalar(1));
@@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
* \sa Ones(), Ones(Index,Index), isOnes(), class Ones
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones(Index newSize)
{
return Constant(newSize, Scalar(1));
@@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
* \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones()
{
return Constant(Scalar(1));
@@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
* \sa class CwiseNullaryOp, Ones()
*/
template<typename Derived>
-bool DenseBase<Derived>::isOnes
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
(const RealScalar& prec) const
{
return isApproxToConstant(Scalar(1), prec);
@@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
* \sa class CwiseNullaryOp, Ones()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
{
return setConstant(Scalar(1));
}
@@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
* \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setOnes(Index newSize)
{
resize(newSize);
@@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
* \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
{
resize(rows, cols);
@@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
* \sa Identity(), setIdentity(), isIdentity()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity(Index rows, Index cols)
{
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
* \sa Identity(Index,Index), setIdentity(), isIdentity()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity()
{
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
* \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
{
return internal::setIdentity_impl<Derived>::run(derived());
}
@@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
{
derived().resize(rows, cols);
return setIdentity();
@@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
* \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(),i);
@@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
{ return Derived::Unit(0); }
/** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
{ return Derived::Unit(1); }
/** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
{ return Derived::Unit(2); }
/** \returns an expression of the W axis unit vector (0,0,0,1)
@@ -858,9 +858,45 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
{ return Derived::Unit(3); }
+/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
+ *
+ * \param i index of the unique coefficient to be set to 1
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+ */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
+{
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+ eigen_assert(i<size());
+ derived().setZero();
+ derived().coeffRef(i) = Scalar(1);
+ return derived();
+}
+
+/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
+ *
+ * \param newSize the new size of the vector
+ * \param i index of the unique coefficient to be set to 1
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+ */
+template<typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
+{
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+ eigen_assert(i<newSize);
+ derived().resize(newSize);
+ return setUnit(i);
+}
+
} // end namespace Eigen
#endif // EIGEN_CWISE_NULLARY_OP_H
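A short usage sketch of the two setUnit() overloads added above (assuming a build that includes this change):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXd v;
      v.setUnit(5, 2);                       // resize to 5, then set the 3rd basis vector
      std::cout << v.transpose() << "\n";    // 0 0 1 0 0

      Eigen::Vector4d w;
      w.setUnit(3);                          // fixed-size overload: only the index
      std::cout << w.transpose() << "\n";    // 0 0 0 1
    }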
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index cd84b07a5..0c0ea95f4 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -157,6 +157,11 @@ template<typename Derived> class DenseBase
* we are dealing with a column-vector (if there is only one column) or with
* a row-vector (if there is only one row). */
+ NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+ /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+ * and 2 for matrices.
+ */
+
Flags = internal::traits<Derived>::Flags,
/**< This stores expression \ref flags flags which may or may not be inherited by new expressions
* constructed from this one. See the \ref flags "list of flags".
@@ -296,7 +301,7 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& func);
- /** \ínternal
+ /** \internal
* Copies \a other into *this without evaluating other. \returns a reference to *this.
* \deprecated */
template<typename OtherDerived>
@@ -395,7 +400,7 @@ template<typename Derived> class DenseBase
* Notice that in the case of a plain matrix or vector (not an expression) this function just returns
* a const reference, in order to avoid a useless copy.
*
- * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
+ * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE EvalReturnType eval() const
@@ -484,9 +489,9 @@ template<typename Derived> class DenseBase
return derived().coeff(0,0);
}
- bool all() const;
- bool any() const;
- Index count() const;
+ EIGEN_DEVICE_FUNC bool all() const;
+ EIGEN_DEVICE_FUNC bool any() const;
+ EIGEN_DEVICE_FUNC Index count() const;
typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
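The new NumDimensions enum can be queried on any dense plain type; a compile-time sketch (assuming the enum added above is visible through derived types, as for the other DenseBase enums):

    #include <Eigen/Dense>

    static_assert(Eigen::Matrix<double, 1, 1>::NumDimensions == 0, "scalar-like");
    static_assert(Eigen::Vector3d::NumDimensions == 1, "vector");
    static_assert(Eigen::Matrix3d::NumDimensions == 2, "matrix");

    int main() {}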
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 7958feeb9..3c02a1025 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -61,7 +61,7 @@ struct plain_array
#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
#elif EIGEN_GNUC_AT_LEAST(4,7)
- // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
+ // GCC 4.7 is too aggressive in its optimizations and removes the alignment test based on the fact that the array is declared to be aligned.
// See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
// Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
template<typename PtrType>
@@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
}
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+ numext::swap(m_data, other.m_data);
+ }
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
- { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+ {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_rows,other.m_rows);
+ numext::swap(m_cols,other.m_cols);
+ }
EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+ {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_rows,other.m_rows);
+ }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_cols,other.m_cols);
+ }
EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
- void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
- void resize(Index, Index, Index cols) { m_cols = cols; }
+ EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+ EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
};
@@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
- using std::swap;
- swap(m_data, other.m_data);
- swap(m_rows, other.m_rows);
- swap(m_cols, other.m_cols);
+ numext::swap(m_data, other.m_data);
+ numext::swap(m_rows, other.m_rows);
+ numext::swap(m_cols, other.m_cols);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
- { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+ {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_rows,other.m_rows);
+ numext::swap(m_cols,other.m_cols);
+ }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
void conservativeResize(Index size, Index rows, Index cols)
@@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
- using std::swap;
- swap(m_data, other.m_data);
- swap(m_cols, other.m_cols);
+ numext::swap(m_data, other.m_data);
+ numext::swap(m_cols, other.m_cols);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_cols,other.m_cols);
+ }
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
- using std::swap;
- swap(m_data, other.m_data);
- swap(m_rows, other.m_rows);
+ numext::swap(m_data, other.m_data);
+ numext::swap(m_rows, other.m_rows);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_rows,other.m_rows);
+ }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
void conservativeResize(Index size, Index rows, Index)
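The swaps above switch from std::swap to numext::swap, presumably because std::swap is not usable from device code. A hypothetical standalone analogue of such a swap (device annotation omitted; this is not Eigen's actual implementation):

    template<typename T>
    inline void simple_swap(T& a, T& b) {
      T tmp = a;
      a = b;
      b = tmp;
    }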
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 49e711257..563135fb2 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
EIGEN_DEVICE_FUNC
- explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+ explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
+ {
+ eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
+ }
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
@@ -184,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
*
* \sa class Diagonal */
template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
MatrixBase<Derived>::diagonal()
{
return DiagonalReturnType(derived());
@@ -192,7 +195,7 @@ MatrixBase<Derived>::diagonal()
/** This is the const version of diagonal(). */
template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
MatrixBase<Derived>::diagonal() const
{
return ConstDiagonalReturnType(derived());
@@ -210,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
*
* \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
MatrixBase<Derived>::diagonal(Index index)
{
return DiagonalDynamicIndexReturnType(derived(), index);
@@ -218,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)
/** This is the const version of diagonal(Index). */
template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
MatrixBase<Derived>::diagonal(Index index) const
{
return ConstDiagonalDynamicIndexReturnType(derived(), index);
@@ -237,6 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
* \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived>
template<int Index_>
+EIGEN_DEVICE_FUNC
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
MatrixBase<Derived>::diagonal()
{
@@ -246,6 +250,7 @@ MatrixBase<Derived>::diagonal()
/** This is the const version of diagonal<int>(). */
template<typename Derived>
template<int Index_>
+EIGEN_DEVICE_FUNC
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
MatrixBase<Derived>::diagonal() const
{
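A brief usage sketch of the diagonal accessors that gain EIGEN_DEVICE_FUNC above:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix4d m = Eigen::Matrix4d::Random();
      std::cout << m.diagonal().transpose()    << "\n";  // main diagonal
      std::cout << m.diagonal(1).transpose()   << "\n";  // first super-diagonal (runtime index)
      std::cout << m.diagonal<1>().transpose() << "\n";  // same, with a compile-time index
    }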
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index ecfdce8ef..4e8297ee6 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
EIGEN_DEVICE_FUNC
DenseMatrixType toDenseMatrix() const { return derived(); }
-
+
EIGEN_DEVICE_FUNC
inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
EIGEN_DEVICE_FUNC
@@ -273,7 +273,7 @@ class DiagonalWrapper
* \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
**/
template<typename Derived>
-inline const DiagonalWrapper<const Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
MatrixBase<Derived>::asDiagonal() const
{
return DiagonalWrapper<const Derived>(derived());
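A one-line usage sketch of asDiagonal(), whose definition gains EIGEN_DEVICE_FUNC above:

    #include <Eigen/Dense>

    int main() {
      Eigen::Vector3d d(1.0, 2.0, 3.0);
      Eigen::Matrix3d m = Eigen::Matrix3d::Ones();
      Eigen::Matrix3d scaled = d.asDiagonal() * m;   // scales row i of m by d(i)
    }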
diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index d372b938f..7911d1cd1 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
*/
template<typename Derived>
template<typename DiagonalDerived>
-inline const Product<Derived, DiagonalDerived, LazyProduct>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
{
return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 06ef18b8b..11da432b2 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -31,7 +31,8 @@ struct dot_nocheck
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
- static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+ EIGEN_STRONG_INLINE
+ static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.template binaryExpr<conj_prod>(b).sum();
}
@@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
- static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+ EIGEN_STRONG_INLINE
+ static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.transpose().template binaryExpr<conj_prod>(b).sum();
}
@@ -65,6 +67,7 @@ struct dot_nocheck<T, U, true>
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE
typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
{
@@ -90,7 +93,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
* \sa dot(), norm(), lpNorm()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
{
return numext::real((*this).cwiseAbs2().sum());
}
@@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
* \sa lpNorm(), dot(), squaredNorm()
*/
template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
{
return numext::sqrt(squaredNorm());
}
@@ -117,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
* \sa norm(), normalize()
*/
template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::normalized() const
{
typedef typename internal::nested_eval<Derived,2>::type _Nested;
@@ -139,7 +142,7 @@ MatrixBase<Derived>::normalized() const
* \sa norm(), normalized()
*/
template<typename Derived>
-inline void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
{
RealScalar z = squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -160,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
* \sa stableNorm(), stableNormalize(), normalized()
*/
template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::stableNormalized() const
{
typedef typename internal::nested_eval<Derived,3>::type _Nested;
@@ -185,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
* \sa stableNorm(), stableNormalized(), normalize()
*/
template<typename Derived>
-inline void MatrixBase<Derived>::stableNormalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
{
RealScalar w = cwiseAbs().maxCoeff();
RealScalar z = (derived()/w).squaredNorm();
@@ -257,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
template<typename Derived>
template<int p>
#ifndef EIGEN_PARSED_BY_DOXYGEN
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
#else
-MatrixBase<Derived>::RealScalar
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
#endif
MatrixBase<Derived>::lpNorm() const
{
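The norm-related members above (squaredNorm, norm, normalized, normalize, stableNormalized, stableNormalize) are now callable in device code, and the stable variants rescale by the largest coefficient before squaring. A minimal host-side sketch of why that matters (values chosen only for illustration):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Vector2d v(1e300, 1e300);
  // squaredNorm() overflows to +inf here, so normalized() collapses to zeros,
  // while stableNormalized() first divides by the max coefficient.
  std::cout << v.normalized().transpose()       << "\n";  // unreliable for huge inputs
  std::cout << v.stableNormalized().transpose() << "\n";  // ~0.7071 0.7071
}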
diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h
index f76995af9..b195506a9 100644
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@@ -14,6 +14,7 @@
namespace Eigen {
/** \class EigenBase
+ * \ingroup Core_Module
*
* Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
*
@@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
*/
template<typename Derived>
template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived());
@@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
template<typename Derived>
template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
template<typename Derived>
template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
diff --git a/Eigen/src/Core/Fuzzy.h b/Eigen/src/Core/Fuzzy.h
index 3e403a09d..43aa49b2b 100644
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
*/
template<typename Derived>
template<typename OtherDerived>
-bool DenseBase<Derived>::isApprox(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
const DenseBase<OtherDerived>& other,
const RealScalar& prec
) const
@@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
* \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
*/
template<typename Derived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
const typename NumTraits<Scalar>::Real& other,
const RealScalar& prec
) const
@@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
*/
template<typename Derived>
template<typename OtherDerived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
const DenseBase<OtherDerived>& other,
const RealScalar& prec
) const
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 0f16cd8e3..43f3b84c8 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -18,18 +18,33 @@ enum {
Small = 3
};
+// Define the threshold value to fall back from the generic matrix-matrix product
+// implementation (heavy) to the lightweight coeff-based product one.
+// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+// in products/GeneralMatrixMatrix.h for more details.
+// TODO This threshold should also be used in the compile-time selector below.
+#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
+// This default value has been obtained on a Haswell architecture.
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
+#endif
+
namespace internal {
template<int Rows, int Cols, int Depth> struct product_type_selector;
template<int Size, int MaxSize> struct product_size_category
{
- enum { is_large = MaxSize == Dynamic ||
- Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
- (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
- value = is_large ? Large
- : Size == 1 ? 1
- : Small
+ enum {
+ #ifndef EIGEN_GPU_COMPILE_PHASE
+ is_large = MaxSize == Dynamic ||
+ Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+ (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
+ #else
+ is_large = 0,
+ #endif
+ value = is_large ? Large
+ : Size == 1 ? 1
+ : Small
};
};
@@ -148,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
template<typename Scalar,int Size,int MaxSize>
struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
{
- EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
};
template<typename Scalar,int Size>
struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
{
- EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
};
template<typename Scalar,int Size,int MaxSize>
@@ -379,10 +394,9 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
*
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/
-#ifndef __CUDACC__
-
template<typename Derived>
template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
inline const Product<Derived, OtherDerived>
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
{
@@ -412,8 +426,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived());
}
-#endif // __CUDACC__
-
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
*
* The returned product will behave like any other expressions: the coefficients of the product will be
@@ -428,7 +440,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
template<typename Derived>
template<typename OtherDerived>
const Product<Derived,OtherDerived,LazyProduct>
-MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
{
enum {
ProductIsValid = Derived::ColsAtCompileTime==Dynamic
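EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is a user-overridable macro: per the comment above, it is consulted in products/GeneralMatrixMatrix.h to decide when to bypass the heavy GEMM kernel. A hedged sketch of how a project could tune it (the value 8 is arbitrary and purely illustrative):

// Must be defined before the first Eigen include.
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 8
#include <Eigen/Dense>

Eigen::MatrixXf mul(const Eigen::MatrixXf& a, const Eigen::MatrixXf& b) {
  return a * b;  // products with all dimensions below the threshold take the
                 // lightweight coeff-based path instead of the full GEMM machinery
}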
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index ac5552d3e..b67c41d8a 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -82,7 +82,11 @@ struct default_packet_traits
HasPolygamma = 0,
HasErf = 0,
HasErfc = 0,
+ HasI0e = 0,
+ HasI1e = 0,
HasIGamma = 0,
+ HasIGammaDerA = 0,
+ HasGammaSampleDerAlpha = 0,
HasIGammac = 0,
HasBetaInc = 0,
@@ -231,7 +235,7 @@ pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
* Currently, this function is only used for scalar * complex products.
*/
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet with elements of \a *from quadrupled.
@@ -279,7 +283,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
}
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
plset(const typename unpacket_traits<Packet>::type& a) { return a; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@@ -299,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
{
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+ // do nothing
+#elif defined(EIGEN_CUDA_ARCH)
#if defined(__LP64__)
// 64-bit pointer operand constraint for inlined asm
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@@ -324,13 +330,13 @@ preduxp(const Packet* vecs) { return vecs[0]; }
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
{ return a; }
-/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+/** \internal \returns the sum of the elements of the upper and lower halves of \a a if the packet size is larger than 4.
* For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
* For packet-size smaller or equal to 4, this boils down to a noop.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
-predux_downto4(const Packet& a)
+predux_half_dowto4(const Packet& a)
{ return a; }
/** \internal \returns the product of the elements of \a a*/
@@ -487,7 +493,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
* by the current computation.
*/
template<typename Packet, int LoadMode>
-inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{
return ploadt<Packet, LoadMode>(from);
}
@@ -526,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
***************************************************************************/
// Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
+#if !defined(EIGEN_GPUCC)
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
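The renamed predux_half_dowto4 above sums the upper half of a wide packet into the lower half. Conceptually, for an 8-lane float packet the effect is the following (a scalar model of the documented behavior, not the actual SIMD code):

#include <array>

std::array<float, 4> half_reduce(const std::array<float, 8>& a) {
  // {a0,...,a7} -> {a0+a4, a1+a5, a2+a6, a3+a7}
  return {{ a[0] + a[4], a[1] + a[5], a[2] + a[6], a[3] + a[7] }};
}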
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 12828a7c3..563df6e84 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -66,6 +66,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
@@ -89,7 +90,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
-
+
/** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
*
* \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
@@ -103,17 +104,18 @@ namespace Eigen
inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
#else
- template<typename Derived,typename ScalarExponent>
- inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
- const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
- pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
- return x.derived().pow(exponent);
- }
-
- template<typename Derived>
- inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
- pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
- return x.derived().pow(exponent);
+ template <typename Derived,typename ScalarExponent>
+ EIGEN_DEVICE_FUNC inline
+ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
+ EIGEN_COMMA ScalarExponent EIGEN_COMMA
+ EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
+ pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
+ {
+ typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
+ EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
+ return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
+ typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
}
#endif
@@ -123,21 +125,21 @@ namespace Eigen
*
* Example: \include Cwise_array_power_array.cpp
* Output: \verbinclude Cwise_array_power_array.out
- *
+ *
* \sa ArrayBase::pow()
*
* \relates ArrayBase
*/
template<typename Derived,typename ExponentDerived>
inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
- pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
+ pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
{
return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
x.derived(),
exponents.derived()
);
}
-
+
/** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
*
* This function computes the coefficient-wise power between a scalar and an array of exponents.
@@ -146,7 +148,7 @@ namespace Eigen
*
* Example: \include Cwise_scalar_power_array.cpp
* Output: \verbinclude Cwise_scalar_power_array.out
- *
+ *
* \sa ArrayBase::pow()
*
* \relates ArrayBase
@@ -156,21 +158,17 @@ namespace Eigen
inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
#else
- template<typename Scalar, typename Derived>
- inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
- const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
- pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
- {
- return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
- typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
- }
-
- template<typename Derived>
- inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
- pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
- {
- return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
- typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
+ template <typename Scalar, typename Derived>
+ EIGEN_DEVICE_FUNC inline
+ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
+ const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
+ EIGEN_COMMA Scalar EIGEN_COMMA
+ EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
+ pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
+ typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
+ EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
+ return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
+ typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
}
#endif
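The rewritten pow() overloads promote a mixed scalar argument to the array's scalar type instead of rejecting it, so both of the following forms compile and evaluate coefficient-wise (an illustrative sketch, not taken from the patch's tests):

#include <Eigen/Dense>

int main() {
  Eigen::ArrayXf a = Eigen::ArrayXf::LinSpaced(4, 1.f, 4.f);  // 1 2 3 4
  Eigen::ArrayXf b = Eigen::pow(a, 2);    // int exponent promoted to float: 1 4 9 16
  Eigen::ArrayXf c = Eigen::pow(2.f, a);  // scalar base, array exponents:   2 4 8 16
  return (b.size() == 4 && c.size() == 4) ? 0 : 1;
}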
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 06d196702..c437f1a92 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
{
typedef traits<PlainObjectType> TraitsBase;
enum {
+ PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
+ ? PlainObjectType::ColsAtCompileTime
+ : PlainObjectType::RowsAtCompileTime,
+
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
- ? int(PlainObjectType::OuterStrideAtCompileTime)
+ ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
+ ? Dynamic
+ : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
: int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
@@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
inline Index outerStride() const
{
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
- : IsVectorAtCompileTime ? this->size()
- : int(Flags)&RowMajorBit ? this->cols()
- : this->rows();
+ : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+ : IsVectorAtCompileTime ? (this->size() * innerStride())
+ : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
+ : (this->rows() * innerStride());
}
/** Constructor in the fixed-size case.
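With the stricter deduction above, a Map that only specifies an inner stride now gets its compile-time outer stride computed as inner stride times the inner dimension, rather than inheriting the plain matrix's own outer stride. A small sketch (column-major fixed-size type assumed):

#include <Eigen/Dense>

int main() {
  float data[32];
  for (int i = 0; i < 32; ++i) data[i] = float(i);
  // Matrix4f is column-major; with InnerStride<2> the elements of a column are
  // 2 floats apart and the outer stride is deduced as 2 * 4 = 8 floats per column.
  Eigen::Map<Eigen::Matrix4f, 0, Eigen::InnerStride<2> > m(data);
  return int(m(1, 1));  // reads data[1*2 + 1*8] = data[10]
}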
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 020f939ad..668922ffc 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+ InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
SizeAtCompileTime = Base::SizeAtCompileTime
};
@@ -187,8 +188,11 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
{
#if EIGEN_MAX_ALIGN_BYTES>0
+ // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
+ const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
+ EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
- || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
+ || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
#endif
}
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 7a6b999af..72116e144 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T>
struct real_impl<std::complex<T> >
{
@@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T>
struct imag_impl<std::complex<T> >
{
@@ -238,7 +238,7 @@ struct imag_ref_retval
****************************************************************************/
template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct conj_impl
+struct conj_default_impl
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
@@ -248,7 +248,7 @@ struct conj_impl
};
template<typename Scalar>
-struct conj_impl<Scalar,true>
+struct conj_default_impl<Scalar,true>
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
@@ -258,6 +258,20 @@ struct conj_impl<Scalar,true>
}
};
+template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct conj_impl<std::complex<T> >
+{
+ EIGEN_DEVICE_FUNC
+ static inline std::complex<T> run(const std::complex<T>& x)
+ {
+ return std::complex<T>(x.real(), -x.imag());
+ }
+};
+#endif
+
template<typename Scalar>
struct conj_retval
{
@@ -347,31 +361,7 @@ struct norm1_retval
* Implementation of hypot *
****************************************************************************/
-template<typename Scalar>
-struct hypot_impl
-{
- typedef typename NumTraits<Scalar>::Real RealScalar;
- static inline RealScalar run(const Scalar& x, const Scalar& y)
- {
- EIGEN_USING_STD_MATH(abs);
- EIGEN_USING_STD_MATH(sqrt);
- RealScalar _x = abs(x);
- RealScalar _y = abs(y);
- Scalar p, qp;
- if(_x>_y)
- {
- p = _x;
- qp = _y / p;
- }
- else
- {
- p = _y;
- qp = _x / p;
- }
- if(p==RealScalar(0)) return RealScalar(0);
- return p * sqrt(RealScalar(1) + qp*qp);
- }
-};
+template<typename Scalar> struct hypot_impl;
template<typename Scalar>
struct hypot_retval
@@ -445,7 +435,12 @@ struct round_retval
struct arg_impl {
static inline Scalar run(const Scalar& x)
{
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ // HIP does not seem to have a native device side implementation for the math routine "arg"
+ using std::arg;
+ #else
EIGEN_USING_STD_MATH(arg);
+ #endif
return arg(x);
}
};
@@ -497,11 +492,11 @@ namespace std_fallback {
EIGEN_USING_STD_MATH(exp);
Scalar u = exp(x);
- if (u == Scalar(1)) {
+ if (numext::equal_strict(u, Scalar(1))) {
return x;
}
Scalar um1 = u - RealScalar(1);
- if (um1 == Scalar(-1)) {
+ if (numext::equal_strict(um1, Scalar(-1))) {
return RealScalar(-1);
}
@@ -512,7 +507,7 @@ namespace std_fallback {
template<typename Scalar>
struct expm1_impl {
- static inline Scalar run(const Scalar& x)
+ EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
@@ -543,13 +538,13 @@ namespace std_fallback {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x;
- return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
+ return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
}
}
template<typename Scalar>
struct log1p_impl {
- static inline Scalar run(const Scalar& x)
+ EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
@@ -689,20 +684,27 @@ struct random_default_impl<Scalar, false, true>
{
static inline Scalar run(const Scalar& x, const Scalar& y)
{
- typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
- if(y<x)
+ if (y <= x)
return x;
- // the following difference might overflow on a 32 bits system,
- // but since y>=x the result converted to an unsigned long is still correct.
- std::size_t range = ScalarX(y)-ScalarX(x);
- std::size_t offset = 0;
- // rejection sampling
- std::size_t divisor = 1;
- std::size_t multiplier = 1;
- if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
- else multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
+ // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
+ typedef typename make_unsigned<Scalar>::type ScalarU;
+ // ScalarX is the widest of ScalarU and unsigned int.
+      // We'll deal only with ScalarX and unsigned int below, thus avoiding signed
+      // types, signed arithmetic, and signed overflows (which are undefined behavior).
+ typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
+ // The following difference doesn't overflow, provided our integer types are two's
+ // complement and have the same number of padding bits in signed and unsigned variants.
+ // This is the case in most modern implementations of C++.
+ ScalarX range = ScalarX(y) - ScalarX(x);
+ ScalarX offset = 0;
+ ScalarX divisor = 1;
+ ScalarX multiplier = 1;
+ const unsigned rand_max = RAND_MAX;
+ if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
+ else multiplier = 1 + range / (rand_max + 1);
+ // Rejection sampling.
do {
- offset = (std::size_t(std::rand()) * multiplier) / divisor;
+ offset = (unsigned(std::rand()) * multiplier) / divisor;
} while (offset > range);
return Scalar(ScalarX(x) + offset);
}
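The new integer path works entirely in unsigned arithmetic and rejection-samples std::rand() into the requested range. A standalone sketch of the same scheme for plain int (assuming two's-complement integers and the usual RAND_MAX from <cstdlib>, with the same large-RAND_MAX caveats as the patch):

#include <cstdlib>

// Uniform integer in [x, y], mirroring the rejection-sampling scheme above.
int random_in_range(int x, int y) {
  if (y <= x) return x;
  unsigned range = unsigned(y) - unsigned(x);   // cannot overflow in unsigned
  unsigned divisor = 1, multiplier = 1;
  const unsigned rand_max = RAND_MAX;
  if (range <= rand_max) divisor    = (rand_max + 1) / (range + 1);
  else                   multiplier = 1 + range / (rand_max + 1);
  unsigned offset;
  do {
    offset = (unsigned(std::rand()) * multiplier) / divisor;
  } while (offset > range);                     // reject values past the range
  return int(unsigned(x) + offset);
}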
@@ -749,7 +751,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
}
-// Implementatin of is* functions
+// Implementation of is* functions
// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
@@ -778,7 +780,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isfinite_impl(const T& x)
{
- #ifdef __CUDA_ARCH__
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
@@ -793,7 +795,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isinf_impl(const T& x)
{
- #ifdef __CUDA_ARCH__
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
@@ -808,7 +810,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isnan_impl(const T& x)
{
- #ifdef __CUDA_ARCH__
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
@@ -874,7 +876,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
namespace numext {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -890,111 +892,137 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
EIGEN_USING_STD_MATH(max);
return max EIGEN_NOT_A_MACRO (x,y);
}
-
-
-#elif defined(__SYCL_DEVICE_ONLY__)
+#else
template<typename T>
+EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
{
-
return y < x ? y : x;
}
-
-template<typename T>
-EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
-{
-
- return x < y ? y : x;
-}
-
-EIGEN_ALWAYS_INLINE int mini(const int& x, const int& y)
-{
- return cl::sycl::min(x,y);
-}
-
-EIGEN_ALWAYS_INLINE int maxi(const int& x, const int& y)
-{
- return cl::sycl::max(x,y);
-}
-
-EIGEN_ALWAYS_INLINE unsigned int mini(const unsigned int& x, const unsigned int& y)
-{
- return cl::sycl::min(x,y);
-}
-
-EIGEN_ALWAYS_INLINE unsigned int maxi(const unsigned int& x, const unsigned int& y)
-{
- return cl::sycl::max(x,y);
-}
-
-EIGEN_ALWAYS_INLINE long mini(const long & x, const long & y)
-{
- return cl::sycl::min(x,y);
-}
-
-EIGEN_ALWAYS_INLINE long maxi(const long & x, const long & y)
-{
- return cl::sycl::max(x,y);
-}
-
-EIGEN_ALWAYS_INLINE unsigned long mini(const unsigned long& x, const unsigned long& y)
-{
- return cl::sycl::min(x,y);
-}
-
-EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned long& y)
-{
- return cl::sycl::max(x,y);
-}
-
-
+template<>
+EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{
- return cl::sycl::fmin(x,y);
-}
-
-EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
-{
- return cl::sycl::fmax(x,y);
+ return fminf(x, y);
}
-
+template<>
+EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
{
- return cl::sycl::fmin(x,y);
+ return fmin(x, y);
}
-
-EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
{
- return cl::sycl::fmax(x,y);
+#if defined(EIGEN_HIPCC)
+ // no "fminl" on HIP yet
+ return (x < y) ? x : y;
+#else
+ return fminl(x, y);
+#endif
}
-#else
template<typename T>
EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
{
- return y < x ? y : x;
+ return x < y ? y : x;
}
template<>
EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
+EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
{
- return fminf(x, y);
+ return fmaxf(x, y);
}
-template<typename T>
+template<>
EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
{
- return x < y ? y : x;
+ return fmax(x, y);
}
template<>
EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
{
- return fmaxf(x, y);
+#if defined(EIGEN_HIPCC)
+ // no "fmaxl" on HIP yet
+ return (x > y) ? x : y;
+#else
+ return fmaxl(x, y);
+#endif
}
#endif
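numext::mini and numext::maxi remain the portable entry points: on host they roughly go through std::min/std::max, while the float, double and long double specializations above route to fminf/fmin/fminl (resp. fmaxf/fmax/fmaxl) when compiling device code. A trivial host-side usage sketch:

#include <Eigen/Core>
#include <iostream>

int main() {
  std::cout << Eigen::numext::mini(3.0f, 7.0f) << "\n";  // 3
  std::cout << Eigen::numext::maxi(3, 7)       << "\n";  // 7
}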
+#if defined(__SYCL_DEVICE_ONLY__)
+
+
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+ SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+ SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
+ SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
+ SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
+
+#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+template<> \
+ EIGEN_DEVICE_FUNC \
+ EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
+ return cl::sycl::FUNC(x); \
+ }
+
+#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
+ SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
+ template<> \
+ EIGEN_DEVICE_FUNC \
+ EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
+ return cl::sycl::FUNC(x, y); \
+ }
+
+#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+ SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
+
+#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
+ SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
+
+#endif // defined(__SYCL_DEVICE_ONLY__)
+
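For reference, each of these specializations expands (modulo whitespace) to a full template specialization forwarding to the corresponding cl::sycl builtin; the snippet below is an illustrative expansion and only compiles inside a SYCL device translation unit:

// SYCL_SPECIALIZE_UNARY_FUNC(sqrt, sqrt, cl::sycl::cl_float) becomes:
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE cl::sycl::cl_float sqrt(const cl::sycl::cl_float& x) {
  return cl::sycl::sqrt(x);
}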
template<typename Scalar>
EIGEN_DEVICE_FUNC
@@ -1059,6 +1087,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
}
+EIGEN_DEVICE_FUNC
+inline bool abs2(bool x) { return x; }
+
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@@ -1073,6 +1104,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
}
+#if defined(__SYCL_DEVICE_ONLY__)
+ SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
+#endif // defined(__SYCL_DEVICE_ONLY__)
+
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
@@ -1081,11 +1116,10 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
-EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
-#endif // defined(__SYCL_DEVICE_ONLY__)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
+#endif //defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log1p(const float &x) { return ::log1pf(x); }
@@ -1101,8 +1135,7 @@ inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const Scala
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float pow(float x, float y) { return cl::sycl::pow(x, y); }
-EIGEN_ALWAYS_INLINE double pow(double x, double y) { return cl::sycl::pow(x, y); }
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T> EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); }
@@ -1110,12 +1143,9 @@ template<typename T> EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return inte
template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float isnan(float x) { return cl::sycl::isnan(x); }
-EIGEN_ALWAYS_INLINE double isnan(double x) { return cl::sycl::isnan(x); }
-EIGEN_ALWAYS_INLINE float isinf(float x) { return cl::sycl::isinf(x); }
-EIGEN_ALWAYS_INLINE double isinf(double x) { return cl::sycl::isinf(x); }
-EIGEN_ALWAYS_INLINE float isfinite(float x) { return cl::sycl::isfinite(x); }
-EIGEN_ALWAYS_INLINE double isfinite(double x) { return cl::sycl::isfinite(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename Scalar>
@@ -1126,8 +1156,7 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float round(float x) { return cl::sycl::round(x); }
-EIGEN_ALWAYS_INLINE double round(double x) { return cl::sycl::round(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T>
@@ -1139,11 +1168,10 @@ T (floor)(const T& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
-EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float floor(const float &x) { return ::floorf(x); }
@@ -1160,11 +1188,10 @@ T (ceil)(const T& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
-EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float ceil(const float &x) { return ::ceilf(x); }
@@ -1205,8 +1232,7 @@ T sqrt(const T &x)
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float sqrt(float x) { return cl::sycl::sqrt(x); }
-EIGEN_ALWAYS_INLINE double sqrt(double x) { return cl::sycl::sqrt(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T>
@@ -1217,12 +1243,11 @@ T log(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float log(float x) { return cl::sycl::log(x); }
-EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log(const float &x) { return ::logf(x); }
@@ -1232,17 +1257,25 @@ double log(const double &x) { return ::log(x); }
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-typename NumTraits<T>::Real abs(const T &x) {
+typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
+abs(const T &x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
+abs(const T &x) {
+ return x;
+}
+
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
-EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
+SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }
@@ -1268,16 +1301,31 @@ T exp(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
-EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float exp(const float &x) { return ::expf(x); }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double exp(const double &x) { return ::exp(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<float> exp(const std::complex<float>& x) {
+ float com = ::expf(x.real());
+ float res_real = com * ::cosf(x.imag());
+ float res_imag = com * ::sinf(x.imag());
+ return std::complex<float>(res_real, res_imag);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<double> exp(const std::complex<double>& x) {
+ double com = ::exp(x.real());
+ double res_real = com * ::cos(x.imag());
+ double res_imag = com * ::sin(x.imag());
+ return std::complex<double>(res_real, res_imag);
+}
#endif
template<typename Scalar>
@@ -1288,11 +1336,10 @@ inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
-EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float expm1(const float &x) { return ::expm1f(x); }
@@ -1308,11 +1355,10 @@ T cos(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
-EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cos(const float &x) { return ::cosf(x); }
@@ -1328,11 +1374,10 @@ T sin(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
-EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sin(const float &x) { return ::sinf(x); }
@@ -1348,11 +1393,10 @@ T tan(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
-EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tan(const float &x) { return ::tanf(x); }
@@ -1367,12 +1411,21 @@ T acos(const T &x) {
return acos(x);
}
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T acosh(const T &x) {
+ EIGEN_USING_STD_MATH(acosh);
+ return acosh(x);
+}
+#endif
+
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); }
-EIGEN_ALWAYS_INLINE double acos(double x) { return cl::sycl::acos(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float acos(const float &x) { return ::acosf(x); }
@@ -1387,12 +1440,21 @@ T asin(const T &x) {
return asin(x);
}
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T asinh(const T &x) {
+ EIGEN_USING_STD_MATH(asinh);
+ return asinh(x);
+}
+#endif
+
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); }
-EIGEN_ALWAYS_INLINE double asin(double x) { return cl::sycl::asin(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float asin(const float &x) { return ::asinf(x); }
@@ -1407,12 +1469,21 @@ T atan(const T &x) {
return atan(x);
}
+#if EIGEN_HAS_CXX11_MATH
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atanh(const T &x) {
+ EIGEN_USING_STD_MATH(atanh);
+ return atanh(x);
+}
+#endif
+
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); }
-EIGEN_ALWAYS_INLINE double atan(double x) { return cl::sycl::atan(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float atan(const float &x) { return ::atanf(x); }
@@ -1429,11 +1500,10 @@ T cosh(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
-EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cosh(const float &x) { return ::coshf(x); }
@@ -1449,11 +1519,10 @@ T sinh(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
-EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sinh(const float &x) { return ::sinhf(x); }
@@ -1468,15 +1537,16 @@ T tanh(const T &x) {
return tanh(x);
}
-#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
-EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
-#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
+#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && (!defined(__SYCL_DEVICE_ONLY__))
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
#endif
-#ifdef __CUDACC__
+#if defined(__SYCL_DEVICE_ONLY__)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
+#endif // defined(__SYCL_DEVICE_ONLY__)
+
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(const float &x) { return ::tanhf(x); }
@@ -1492,11 +1562,10 @@ T fmod(const T& a, const T& b) {
}
#if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y); }
-EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float fmod(const float& a, const float& b) {
@@ -1510,6 +1579,23 @@ double fmod(const double& a, const double& b) {
}
#endif
+#if defined(__SYCL_DEVICE_ONLY__)
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
+#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
+#undef SYCL_SPECIALIZE_UNARY_FUNC
+#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
+#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
+#undef SYCL_SPECIALIZE_BINARY_FUNC
+#endif // defined(__SYCL_DEVICE_ONLY__)
+
} // end namespace numext
namespace internal {
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index ae1386b4c..a23e93ccb 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -66,6 +66,30 @@ T generic_fast_tanh_float(const T& a_x)
return pdiv(p, q);
}
+template<typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
+{
+ EIGEN_USING_STD_MATH(sqrt);
+ RealScalar p, qp;
+ p = numext::maxi(x,y);
+ if(p==RealScalar(0)) return RealScalar(0);
+ qp = numext::mini(y,x) / p;
+ return p * sqrt(RealScalar(1) + qp*qp);
+}
+
+template<typename Scalar>
+struct hypot_impl
+{
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ static EIGEN_DEVICE_FUNC
+ inline RealScalar run(const Scalar& x, const Scalar& y)
+ {
+ EIGEN_USING_STD_MATH(abs);
+ return positive_real_hypot<RealScalar>(abs(x), abs(y));
+ }
+};
+
} // end namespace internal
} // end namespace Eigen
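positive_real_hypot factors out the larger magnitude so the squared ratio stays at most 1, avoiding the overflow a naive sqrt(x*x + y*y) hits for large inputs. The same idea in a standalone form (plain <cmath>, not the Eigen internals):

#include <cmath>
#include <algorithm>

double safe_hypot(double x, double y) {
  x = std::abs(x);
  y = std::abs(y);
  double p = std::max(x, y);
  if (p == 0.0) return 0.0;
  double q = std::min(x, y) / p;       // q <= 1, so q*q cannot overflow
  return p * std::sqrt(1.0 + q * q);   // safe_hypot(1e300, 1e300) ~= 1.414e300,
                                       // whereas std::sqrt(x*x + y*y) would give inf
}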
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 675c94e12..6046c8bae 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other);
-#ifdef __CUDACC__
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
- const Product<Derived,OtherDerived,LazyProduct>
- operator*(const MatrixBase<OtherDerived> &other) const
- { return this->lazyProduct(other); }
-#else
-
- template<typename OtherDerived>
const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const;
-#endif
-
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
@@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
Derived& setIdentity();
EIGEN_DEVICE_FUNC
Derived& setIdentity(Index rows, Index cols);
+ EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
+ EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -294,7 +287,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox()
* \sa isApprox(), operator!= */
template<typename OtherDerived>
- inline bool operator==(const MatrixBase<OtherDerived>& other) const
+ EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
{ return cwiseEqual(other).all(); }
/** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@@ -302,10 +295,10 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox()
* \sa isApprox(), operator== */
template<typename OtherDerived>
- inline bool operator!=(const MatrixBase<OtherDerived>& other) const
+ EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
{ return cwiseNotEqual(other).any(); }
- NoAlias<Derived,Eigen::MatrixBase > noalias();
+ NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
// TODO forceAlignedAccess is temporarily disabled
// Need to find a nicer workaround.
@@ -335,6 +328,7 @@ template<typename Derived> class MatrixBase
inline const PartialPivLU<PlainObject> lu() const;
+ EIGEN_DEVICE_FUNC
inline const Inverse<Derived> inverse() const;
template<typename ResultType>
@@ -344,12 +338,15 @@ template<typename Derived> class MatrixBase
bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const;
+
template<typename ResultType>
inline void computeInverseWithCheck(
ResultType& inverse,
bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const;
+
+ EIGEN_DEVICE_FUNC
Scalar determinant() const;
/////////// Cholesky module ///////////
@@ -421,15 +418,19 @@ template<typename Derived> class MatrixBase
////////// Householder module ///////////
+ EIGEN_DEVICE_FUNC
void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
template<typename EssentialPart>
+ EIGEN_DEVICE_FUNC
void makeHouseholder(EssentialPart& essential,
Scalar& tau, RealScalar& beta) const;
template<typename EssentialPart>
+ EIGEN_DEVICE_FUNC
void applyHouseholderOnTheLeft(const EssentialPart& essential,
const Scalar& tau,
Scalar* workspace);
template<typename EssentialPart>
+ EIGEN_DEVICE_FUNC
void applyHouseholderOnTheRight(const EssentialPart& essential,
const Scalar& tau,
Scalar* workspace);
@@ -437,8 +438,10 @@ template<typename Derived> class MatrixBase
///////// Jacobi module /////////
template<typename OtherScalar>
+ EIGEN_DEVICE_FUNC
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
template<typename OtherScalar>
+ EIGEN_DEVICE_FUNC
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
///////// SparseCore module /////////
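Besides the device-function annotations, setUnit(Index) and setUnit(Index, Index) are new declarations here; assuming they mirror the semantics of Matrix::Unit(), usage would look like this (illustrative sketch):

#include <Eigen/Dense>

int main() {
  Eigen::VectorXd v(5);
  v.setUnit(2);        // v becomes (0, 0, 1, 0, 0)
  Eigen::VectorXd w;
  w.setUnit(4, 0);     // resizes to length 4, then sets (1, 0, 0, 0)
  return int(v(2) + w(0));  // 2
}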
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 13adf070e..01cf192e9 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -67,25 +67,25 @@ template<typename ExpressionType> class NestByValue
}
template<int LoadMode>
- inline const PacketScalar packet(Index row, Index col) const
+ EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
{
return m_expression.template packet<LoadMode>(row, col);
}
template<int LoadMode>
- inline void writePacket(Index row, Index col, const PacketScalar& x)
+ EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
{
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
template<int LoadMode>
- inline const PacketScalar packet(Index index) const
+ EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
{
return m_expression.template packet<LoadMode>(index);
}
template<int LoadMode>
- inline void writePacket(Index index, const PacketScalar& x)
+ EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
{
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
}
@@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
/** \returns an expression of the temporary version of *this.
*/
template<typename Derived>
-inline const NestByValue<Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
DenseBase<Derived>::nestByValue() const
{
return NestByValue<Derived>(derived());
diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h
index 33908010b..570283d90 100644
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -33,6 +33,7 @@ class NoAlias
public:
typedef typename ExpressionType::Scalar Scalar;
+ EIGEN_DEVICE_FUNC
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
template<typename OtherDerived>
@@ -74,10 +75,10 @@ class NoAlias
*
* More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
* Currently, even though several expressions may alias, only product
- * expressions have this flag. Therefore, noalias() is only usefull when
+ * expressions have this flag. Therefore, noalias() is only useful when
* the source expression contains a matrix product.
*
- * Here are some examples where noalias is usefull:
+ * Here are some examples where noalias is useful:
* \code
* D.noalias() = A * B;
* D.noalias() += A.transpose() * B;
@@ -98,7 +99,7 @@ class NoAlias
* \sa class NoAlias
*/
template<typename Derived>
-NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
+NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
{
return NoAlias<Derived, Eigen::MatrixBase >(derived());
}
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index aebc0c259..b053cff07 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -21,12 +21,14 @@ template< typename T,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits10_impl
{
+ EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits10; }
};
template<typename T>
struct default_digits10_impl<T,false,false> // Floating point
{
+ EIGEN_DEVICE_FUNC
static int run() {
using std::log10;
using std::ceil;
@@ -38,6 +40,38 @@ struct default_digits10_impl<T,false,false> // Floating point
template<typename T>
struct default_digits10_impl<T,false,true> // Integer
{
+ EIGEN_DEVICE_FUNC
+ static int run() { return 0; }
+};
+
+
+// default implementation of digits(), based on numeric_limits if specialized,
+// 0 for integer types, and log2(epsilon()) otherwise.
+template< typename T,
+ bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+ bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits_impl
+{
+ EIGEN_DEVICE_FUNC
+ static int run() { return std::numeric_limits<T>::digits; }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,false> // Floating point
+{
+ EIGEN_DEVICE_FUNC
+ static int run() {
+ using std::log;
+ using std::ceil;
+ typedef typename NumTraits<T>::Real Real;
+ return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
+ }
+};
+
+template<typename T>
+struct default_digits_impl<T,false,true> // Integer
+{
+ EIGEN_DEVICE_FUNC
static int run() { return 0; }
};
@@ -119,6 +153,12 @@ template<typename T> struct GenericNumTraits
}
EIGEN_DEVICE_FUNC
+ static inline int digits()
+ {
+ return internal::default_digits_impl<T>::run();
+ }
+
+ EIGEN_DEVICE_FUNC
static inline Real dummy_precision()
{
// make sure to override this for floating-point types
@@ -215,6 +255,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
EIGEN_DEVICE_FUNC
static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+
+ static inline int digits10() { return NumTraits<Scalar>::digits10(); }
};
template<> struct NumTraits<std::string>
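As a standalone sanity check of the fallback formula introduced above (a sketch, not part of the patch; it only assumes <cmath> and <limits>), ceil(-log(eps)/log(2)) counts the fraction bits of the scalar type:

#include <cmath>
#include <cstdio>
#include <limits>

int main()
{
  const double eps = std::numeric_limits<double>::epsilon();                 // 2^-52 for IEEE double
  const int fallback = int(std::ceil(-std::log(eps) / std::log(2.0)));       // 52
  std::printf("%d vs %d\n", fallback, std::numeric_limits<double>::digits);  // prints "52 vs 53"
  // The fallback omits the implicit leading bit; it is only used when
  // std::numeric_limits is not specialized for the scalar type.
  return 0;
}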
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index b1fb455b9..acd085301 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
#endif
/** \returns the number of rows */
- inline Index rows() const { return Index(indices().size()); }
+ inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
/** \returns the number of columns */
- inline Index cols() const { return Index(indices().size()); }
+ inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
/** \returns the size of a side of the respective square matrix, i.e., the number of indices */
- inline Index size() const { return Index(indices().size()); }
+ inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 77f4f6066..da329fd4f 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
* \a data pointers.
*
+ * Here is an example using strides:
+ * \include Matrix_Map_stride.cpp
+ * Output: \verbinclude Matrix_Map_stride.out
+ *
* \see class Map
*/
//@{
@@ -776,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
resize(size);
}
- // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
+ // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@@ -917,13 +921,19 @@ namespace internal {
template <typename Derived, typename OtherDerived, bool IsVector>
struct conservative_resize_like_impl
{
+ #if EIGEN_HAS_TYPE_TRAITS
+ static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
+ #else
+ static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
+ #endif
static void run(DenseBase<Derived>& _this, Index rows, Index cols)
{
if (_this.rows() == rows && _this.cols() == cols) return;
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
- if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
- (!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns
+ if ( IsRelocatable
+ && (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
+ (!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns
{
internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
_this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@@ -951,8 +961,9 @@ struct conservative_resize_like_impl
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
- if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
- (!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns
+ if ( IsRelocatable &&
+ (( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
+ (!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns
{
const Index new_rows = other.rows() - _this.rows();
const Index new_cols = other.cols() - _this.cols();
@@ -980,13 +991,18 @@ template <typename Derived, typename OtherDerived>
struct conservative_resize_like_impl<Derived,OtherDerived,true>
: conservative_resize_like_impl<Derived,OtherDerived,false>
{
- using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
+ typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
+ using Base::run;
+ using Base::IsRelocatable;
static void run(DenseBase<Derived>& _this, Index size)
{
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
- _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+ if(IsRelocatable)
+ _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+ else
+ Base::run(_this.derived(), new_rows, new_cols);
}
static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@@ -997,7 +1013,10 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
- _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+ if(IsRelocatable)
+ _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+ else
+ Base::run(_this.derived(), new_rows, new_cols);
if (num_new_elements > 0)
_this.tail(num_new_elements) = other.tail(num_new_elements);
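For reference, a small usage sketch of conservativeResize() (not part of the patch; sizes and values are illustrative), whose in-place fast path is now additionally gated on the scalar type being relocatable:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Constant(2, 3, 1.0);
  // Column-major storage and only the number of columns changes: with a
  // trivially copyable scalar this takes the fast path shown above.
  m.conservativeResize(2, 5);      // existing coefficients are preserved
  m.rightCols(2).setZero();        // newly created coefficients are uninitialized
  std::cout << m << "\n";
  return 0;
}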
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index ae0c94b38..70790dbd4 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
- EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
- EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
@@ -116,7 +116,7 @@ class dense_product_base
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
{};
-/** Convertion to scalar for inner-products */
+/** Conversion to scalar for inner-products */
template<typename Lhs, typename Rhs, int Option>
class dense_product_base<Lhs, Rhs, Option, InnerProduct>
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@@ -127,7 +127,7 @@ public:
using Base::derived;
typedef typename Base::Scalar Scalar;
- operator const Scalar() const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
{
return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
}
@@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
public:
- EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
return internal::evaluator<Derived>(derived()).coeff(row,col);
}
- EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
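A minimal sketch of the inner-product-to-scalar conversion provided by the operator above (not part of the patch):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Vector3d u(1, 2, 3), v(4, 5, 6);
  double d = u.transpose() * v;   // 1x1 product expression converts implicitly to its scalar
  std::cout << d << "\n";         // prints 32
  return 0;
}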
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 583b7f59e..2787987e7 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -20,7 +20,7 @@ namespace internal {
/** \internal
* Evaluator of a product expression.
* Since products require special treatments to handle all possible cases,
- * we simply deffer the evaluation logic to a product_evaluator class
+ * we simply defer the evaluation logic to a product_evaluator class
* which offers more partial specialization possibilities.
*
* \sa class product_evaluator
@@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
typedef Product<Lhs, Rhs, Options> XprType;
typedef product_evaluator<XprType> Base;
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
};
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
@@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const Product<Lhs, Rhs, DefaultProduct> > XprType;
typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
{}
};
@@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
xpr.index() ))
@@ -128,7 +128,7 @@ protected:
PlainObject m_result;
};
-// The following three shortcuts are enabled only if the scalar types match excatly.
+// The following three shortcuts are enabled only if the scalar types match exactly.
// TODO: we could enable them for different scalar types when the product is not vectorized.
// Dense = Product
@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
@@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
{
call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@@ -207,11 +207,17 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
static const bool value = true;
};
+template<typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
+ const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
+ static const bool value = true;
+};
+
template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
struct assignment_from_xpr_op_product
{
template<typename SrcXprType, typename InitialFunc>
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
{
call_assignment_no_alias(dst, src.lhs(), Func1());
@@ -240,19 +246,19 @@ template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
{
template<typename Dst>
- static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
}
template<typename Dst>
- static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
}
template<typename Dst>
- static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
};
@@ -263,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
// Column major result
template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{
evaluator<Rhs> rhsEval(rhs);
- typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+ ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dst.cols();
@@ -276,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
// Row major result
template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
{
evaluator<Lhs> lhsEval(lhs);
- typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+ ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dst.rows();
@@ -294,37 +300,37 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
- struct set { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
- struct add { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
- struct sub { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+ struct set { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
+ struct add { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+ struct sub { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
struct adds {
Scalar m_scale;
explicit adds(const Scalar& s) : m_scale(s) {}
- template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+ template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
dst.const_cast_derived() += m_scale * src;
}
};
template<typename Dst>
- static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
}
template<typename Dst>
- static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
}
template<typename Dst>
- static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
}
template<typename Dst>
- static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
}
@@ -339,19 +345,19 @@ struct generic_product_impl_base
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst>
- static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
template<typename Dst>
- static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
template<typename Dst>
- static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
template<typename Dst>
- static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
};
@@ -367,7 +373,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
template<typename Dest>
- static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
LhsNested actual_lhs(lhs);
RhsNested actual_rhs(rhs);
@@ -384,26 +390,52 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst>
- static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
// but easier on the compiler side
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
}
-
+
template<typename Dst>
- static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// dst.noalias() += lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
}
template<typename Dst>
- static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// dst.noalias() -= lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
}
+
+ // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+ // dst {,+,-}= s * (A.lazyProduct(B))
+ // This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
+ // For them, this strategy is also faster than simply bypassing the heap allocation through
+ // stack allocation.
+ // For fixed-size matrices this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
+ // and the behavior also depends a lot on the compiler... so let's be conservative and enable it for dynamic-size only,
+ // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+ template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+ const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+ {
+ call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+ }
+
+ // Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
+ // overload more specialized.
+ template<typename Dst, typename LhsT, typename Func>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+ {
+ call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+ }
+
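To illustrate the kind of expression targeted by eval_dynamic above (a sketch, not part of the patch; sizes are illustrative):

#include <Eigen/Dense>
using Eigen::MatrixXd;

int main()
{
  MatrixXd A = MatrixXd::Random(100, 100), B = MatrixXd::Random(100, 100), D(100, 100);
  const double s = 2.5;
  // For dynamic sizes the scalar factor is moved out and the product is evaluated
  // lazily, roughly as D = s * (A.lazyProduct(B)), avoiding one temporary.
  D.noalias() = (s * A) * B;
  return 0;
}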
// template<typename Dst>
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
@@ -735,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dest>
- static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC
+ void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
}
@@ -779,7 +812,11 @@ public:
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
- Alignment = evaluator<MatrixType>::Alignment
+ Alignment = evaluator<MatrixType>::Alignment,
+
+ AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
+ || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+ || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
};
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -791,7 +828,10 @@ public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{
- return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+ if(AsScalarProduct)
+ return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+ else
+ return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
}
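One of the shapes the new AsScalarProduct flag identifies (a sketch, not part of the patch): a single-entry dynamic-size diagonal applied to a one-row operand degenerates to a plain scalar scaling, so only coefficient 0 of the diagonal is read.

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::VectorXd d(1);
  d << 2.0;                                       // dynamic-size diagonal with a single entry
  Eigen::RowVectorXd r = Eigen::RowVectorXd::LinSpaced(4, 1.0, 4.0);
  Eigen::RowVectorXd out = d.asDiagonal() * r;    // effectively 2.0 * r
  std::cout << out << "\n";                       // prints 2 4 6 8
  return 0;
}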
protected:
@@ -845,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
}
-#ifndef __CUDACC__
+#ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{
@@ -889,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
}
-#ifndef __CUDACC__
+#ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{
diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index 6faf789c7..486e9ed52 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
* \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
*/
template<typename Derived>
-inline Derived& DenseBase<Derived>::setRandom()
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
{
return *this = Random(rows(), cols());
}
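A trivial host-side usage sketch of setRandom() (not part of the patch):

#include <Eigen/Dense>

int main()
{
  Eigen::MatrixXd m(3, 3);
  m.setRandom();                  // equivalent to m = Eigen::MatrixXd::Random(3, 3)
  Eigen::Vector4f v;
  v.setRandom();
  return 0;
}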
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index b6e8f8887..e449ef3ac 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -23,22 +23,22 @@ namespace internal {
* Part 1 : the logic deciding a strategy for vectorization and unrolling
***************************************************************************/
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
struct redux_traits
{
public:
- typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
+ typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
enum {
PacketSize = unpacket_traits<PacketType>::size,
- InnerMaxSize = int(Derived::IsRowMajor)
- ? Derived::MaxColsAtCompileTime
- : Derived::MaxRowsAtCompileTime
+ InnerMaxSize = int(Evaluator::IsRowMajor)
+ ? Evaluator::MaxColsAtCompileTime
+ : Evaluator::MaxRowsAtCompileTime
};
enum {
- MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+ MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
&& (functor_traits<Func>::PacketAccess),
- MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+ MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
};
@@ -51,8 +51,8 @@ public:
public:
enum {
- Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
- : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+ Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+ : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
};
@@ -64,9 +64,9 @@ public:
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
- std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+ std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
std::cerr.setf(std::ios::hex, std::ios::basefield);
- EIGEN_DEBUG_VAR(Derived::Flags)
+ EIGEN_DEBUG_VAR(Evaluator::Flags)
std::cerr.unsetf(std::ios::hex);
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
@@ -87,88 +87,88 @@ public:
/*** no vectorization ***/
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
struct redux_novec_unroller
{
enum {
HalfLength = Length/2
};
- typedef typename Derived::Scalar Scalar;
+ typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+ static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
{
- return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
- redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+ return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+ redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
}
};
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
{
enum {
- outer = Start / Derived::InnerSizeAtCompileTime,
- inner = Start % Derived::InnerSizeAtCompileTime
+ outer = Start / Evaluator::InnerSizeAtCompileTime,
+ inner = Start % Evaluator::InnerSizeAtCompileTime
};
- typedef typename Derived::Scalar Scalar;
+ typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+ static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
{
- return mat.coeffByOuterInner(outer, inner);
+ return eval.coeffByOuterInner(outer, inner);
}
};
// This is actually dead code and will never be called. It is required
// to prevent false warnings regarding failed inlining though
// for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
{
- typedef typename Derived::Scalar Scalar;
+ typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+ static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
};
/*** vectorization ***/
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
struct redux_vec_unroller
{
enum {
- PacketSize = redux_traits<Func, Derived>::PacketSize,
+ PacketSize = redux_traits<Func, Evaluator>::PacketSize,
HalfLength = Length/2
};
- typedef typename Derived::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
- static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+ static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
{
return func.packetOp(
- redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
- redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+ redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+ redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
}
};
-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
{
enum {
- index = Start * redux_traits<Func, Derived>::PacketSize,
- outer = index / int(Derived::InnerSizeAtCompileTime),
- inner = index % int(Derived::InnerSizeAtCompileTime),
- alignment = Derived::Alignment
+ index = Start * redux_traits<Func, Evaluator>::PacketSize,
+ outer = index / int(Evaluator::InnerSizeAtCompileTime),
+ inner = index % int(Evaluator::InnerSizeAtCompileTime),
+ alignment = Evaluator::Alignment
};
- typedef typename Derived::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
- static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+ static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
{
- return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+ return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
}
};
@@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
* Part 3 : implementation of all cases
***************************************************************************/
-template<typename Func, typename Derived,
- int Traversal = redux_traits<Func, Derived>::Traversal,
- int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+ int Traversal = redux_traits<Func, Evaluator>::Traversal,
+ int Unrolling = redux_traits<Func, Evaluator>::Unrolling
>
struct redux_impl;
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
{
- typedef typename Derived::Scalar Scalar;
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+ typedef typename Evaluator::Scalar Scalar;
+
+ template<typename XprType>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+ Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
- eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+ eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
Scalar res;
- res = mat.coeffByOuterInner(0, 0);
- for(Index i = 1; i < mat.innerSize(); ++i)
- res = func(res, mat.coeffByOuterInner(0, i));
- for(Index i = 1; i < mat.outerSize(); ++i)
- for(Index j = 0; j < mat.innerSize(); ++j)
- res = func(res, mat.coeffByOuterInner(i, j));
+ res = eval.coeffByOuterInner(0, 0);
+ for(Index i = 1; i < xpr.innerSize(); ++i)
+ res = func(res, eval.coeffByOuterInner(0, i));
+ for(Index i = 1; i < xpr.outerSize(); ++i)
+ for(Index j = 0; j < xpr.innerSize(); ++j)
+ res = func(res, eval.coeffByOuterInner(i, j));
return res;
}
};
-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
- : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
-{};
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+ : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
+{
+ typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+ typedef typename Evaluator::Scalar Scalar;
+ template<typename XprType>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+ Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
+ {
+ return Base::run(eval,func);
+ }
+};
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
{
- typedef typename Derived::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
- static Scalar run(const Derived &mat, const Func& func)
+ template<typename XprType>
+ static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
- const Index size = mat.size();
+ const Index size = xpr.size();
- const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+ const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
enum {
- alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
- alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+ alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+ alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
};
- const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+ const Index alignedStart = internal::first_default_aligned(xpr);
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
Scalar res;
if(alignedSize)
{
- PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+ PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
{
- PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+ PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
{
- packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
- packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+ packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+ packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
}
packet_res0 = func.packetOp(packet_res0,packet_res1);
if(alignedEnd>alignedEnd2)
- packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+ packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
}
res = func.predux(packet_res0);
for(Index index = 0; index < alignedStart; ++index)
- res = func(res,mat.coeff(index));
+ res = func(res,eval.coeff(index));
for(Index index = alignedEnd; index < size; ++index)
- res = func(res,mat.coeff(index));
+ res = func(res,eval.coeff(index));
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
{
- res = mat.coeff(0);
+ res = eval.coeff(0);
for(Index index = 1; index < size; ++index)
- res = func(res,mat.coeff(index));
+ res = func(res,eval.coeff(index));
}
return res;
@@ -265,130 +277,106 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
};
// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
{
- typedef typename Derived::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketType;
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
- EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+ template<typename XprType>
+ EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
- eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
- const Index innerSize = mat.innerSize();
- const Index outerSize = mat.outerSize();
+ eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+ const Index innerSize = xpr.innerSize();
+ const Index outerSize = xpr.outerSize();
enum {
- packetSize = redux_traits<Func, Derived>::PacketSize
+ packetSize = redux_traits<Func, Evaluator>::PacketSize
};
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
Scalar res;
if(packetedInnerSize)
{
- PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+ PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
for(Index j=0; j<outerSize; ++j)
for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
- packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+ packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
res = func.predux(packet_res);
for(Index j=0; j<outerSize; ++j)
for(Index i=packetedInnerSize; i<innerSize; ++i)
- res = func(res, mat.coeffByOuterInner(j,i));
+ res = func(res, eval.coeffByOuterInner(j,i));
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
{
- res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+ res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
}
return res;
}
};
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
{
- typedef typename Derived::Scalar Scalar;
+ typedef typename Evaluator::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
enum {
- PacketSize = redux_traits<Func, Derived>::PacketSize,
- Size = Derived::SizeAtCompileTime,
+ PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+ Size = Evaluator::SizeAtCompileTime,
VectorizedSize = (Size / PacketSize) * PacketSize
};
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
+ template<typename XprType>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+ Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
{
- eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+ EIGEN_ONLY_USED_FOR_DEBUG(xpr)
+ eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
if (VectorizedSize > 0) {
- Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+ Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
if (VectorizedSize != Size)
- res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+ res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
return res;
}
else {
- return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+ return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
}
}
};
// evaluator adaptor
template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
{
+ typedef internal::evaluator<_XprType> Base;
public:
typedef _XprType XprType;
- EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+ EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
- typedef typename XprType::PacketReturnType PacketReturnType;
enum {
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
// TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
- Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+ Flags = Base::Flags & ~DirectAccessBit,
IsRowMajor = XprType::IsRowMajor,
SizeAtCompileTime = XprType::SizeAtCompileTime,
- InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
- CoeffReadCost = evaluator<XprType>::CoeffReadCost,
- Alignment = evaluator<XprType>::Alignment
+ InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
};
- EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
- EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
- EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
- EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
- EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
- EIGEN_DEVICE_FUNC
- CoeffReturnType coeff(Index row, Index col) const
- { return m_evaluator.coeff(row, col); }
-
- EIGEN_DEVICE_FUNC
- CoeffReturnType coeff(Index index) const
- { return m_evaluator.coeff(index); }
-
- template<int LoadMode, typename PacketType>
- PacketType packet(Index row, Index col) const
- { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
-
- template<int LoadMode, typename PacketType>
- PacketType packet(Index index) const
- { return m_evaluator.template packet<LoadMode,PacketType>(index); }
-
EIGEN_DEVICE_FUNC
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
- { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+ { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
template<int LoadMode, typename PacketType>
PacketType packetByOuterInner(Index outer, Index inner) const
- { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-
- const XprType & nestedExpression() const { return m_xpr; }
+ { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
-protected:
- internal::evaluator<XprType> m_evaluator;
- const XprType &m_xpr;
};
} // end namespace internal
@@ -407,7 +395,7 @@ protected:
*/
template<typename Derived>
template<typename Func>
-typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::redux(const Func& func) const
{
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
@@ -415,14 +403,16 @@ DenseBase<Derived>::redux(const Func& func) const
typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());
- return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+ // The initial expression is passed to the reducer as an additional argument instead of
+ // passing it as a member of redux_evaluator to help
+ return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
}
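For reference, a user-side sketch of redux() (not part of the patch; the functor below is illustrative and relies on Eigen's default functor_traits):

#include <Eigen/Dense>
#include <iostream>

struct add_op { double operator()(double a, double b) const { return a + b; } };

int main()
{
  Eigen::Matrix2d m;
  m << 1, 2,
       3, 4;
  // redux() folds all coefficients with a binary functor; sum(), prod(),
  // minCoeff() and maxCoeff() below are thin wrappers around it.
  std::cout << m.redux(add_op()) << "\n";   // prints 10
  return 0;
}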
/** \returns the minimum of all coefficients of \c *this.
* \warning the result is undefined if \c *this contains NaN.
*/
template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff() const
{
return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
@@ -432,7 +422,7 @@ DenseBase<Derived>::minCoeff() const
* \warning the result is undefined if \c *this contains NaN.
*/
template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff() const
{
return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
@@ -445,7 +435,7 @@ DenseBase<Derived>::maxCoeff() const
* \sa trace(), prod(), mean()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::sum() const
{
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -458,7 +448,7 @@ DenseBase<Derived>::sum() const
* \sa trace(), prod(), sum()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::mean() const
{
#ifdef __INTEL_COMPILER
@@ -479,7 +469,7 @@ DenseBase<Derived>::mean() const
* \sa sum(), mean(), trace()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::prod() const
{
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -494,7 +484,7 @@ DenseBase<Derived>::prod() const
* \sa diagonal(), sum()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
MatrixBase<Derived>::trace() const
{
return derived().diagonal().sum();
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index abb1e5121..ac9502bc4 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -95,6 +95,8 @@ protected:
template<typename Expression>
EIGEN_DEVICE_FUNC void construct(Expression& expr)
{
+ EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
+
if(PlainObjectType::RowsAtCompileTime==1)
{
eigen_assert(expr.rows()==1 || expr.cols()==1);
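To illustrate the intent of the added static assertion (a sketch, not part of the patch; the mismatched call is shown commented out):

#include <Eigen/Dense>

void scale3(Eigen::Ref<const Eigen::Vector3f> v) { (void)v; }

int main()
{
  Eigen::Vector3f a(1.f, 2.f, 3.f);
  scale3(a);            // fixed sizes match at compile time: fine
  // Eigen::Vector4f b;
  // scale3(b);         // mismatched fixed sizes: rejected at compile time
  return 0;
}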
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index 9960ef884..0b2d6d743 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
*/
template<typename Derived>
template<int RowFactor, int ColFactor>
-const Replicate<Derived,RowFactor,ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
DenseBase<Derived>::replicate() const
{
return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
* \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
*/
template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
{
return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
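A small usage sketch of replicate() (not part of the patch; values are illustrative):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::RowVector3i v(1, 2, 3);
  std::cout << v.replicate<2, 2>() << "\n\n";  // 2x6 matrix tiling v twice in each direction
  std::cout << v.replicate(2, 1) << "\n";      // run-time replication factors also work
  return 0;
}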
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index c44b7673b..11dc86d07 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
template<typename Derived>
template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
{
other.evalTo(derived());
return derived();
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 0640cda2a..8b6b3ab03 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
*
*/
template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
DenseBase<Derived>::reverse()
{
return ReverseReturnType(derived());
@@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
*
* \sa VectorwiseOp::reverseInPlace(), reverse() */
template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
{
if(cols()>rows())
{
@@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
*
* \sa DenseBase::reverseInPlace(), reverse() */
template<typename ExpressionType, int Direction>
-void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
{
internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
}
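A small usage sketch of reverse() and reverseInPlace() (not part of the patch; values are illustrative):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::VectorXi v(4);
  v << 1, 2, 3, 4;
  std::cout << v.reverse().transpose() << "\n";  // 4 3 2 1 (expression, v unchanged)
  v.reverseInPlace();                            // v itself becomes 4 3 2 1
  Eigen::MatrixXi m(2, 3);
  m << 1, 2, 3,
       4, 5, 6;
  m.rowwise().reverseInPlace();                  // reverses each row in place
  std::cout << m << "\n";
  return 0;
}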
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 504c98f0e..2cf3fa1ef 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -71,7 +71,9 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
EIGEN_DEVICE_FUNC
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
- {}
+ {
+ EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
+ }
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
@@ -189,7 +191,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
}
- typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
+ typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC
inline const ConjugateReturnType conjugate() const
@@ -322,7 +324,7 @@ public:
/** This is the const version of MatrixBase::selfadjointView() */
template<typename Derived>
template<unsigned int UpLo>
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView() const
{
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
@@ -339,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
*/
template<typename Derived>
template<unsigned int UpLo>
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView()
{
return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
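A usage sketch of selfadjointView() (not part of the patch; values are illustrative), including a mode now rejected by the added static assertion:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Matrix2d m;
  m << 1, 9,    // the strictly upper part is ignored with the Lower view
       2, 4;
  Eigen::Matrix2d s = m.selfadjointView<Eigen::Lower>();  // symmetrized from the lower triangle
  std::cout << s << "\n";                                  // prints 1 2 / 2 4
  // m.selfadjointView<Eigen::UnitLower>();  // rejected at compile time: only Lower or Upper
  return 0;
}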
diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index 719ed72a5..7c89c2e23 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -15,33 +15,29 @@ namespace Eigen {
// TODO generalize the scalar type of 'other'
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{
- typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{
- typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{
- typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{
- typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
return derived();
}
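
These four operators are what make expressions like m *= s work: each one assigns a Constant expression of the same size through the matching *_assign_op functor, so no temporary matrix filled with the scalar is materialised. A small usage sketch covering exactly the operators touched by this hunk:

    #include <Eigen/Dense>

    int main() {
      Eigen::VectorXd v = Eigen::VectorXd::Ones(4);
      v *= 3.0;   // DenseBase::operator*=  -> mul_assign_op
      v /= 2.0;   // DenseBase::operator/=  -> div_assign_op

      Eigen::ArrayXd a = v.array();
      a += 1.0;   // ArrayBase::operator+=  -> add_assign_op
      a -= 0.5;   // ArrayBase::operator-=  -> sub_assign_op
      return 0;
    }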
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index 960a58597..2bf940a26 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
template<typename Decomposition, typename RhsType>
struct solve_traits<Decomposition,RhsType,Dense>
{
- typedef Matrix<typename RhsType::Scalar,
+ typedef typename make_proper_matrix_type<typename RhsType::Scalar,
Decomposition::ColsAtCompileTime,
RhsType::ColsAtCompileTime,
RhsType::PlainObject::Options,
Decomposition::MaxColsAtCompileTime,
- RhsType::MaxColsAtCompileTime> PlainObject;
+ RhsType::MaxColsAtCompileTime>::type PlainObject;
};
template<typename Decomposition, typename RhsType>
@@ -181,7 +181,7 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
}
};
-} // end namepsace internal
+} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 049890b25..a0011d4f9 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename MatrixType, unsigned int Mode>
template<int Side, typename OtherDerived>
-void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
{
OtherDerived& other = _other.const_cast_derived();
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index 8a4adc229..702a5485c 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -56,7 +56,8 @@ class SolverBase : public EigenBase<Derived>
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
internal::traits<Derived>::MaxColsAtCompileTime>::ret),
IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
- || internal::traits<Derived>::MaxColsAtCompileTime == 1
+ || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+ NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
};
/** Default constructor */
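
The new NumDimensions constant mirrors the tensor-style rank of the solver result: 0 for a 1x1 problem, 1 for a compile-time vector, 2 otherwise. A standalone restatement of that ternary (a C++11 sketch, not Eigen code):

    // Same selection logic as the NumDimensions enum entry above.
    constexpr int num_dimensions(int max_size_at_compile_time, bool is_vector_at_compile_time) {
      return max_size_at_compile_time == 1 ? 0 : (is_vector_at_compile_time ? 1 : 2);
    }
    static_assert(num_dimensions(1, true)  == 0, "1x1 problem");
    static_assert(num_dimensions(4, true)  == 1, "fixed-size vector");
    static_assert(num_dimensions(9, false) == 2, "matrix");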
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index d2fe1e199..77ea3c261 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
ssq += (bl*invScale).squaredNorm();
}
+template<typename VectorType, typename RealScalar>
+void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
+{
+ typedef typename VectorType::Scalar Scalar;
+ const Index blockSize = 4096;
+
+ typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
+ typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
+ const VectorTypeCopy copy(vec);
+
+ enum {
+ CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
+ || (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+ ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+ && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
+ };
+ typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
+ typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
+ Index n = vec.size();
+
+ Index bi = internal::first_default_aligned(copy);
+ if (bi>0)
+ internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
+ for (; bi<n; bi+=blockSize)
+ internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
+}
+
+template<typename VectorType>
+typename VectorType::RealScalar
+stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
+{
+ using std::sqrt;
+ using std::abs;
+
+ Index n = vec.size();
+
+ if(n==1)
+ return abs(vec.coeff(0));
+
+ typedef typename VectorType::RealScalar RealScalar;
+ RealScalar scale(0);
+ RealScalar invScale(1);
+ RealScalar ssq(0); // sum of squares
+
+ stable_norm_impl_inner_step(vec, ssq, scale, invScale);
+
+ return scale * sqrt(ssq);
+}
+
+template<typename MatrixType>
+typename MatrixType::RealScalar
+stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
+{
+ using std::sqrt;
+
+ typedef typename MatrixType::RealScalar RealScalar;
+ RealScalar scale(0);
+ RealScalar invScale(1);
+ RealScalar ssq(0); // sum of squares
+
+ for(Index j=0; j<mat.outerSize(); ++j)
+ stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
+ return scale * sqrt(ssq);
+}
+
template<typename Derived>
inline typename NumTraits<typename traits<Derived>::Scalar>::Real
blueNorm_impl(const EigenBase<Derived>& _vec)
@@ -74,7 +139,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
// are used. For any specific computer, each of the assignment
// statements can be replaced
ibeta = std::numeric_limits<RealScalar>::radix; // base for floating-point numbers
- it = std::numeric_limits<RealScalar>::digits; // number of base-beta digits in mantissa
+ it = NumTraits<RealScalar>::digits(); // number of base-beta digits in mantissa
iemin = std::numeric_limits<RealScalar>::min_exponent; // minimum exponent
iemax = std::numeric_limits<RealScalar>::max_exponent; // maximum exponent
rbig = (std::numeric_limits<RealScalar>::max)(); // largest floating-point number
@@ -98,12 +163,16 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
RealScalar asml = RealScalar(0);
RealScalar amed = RealScalar(0);
RealScalar abig = RealScalar(0);
- for(typename Derived::InnerIterator it(vec, 0); it; ++it)
+
+ for(Index j=0; j<vec.outerSize(); ++j)
{
- RealScalar ax = abs(it.value());
- if(ax > ab2) abig += numext::abs2(ax*s2m);
- else if(ax < b1) asml += numext::abs2(ax*s1m);
- else amed += numext::abs2(ax);
+ for(typename Derived::InnerIterator it(vec, j); it; ++it)
+ {
+ RealScalar ax = abs(it.value());
+ if(ax > ab2) abig += numext::abs2(ax*s2m);
+ else if(ax < b1) asml += numext::abs2(ax*s1m);
+ else amed += numext::abs2(ax);
+ }
}
if(amed!=amed)
return amed; // we got a NaN
@@ -156,35 +225,7 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::stableNorm() const
{
- using std::sqrt;
- using std::abs;
- const Index blockSize = 4096;
- RealScalar scale(0);
- RealScalar invScale(1);
- RealScalar ssq(0); // sum of square
-
- typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
- typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
- DerivedCopy copy(derived());
-
- enum {
- CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
- || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
- ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
- };
- typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
- typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
- Index n = size();
-
- if(n==1)
- return abs(this->coeff(0));
-
- Index bi = internal::first_default_aligned(copy);
- if (bi>0)
- internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
- for (; bi<n; bi+=blockSize)
- internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
- return scale * sqrt(ssq);
+ return internal::stable_norm_impl(derived());
}
/** \returns the \em l2 norm of \c *this using the Blue's algorithm.
@@ -212,7 +253,10 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::hypotNorm() const
{
- return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+ if(size()==1)
+ return numext::abs(coeff(0,0));
+ else
+ return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
}
} // end namespace Eigen
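
The refactoring splits the blocked accumulation out of stableNorm() so the same kernel can be reused once per inner vector of a 2D expression. The underlying idea is the classical scaled sum of squares: track a running scale so that squaring never overflows or underflows, then return scale*sqrt(ssq). A scalar LAPACK-style sketch of that idea (not the vectorised stable_norm_kernel itself):

    #include <cmath>
    #include <vector>

    // Scaled sum-of-squares accumulation: computes sqrt(sum x_i^2) without
    // overflowing even when some |x_i| are near the representable maximum.
    double stable_norm_sketch(const std::vector<double>& x) {
      double scale = 0.0;  // largest magnitude seen so far
      double ssq = 1.0;    // sum of (x_i / scale)^2
      for (double xi : x) {
        double a = std::abs(xi);
        if (a == 0.0) continue;
        if (scale < a) {
          ssq = 1.0 + ssq * (scale / a) * (scale / a);  // rescale the previous sum
          scale = a;
        } else {
          ssq += (a / scale) * (a / scale);
        }
      }
      return scale * std::sqrt(ssq);
    }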
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 79b767bcc..d7c204579 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
nestedExpression() { return m_matrix; }
/** \internal */
+ EIGEN_DEVICE_FUNC
void resize(Index nrows, Index ncols) {
m_matrix.resize(ncols,nrows);
}
@@ -168,7 +169,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
*
* \sa transposeInPlace(), adjoint() */
template<typename Derived>
-inline Transpose<Derived>
+EIGEN_DEVICE_FUNC inline Transpose<Derived>
DenseBase<Derived>::transpose()
{
return TransposeReturnType(derived());
@@ -180,7 +181,7 @@ DenseBase<Derived>::transpose()
*
* \sa transposeInPlace(), adjoint() */
template<typename Derived>
-inline typename DenseBase<Derived>::ConstTransposeReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
DenseBase<Derived>::transpose() const
{
return ConstTransposeReturnType(derived());
@@ -206,7 +207,7 @@ DenseBase<Derived>::transpose() const
*
* \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
template<typename Derived>
-inline const typename MatrixBase<Derived>::AdjointReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
MatrixBase<Derived>::adjoint() const
{
return AdjointReturnType(this->transpose());
@@ -281,7 +282,7 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
*
* \sa transpose(), adjoint(), adjointInPlace() */
template<typename Derived>
-inline void DenseBase<Derived>::transposeInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
{
eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
&& "transposeInPlace() called on a non-square non-resizable matrix");
@@ -312,7 +313,7 @@ inline void DenseBase<Derived>::transposeInPlace()
*
* \sa transpose(), adjoint(), transposeInPlace() */
template<typename Derived>
-inline void MatrixBase<Derived>::adjointInPlace()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
{
derived() = adjoint().eval();
}
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index 19c17bb4a..81a4a5855 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -84,7 +84,7 @@ class TranspositionsBase
}
// FIXME: do we want such methods ?
- // might be usefull when the target matrix expression is complex, e.g.:
+ // might be useful when the target matrix expression is complex, e.g.:
// object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
/*
template<typename MatrixType>
@@ -384,7 +384,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
const Product<OtherDerived, Transpose, AliasFreeProduct>
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
{
- return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
+ return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
}
/** \returns the \a matrix with the inverse transpositions applied to the rows.
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 667ef09dc..521de6160 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
inline Index innerStride() const { return derived().innerStride(); }
// dummy resize function
+ EIGEN_DEVICE_FUNC
void resize(Index rows, Index cols)
{
EIGEN_UNUSED_VARIABLE(rows);
@@ -470,7 +471,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
* \a Side==OnTheRight.
*
- * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+ * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
*
* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
@@ -488,7 +489,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \sa TriangularView::solveInPlace()
*/
template<int Side, typename Other>
- EIGEN_DEVICE_FUNC
inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
solve(const MatrixBase<Other>& other) const;
@@ -497,7 +497,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
- * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+ * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
*
* See TriangularView:solve() for the details.
*/
@@ -554,7 +554,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
// FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
{
internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -564,7 +564,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
// FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
{
internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
}
@@ -573,7 +573,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
{
eigen_assert(Mode == int(OtherDerived::Mode));
@@ -583,7 +583,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
{
eigen_assert(Mode == int(OtherDerived::Mode));
internal::call_assignment_no_alias(derived(), other.derived());
@@ -598,7 +598,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
* If the matrix is triangular, the opposite part is set to zero. */
template<typename Derived>
template<typename DenseDerived>
-void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
{
evalToLazy(other.derived());
}
@@ -624,6 +624,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
*/
template<typename Derived>
template<unsigned int Mode>
+EIGEN_DEVICE_FUNC
typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
MatrixBase<Derived>::triangularView()
{
@@ -633,6 +634,7 @@ MatrixBase<Derived>::triangularView()
/** This is the const version of MatrixBase::triangularView() */
template<typename Derived>
template<unsigned int Mode>
+EIGEN_DEVICE_FUNC
typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
MatrixBase<Derived>::triangularView() const
{
@@ -715,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
{
typedef TriangularView<MatrixType,Mode> XprType;
typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+ EIGEN_DEVICE_FUNC
unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
};
@@ -930,7 +933,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
* If the matrix is triangular, the opposite part is set to zero. */
template<typename Derived>
template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
{
other.derived().resize(this->rows(), this->cols());
internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h
index d72fbf7e9..0ede5d58e 100644
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h
@@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
* most of the time this is the only way it is used.
*
- * However, if you want to directly maniputate sub-vector expressions,
+ * However, if you want to directly manipulate sub-vector expressions,
* for instance if you want to write a function returning such an expression, you
* will need to use this class.
*
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 4fe267e9f..893bc796f 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
*/
template<typename Derived>
-inline typename DenseBase<Derived>::ColwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
DenseBase<Derived>::colwise()
{
return ColwiseReturnType(derived());
@@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
* \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
*/
template<typename Derived>
-inline typename DenseBase<Derived>::RowwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
DenseBase<Derived>::rowwise()
{
return RowwiseReturnType(derived());
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 99439c8aa..7fa61969d 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
}
};
-template<> struct conj_helper<Packet8f, Packet4cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
- { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet4cf, Packet8f, false,false>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
- { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
@@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
}
};
-template<> struct conj_helper<Packet4d, Packet2cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
- { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cd, Packet4d, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
- { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 636230944..774e64981 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
}
#ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
@@ -343,9 +343,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
__m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
-
+ #if 0
+ // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
+ // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
return _mm256_permute_pd(swap_halves,5);
+ #endif
}
// pabs should be ok
@@ -412,7 +415,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}
-template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
+template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
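
predux_half_dowto4 (the renamed predux_downto4) reduces a packet to one of half its width by adding the upper and lower halves lane-wise; for Packet8f, output lane i is a[i] + a[i+4]. A scalar restatement of what the intrinsic sequence computes:

    // Scalar equivalent of predux_half_dowto4<Packet8f>.
    void predux_half_sketch(const float a[8], float out[4]) {
      for (int i = 0; i < 4; ++i)
        out[i] = a[i] + a[i + 4];  // lower half plus upper half
    }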
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 399be0ee4..ba1246722 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
// x = x + x - 1.0;
// } else { x = x - 1.0; }
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
- Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
+ Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
x = psub(x, p16f_1);
- e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
+ e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
x = padd(x, tmp);
Packet16f x2 = pmul(x, x);
@@ -119,8 +119,9 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y2);
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
- return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
- _mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
+ return _mm512_mask_blend_ps(iszero_mask,
+ _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
+ p16f_minus_inf);
}
#endif
@@ -257,50 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
- _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
- _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
-
- Packet16f neg_half = pmul(_x, p16f_minus_half);
+ Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+ __mmask16 denormal_mask = _mm512_kand(
+ _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+ _CMP_LT_OQ),
+ _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
- Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
- _mm512_setzero_ps());
+ Packet16f x = _mm512_rsqrt14_ps(_x);
// Do a single step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+ x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
- // Multiply the original _x by it's reciprocal square root to extract the
- // square root.
- return pmul(_x, x);
+ // Flush results for denormals to zero.
+ return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
- _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
- _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
- _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
-
- Packet8d neg_half = pmul(_x, p8d_minus_half);
+ Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5f));
+ __mmask16 denormal_mask = _mm512_kand(
+ _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+ _CMP_LT_OQ),
+ _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
- Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
- _mm512_setzero_pd());
+ Packet8d x = _mm512_rsqrt14_pd(_x);
- // Do a first step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+ // Do a single step of Newton's iteration.
+ x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
// Do a second step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+ x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
- // Multiply the original _x by it's reciprocal square root to extract the
- // square root.
- return pmul(_x, x);
+ return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
}
#else
template <>
@@ -333,20 +323,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
- Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
- _mm512_rsqrt14_ps(_x));
+ Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps(
- neg_mask, p16f_nan,
- _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
+ neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places.
- return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
+ return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
}
template <>
@@ -363,14 +351,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
- Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
- _mm512_rsqrt14_pd(_x));
+ Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd(
- neg_mask, p8d_nan,
- _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
+ neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@@ -379,9 +365,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places.
- return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
+ return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
}
-#else
+#elif defined(EIGEN_VECTORIZE_AVX512ER)
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x);
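
Both psqrt specialisations above start from the ~14-bit _mm512_rsqrt14 estimate, run Newton refinement on the reciprocal square root, and only then blend in the special cases (denormal inputs flushed to zero, NaN/Inf handling in prsqrt). The scalar form of one refinement step, assuming y is an estimate of 1/sqrt(x):

    #include <cmath>

    // One Newton step on a reciprocal-square-root estimate; this is the scalar
    // counterpart of the pmul/pmadd sequence used in psqrt above.
    float refined_sqrt(float x, float rsqrt_estimate) {
      float neg_half = -0.5f * x;            // pmul(_x, -0.5f)
      float y = rsqrt_estimate;              // _mm512_rsqrt14_ps(_x)
      y = y * (neg_half * y * y + 1.5f);     // y <- y * (1.5 - 0.5*x*y^2)
      return x * y;                          // sqrt(x) = x * (1/sqrt(x))
    }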
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 12b897572..9fbb256a1 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -54,6 +54,7 @@ template<> struct packet_traits<float> : default_packet_traits
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 1,
+ HasBlend = 0,
#if EIGEN_GNUC_AT_LEAST(5, 3)
#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
@@ -470,6 +471,8 @@ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
return pairs;
}
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3,
// a3}
template <>
@@ -481,6 +484,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
return x;
}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+ __m512d x = _mm512_setzero_pd();
+ x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
+ return x;
+}
+#endif
// Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
@@ -537,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
Index stride) {
- Packet16i stride_vector = _mm512_set1_epi32(stride);
+ Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -547,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
Index stride) {
- Packet8i stride_vector = _mm256_set1_epi32(stride);
+ Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@@ -558,7 +572,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
const Packet16f& from,
Index stride) {
- Packet16i stride_vector = _mm512_set1_epi32(stride);
+ Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -568,7 +582,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
const Packet8d& from,
Index stride) {
- Packet8i stride_vector = _mm256_set1_epi32(stride);
+ Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8);
@@ -590,9 +604,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
pstore(to, pa);
}
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@@ -620,13 +634,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
{
// _mm512_abs_ps intrinsic not found, so hack around it
- return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
+ return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
}
template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
// _mm512_abs_ps intrinsic not found, so hack around it
- return (__m512d)_mm512_and_si512((__m512i)a,
- _mm512_set1_epi64(0x7fffffffffffffff));
+ return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
+ _mm512_set1_epi64(0x7fffffffffffffff)));
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -646,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
- OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \
- OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
+ OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
@@ -841,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
- __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
+ __m512d final_output = _mm512_castpd256_pd512(final_0);
return _mm512_insertf64x4(final_output, final_1, 1);
}
@@ -874,7 +887,7 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
}
template <>
-EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
@@ -890,7 +903,7 @@ EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
+EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_add_pd(lane0, lane1);
@@ -1272,11 +1285,38 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
return Packet16f();
}
template <>
-EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
- const Packet8d& /*thenPacket*/,
- const Packet8d& /*elsePacket*/) {
- assert(false && "To be implemented");
- return Packet8d();
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
+ const Packet8d& thenPacket,
+ const Packet8d& elsePacket) {
+ __mmask8 m = (ifPacket.select[0] )
+ | (ifPacket.select[1]<<1)
+ | (ifPacket.select[2]<<2)
+ | (ifPacket.select[3]<<3)
+ | (ifPacket.select[4]<<4)
+ | (ifPacket.select[5]<<5)
+ | (ifPacket.select[6]<<6)
+ | (ifPacket.select[7]<<7);
+ return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pinsertfirst(const Packet16f& a, float b)
+{
+ return _mm512_mask_broadcastss_ps(a, (1), _mm_load_ss(&b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d pinsertfirst(const Packet8d& a, double b)
+{
+ return _mm512_mask_broadcastsd_pd(a, (1), _mm_load_sd(&b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pinsertlast(const Packet16f& a, float b)
+{
+ return _mm512_mask_broadcastss_ps(a, (1<<15), _mm_load_ss(&b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b)
+{
+ return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b));
}
} // end namespace internal
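
The new Packet8d pblend packs the eight selector entries into an __mmask8 and lets the masked blend pick thenPacket wherever the corresponding bit is set. The mask construction in isolation, as a scalar sketch:

    #include <cstdint>

    // Build the 8-bit blend mask from 0/1 selector values; bit i set means
    // lane i takes the "then" packet (same bit-or chain as in pblend above).
    std::uint8_t make_blend_mask(const int select[8]) {
      std::uint8_t m = 0;
      for (int i = 0; i < 8; ++i)
        if (select[i]) m |= static_cast<std::uint8_t>(1u << i);
      return m;
    }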
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 67db2f8ee..3e665730c 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
- { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
- { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
@@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
return pconj(internal::pmul(a, b));
}
};
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
- { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
-};
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
- { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index b3f1ea199..7f4e90f75 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -388,10 +388,30 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ #ifdef __VSX__
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+ Packet4f ret;
+ __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ return ret;
+ #else
+ return vec_min(a, b);
+ #endif
+}
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ #ifdef __VSX__
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+ Packet4f ret;
+ __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ return ret;
+ #else
+ return vec_max(a, b);
+ #endif
+}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@@ -434,7 +454,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
}
#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
@@ -500,7 +520,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
}
#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
EIGEN_DEBUG_ALIGNED_STORE
@@ -764,7 +784,7 @@ typedef __vector __bool long Packet2bl;
static Packet2l p2l_ONE = { 1, 1 };
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2d p2d_ONE = { 1.0, 1.0 };
+static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_MZERO = { -0.0, -0.0 };
@@ -910,9 +930,21 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+ Packet2d ret;
+ __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ return ret;
+ }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+ Packet2d ret;
+ __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ return ret;
+}
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
@@ -969,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
Packet2d v[2], sum;
v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
-
+
#ifdef _BIG_ENDIAN
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
#else
@@ -1022,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
- Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+ Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
return vec_sel(elsePacket, thenPacket, mask);
}
#endif // __VSX__
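
The VSX inline-assembly pmin/pmax variants trade roughly 10% throughput for a fixed NaN convention, matching std::min/std::max and the SSE path as noted in the comments above. Which operand gets returned matters once a NaN is involved, because std::min/std::max are defined purely through operator< and every comparison with NaN is false:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // std::min(a,b) is (b<a)?b:a, so a NaN in the first argument is returned
    // while a NaN in the second argument is dropped; prints "nan 1.000000".
    int main() {
      float nan = std::nanf("");
      std::printf("%f %f\n", (std::min)(nan, 1.0f), (std::min)(1.0f, nan));
      return 0;
    }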
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index 9c2536509..57d1201f4 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -16,7 +16,7 @@ namespace Eigen {
namespace internal {
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device
@@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
// Product
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
- Vectorizable = packet_traits<std::complex<T>>::HasMul
+ Vectorizable = packet_traits<std::complex<T> >::HasMul
};
typedef typename std::complex<T> result_type;
@@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
// Quotient
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
- Vectorizable = packet_traits<std::complex<T>>::HasDiv
+ Vectorizable = packet_traits<std::complex<T> >::HasDiv
};
typedef typename std::complex<T> result_type;
diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h
new file mode 100644
index 000000000..4cfe34e05
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -0,0 +1,29 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_CONJ_HELPER_H
+#define EIGEN_ARCH_CONJ_HELPER_H
+
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
+ template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
+ { return padd(c, pmul(x,y)); } \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
+ { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
+ }; \
+ \
+ template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
+ { return padd(c, pmul(x,y)); } \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
+ { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
+ };
+
+#endif // EIGEN_ARCH_CONJ_HELPER_H
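
For reference, instantiating the macro for the AVX single-precision pair, EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f), expands to roughly the two specialisations below, which is why the hand-written blocks could be deleted from AVX/Complex.h above (a rendering of the expansion, not code from the patch):

    template<> struct conj_helper<Packet8f, Packet4cf, false,false> {
      EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
      { return padd(c, pmul(x,y)); }
      EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
      { return Packet4cf(Eigen::internal::pmul<Packet8f>(x, y.v)); }
    };

    template<> struct conj_helper<Packet4cf, Packet8f, false,false> {
      EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
      { return padd(c, pmul(x,y)); }
      EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
      { return Packet4cf(Eigen::internal::pmul<Packet8f>(x.v, y)); }
    };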
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/GPU/Half.h
index db9878796..65b38bbfb 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/GPU/Half.h
@@ -13,7 +13,7 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@@ -26,15 +26,15 @@
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// type Eigen::half (inheriting from CUDA's __half struct) with
+// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
-#ifndef EIGEN_HALF_CUDA_H
-#define EIGEN_HALF_CUDA_H
+#ifndef EIGEN_HALF_GPU_H
+#define EIGEN_HALF_GPU_H
#if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@@ -49,39 +49,107 @@ struct half;
namespace half_impl {
-#if !defined(EIGEN_HAS_CUDA_FP16)
-
-// Make our own __half definition that is similar to CUDA's.
-struct __half {
- EIGEN_DEVICE_FUNC __half() : x(0) {}
- explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+#if !defined(EIGEN_HAS_GPU_FP16)
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+ EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
+#elif defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+// Make a __half_raw definition that is
+// ++ compatible with that of Eigen and
+// ++ add an implicit conversion to the native __half of the old HIP implementation.
+//
+// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
+//
+// In the old HIP implementation,
+// ++ __half is a typedef of __fp16
+// ++ the "__h*" routines take "__half" arguments
+// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
+// that conversion in each call to a "__h*" routine... that is why we have the "operator __half" routine
+struct __half_raw {
+ EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+ union {
+ unsigned short x;
+ __half data;
+ };
+ operator __half(void) const { return data; }
+};
+ #endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+ typedef __half __half_raw;
+ #endif // defined(EIGEN_HAS_CUDA_FP16)
+
+#elif defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+typedef cl::sycl::half __half_raw;
#endif
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
-struct half_base : public __half {
+struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
- EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
- EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+ EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+ EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+ EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
+ #else
+ EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
+ #endif
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+ #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
+ EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+ #endif
+ #endif
+#endif
};
} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
- #if !defined(EIGEN_HAS_CUDA_FP16)
- typedef half_impl::__half __half;
- #endif
+
+ // Writing this out as separate #if-else blocks to make the code easier to follow
+ // The same applies to most #if-else blocks in this file
+#if !defined(EIGEN_HAS_GPU_FP16)
+ typedef half_impl::__half_raw __half_raw;
+#elif defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+ typedef half_impl::__half_raw __half_raw;
+ #endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+ // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
+ // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ typedef half_impl::__half_raw __half_raw;
+ #endif
+#endif
EIGEN_DEVICE_FUNC half() {}
- EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+ EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ #endif
+ #endif
+#endif
+
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -136,72 +204,136 @@ struct half : public half_impl::half_base {
x = other.x;
return *this;
}
+
};
+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+ static const bool is_specialized = true;
+ static const bool is_signed = true;
+ static const bool is_integer = false;
+ static const bool is_exact = false;
+ static const bool has_infinity = true;
+ static const bool has_quiet_NaN = true;
+ static const bool has_signaling_NaN = true;
+ static const float_denorm_style has_denorm = denorm_present;
+ static const bool has_denorm_loss = false;
+ static const std::float_round_style round_style = std::round_to_nearest;
+ static const bool is_iec559 = false;
+  static const bool is_bounded = true;
+ static const bool is_modulo = false;
+ static const int digits = 11;
+ static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int radix = 2;
+ static const int min_exponent = -13;
+ static const int min_exponent10 = -4;
+ static const int max_exponent = 16;
+ static const int max_exponent10 = 4;
+ static const bool traps = true;
+ static const bool tinyness_before = false;
+
+ static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+ static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+ static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+ static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+ static Eigen::half round_error() { return Eigen::half(0.5); }
+ static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+ static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, then std::numeric_limits<const T>,
+// std::numeric_limits<volatile T>, and std::numeric_limits<const volatile T>
+// should also be specialized.
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
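With the std::numeric_limits specialization in place, generic numerical code can query Eigen::half exactly as it would float. A short usage sketch, assuming <Eigen/Core> is available (not part of the patch):

#include <Eigen/Core>
#include <limits>
#include <iostream>

int main() {
  typedef std::numeric_limits<Eigen::half> lim;
  std::cout << "epsilon = " << static_cast<float>(lim::epsilon()) << "\n";
  std::cout << "max     = " << static_cast<float>((lim::max)())   << "\n";  // 65504
  std::cout << "lowest  = " << static_cast<float>(lim::lowest())  << "\n";  // -65504
  std::cout << "digits  = " << lim::digits                        << "\n";  // 11
  if (lim::has_infinity)  // generic code can branch on the traits as usual
    std::cout << "inf     = " << static_cast<float>(lim::infinity()) << "\n";
}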
+
+namespace Eigen {
+
namespace half_impl {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Intrinsics for native fp16 support. Note that on current hardware these
// are no faster than fp32 arithmetic (the half2 versions are needed to
// benefit from the increased ALU throughput), but they do save the
// conversion steps back and forth.
-__device__ half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ return __hadd(::__half(a), ::__half(b));
+#else
return __hadd(a, b);
+#endif
}
-__device__ half operator * (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b);
}
-__device__ half operator - (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b);
}
-__device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ return __hdiv(a, b);
+#else
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
+#endif
}
-__device__ half operator - (const half& a) {
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a);
}
-__device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
a = a + b;
return a;
}
-__device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
a = a * b;
return a;
}
-__device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
a = a - b;
return a;
}
-__device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
a = a / b;
return a;
}
-__device__ bool operator == (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
return __heq(a, b);
}
-__device__ bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
return __hne(a, b);
}
-__device__ bool operator < (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
return __hlt(a, b);
}
-__device__ bool operator <= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
return __hle(a, b);
}
-__device__ bool operator > (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
return __hgt(a, b);
}
-__device__ bool operator >= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b);
}
#else // Emulate support for half floats
-// Definitions for CPUs and older CUDA, mostly working through conversion
+// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
// to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
@@ -238,10 +370,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
- return float(a) == float(b);
+ return numext::equal_strict(float(a),float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
- return float(a) != float(b);
+ return numext::not_equal_strict(float(a), float(b));
}
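numext::equal_strict and not_equal_strict keep the semantics of the raw ==/!= on floats; the point is to make the exact (non-fuzzy) comparison explicit and route it through one helper. A rough host-side illustration of the idea; the helper name below is made up and is not Eigen's implementation:

#include <cstdio>

// Illustrative helper (not Eigen's code): a bit-exact comparison routed through one
// function, so every strict floating-point comparison is explicit and easy to audit.
template <typename T>
inline bool equal_strict_sketch(const T& x, const T& y) { return x == y; }

int main() {
  float a = 1.0f, b = 1.0f;
  std::printf("%d\n", equal_strict_sketch(a, b)    ? 1 : 0);  // 1
  std::printf("%d\n", equal_strict_sketch(a, 2.0f) ? 1 : 0);  // 0
}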
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
return float(a) < float(b);
@@ -269,34 +401,36 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
// these in hardware. If we need more performance on older/other CPUs, it is
// also possible to vectorize them directly.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
- __half h;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+ __half_raw h;
h.x = x;
return h;
}
-union FP32 {
+union float32_bits {
unsigned int u;
float f;
};
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
- return __float2half(ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+ __half tmp_ff = __float2half(ff);
+ return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C)
- __half h;
+ __half_raw h;
h.x = _cvtss_sh(ff, 0);
return h;
#else
- FP32 f; f.f = ff;
+ float32_bits f; f.f = ff;
- const FP32 f32infty = { 255 << 23 };
- const FP32 f16max = { (127 + 16) << 23 };
- const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+ const float32_bits f32infty = { 255 << 23 };
+ const float32_bits f16max = { (127 + 16) << 23 };
+ const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
- __half o;
+ __half_raw o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
@@ -335,17 +469,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#endif
}
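float_to_half_rtne rounds to nearest-even, so for inputs inside the half normal range the round trip through Eigen::half should keep the relative error within half an ulp, i.e. 2^-11. A small host-side check, assuming <Eigen/Core> is available (not part of the patch):

#include <Eigen/Core>
#include <cmath>
#include <cstdio>

int main() {
  const float samples[] = {1.0f, 3.14159265f, 0.1f, 1234.5f, 7.0e-5f};
  for (float x : samples) {
    float back = static_cast<float>(Eigen::half(x));  // float -> half (RTNE) -> float
    float rel  = std::fabs(back - x) / std::fabs(x);
    std::printf("%g -> %g (rel err %.3g)\n", x, back, rel);
    // Inside the half normal range, rel should stay below 2^-11 (~4.9e-4).
  }
}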
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x);
#else
- const FP32 magic = { 113 << 23 };
+ const float32_bits magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
- FP32 o;
+ float32_bits o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
@@ -370,7 +505,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
@@ -386,7 +522,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a));
#else
return half(::expf(float(a)));
@@ -396,7 +533,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return half(::hlog(a));
#else
return half(::logf(float(a)));
@@ -409,7 +547,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
@@ -431,14 +570,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a));
#else
return half(::ceilf(float(a)));
@@ -446,7 +587,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -455,7 +597,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -496,6 +639,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half>
{
+ enum {
+ IsSigned = true,
+ IsInteger = false,
+ IsComplex = false,
+ RequireInitialization = false
+ };
+
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
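The explicit IsSigned/IsInteger/IsComplex/RequireInitialization enum together with epsilon() is what lets Eigen's generic code treat Eigen::half as an ordinary scalar. A hedged sketch of using half as a matrix scalar type, assuming <Eigen/Dense> is available (not part of the patch):

#include <Eigen/Dense>
#include <iostream>

int main() {
  typedef Eigen::Matrix<Eigen::half, 3, 3> Matrix3h;
  Matrix3h a = Matrix3h::Identity();
  Matrix3h b = a + a;                        // arithmetic carried out in half precision
  std::cout << static_cast<float>(b.trace()) << "\n";                                // 6
  std::cout << static_cast<float>(Eigen::NumTraits<Eigen::half>::epsilon()) << "\n";
}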
@@ -526,7 +676,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return Eigen::half(::hlog(a));
#else
return Eigen::half(::logf(float(a)));
@@ -560,14 +711,22 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
+
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+ #if (EIGEN_CUDACC_VER < 90000) || \
+ defined(EIGEN_HAS_HIP_FP16)
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+ #else
+ return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+ #endif
}
#endif
-// ldg() has an overload for __half, but we also need one for Eigen::half.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
@@ -575,7 +734,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
namespace Eigen {
namespace numext {
@@ -601,4 +760,4 @@ bool (isfinite)(const Eigen::half& h) {
} // namespace numext
#endif
-#endif // EIGEN_HALF_CUDA_H
+#endif // EIGEN_HALF_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/GPU/MathFunctions.h
index 987a5291c..d2b3a2568 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/GPU/MathFunctions.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
-#define EIGEN_MATH_FUNCTIONS_CUDA_H
+#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
+#define EIGEN_MATH_FUNCTIONS_GPU_H
namespace Eigen {
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{
@@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a)
} // end namespace Eigen
-#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
+#endif // EIGEN_MATH_FUNCTIONS_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index ad66399e0..ddf37b9c1 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_PACKET_MATH_CUDA_H
-#define EIGEN_PACKET_MATH_CUDA_H
+#ifndef EIGEN_PACKET_MATH_GPU_H
+#define EIGEN_PACKET_MATH_GPU_H
namespace Eigen {
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
@@ -44,7 +44,11 @@ template<> struct packet_traits<float> : default_packet_traits
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
+ HasI0e = 1,
+ HasI1e = 1,
HasIGamma = 1,
+ HasIGammaDerA = 1,
+ HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
@@ -73,7 +77,11 @@ template<> struct packet_traits<double> : default_packet_traits
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
+ HasI0e = 1,
+ HasI1e = 1,
HasIGamma = 1,
+ HasIGammaDerA = 1,
+ HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
@@ -167,10 +175,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
return make_double2(from[0], from[1]);
}
-template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
return make_float4(from[0], from[0], from[1], from[1]);
}
-template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
return make_double2(from[0], from[0]);
}
@@ -196,7 +204,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const float4*)from);
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -204,7 +212,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const double2*)from);
#else
return make_double2(from[0], from[1]);
@@ -213,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -221,7 +229,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_double2(__ldg(from+0), __ldg(from+1));
#else
return make_double2(from[0], from[1]);
@@ -291,7 +299,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
- double tmp = kernel.packet[0].y;
+ float tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;
@@ -330,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) {
} // end namespace Eigen
-#endif // EIGEN_PACKET_MATH_CUDA_H
+#endif // EIGEN_PACKET_MATH_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index b9a125b42..8787adcde 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -7,15 +7,16 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
-#define EIGEN_PACKET_MATH_HALF_CUDA_H
+#ifndef EIGEN_PACKET_MATH_HALF_GPU_H
+#define EIGEN_PACKET_MATH_HALF_GPU_H
namespace Eigen {
namespace internal {
// Most of the following operations require arch >= 3.0
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
template<> struct is_arithmetic<half2> { enum { value = true }; };
@@ -42,70 +43,108 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
-template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return half2half2(from);
+#else
+ return __half2half2(from);
+#endif
+
+#else // EIGEN_CUDA_ARCH
return __half2half2(from);
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
return *reinterpret_cast<const half2*>(from);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[1]);
}
-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[0]);
}
-template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
*reinterpret_cast<half2*>(to) = from;
}
-template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
to[0] = __low2half(from);
to[1] = __high2half(from);
}
template<>
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
-#if __CUDA_ARCH__ >= 350
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return __halves2half2((*(from+0)), (*(from+1)));
+#else
+ return __ldg((const half2*)from);
+#endif
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 350
return __ldg((const half2*)from);
#else
return __halves2half2(*(from+0), *(from+1));
#endif
+
+#endif
}
template<>
-__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
-#if __CUDA_ARCH__ >= 350
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return __halves2half2((*(from+0)), (*(from+1)));
+#else
+ return __halves2half2(__ldg(from+0), __ldg(from+1));
+#endif
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
return __halves2half2(*(from+0), *(from+1));
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
return __halves2half2(from[0*stride], from[1*stride]);
}
-template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
to[stride*0] = __low2half(from);
to[stride*1] = __high2half(from);
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
return __low2half(a);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result;
- result.x = a.x & 0x7FFF7FFF;
+ unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
+ *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
return result;
}
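The new pabs clears the sign bit of both packed halves with a single 32-bit AND instead of relying on half2's CUDA-version-dependent member layout. The same bit trick on plain integers, as a tiny host-side illustration:

#include <cstdint>
#include <cstdio>

int main() {
  // Two binary16 values packed into one 32-bit word: -2.0 (0xc000) and -1.0 (0xbc00).
  uint16_t lo = 0xc000, hi = 0xbc00;
  uint32_t packed = (uint32_t(hi) << 16) | lo;
  uint32_t abs2   = packed & 0x7FFF7FFFu;  // clear both sign bits in one operation
  std::printf("lo: 0x%04x -> 0x%04x\n", unsigned(lo), unsigned(abs2 & 0xffffu));  // 0x4000 (+2.0)
  std::printf("hi: 0x%04x -> 0x%04x\n", unsigned(hi), unsigned(abs2 >> 16));      // 0x3c00 (+1.0)
}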
-__device__ EIGEN_STRONG_INLINE void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
__half a1 = __low2half(kernel.packet[0]);
__half a2 = __high2half(kernel.packet[0]);
@@ -115,17 +154,31 @@ ptranspose(PacketBlock<half2,2>& kernel) {
kernel.packet[1] = __halves2half2(a2, b2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
return __halves2half2(a, __float2half(f));
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hadd2(a, b);
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@@ -136,10 +189,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hsub2(a, b);
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
return __hsub2(a, b);
#else
float a1 = __low2float(a);
@@ -150,22 +211,38 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
float r2 = a2 - b2;
return __floats2half2_rn(r1, r2);
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hneg2(a);
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
return __hneg2(a);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return __floats2half2_rn(-a1, -a2);
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hmul2(a, b);
+
+#else // EIGEN_CUDA_ARCH
-template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+#if EIGEN_CUDA_ARCH >= 530
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@@ -176,10 +253,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hfma2(a, b, c);
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
@@ -192,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
float r2 = a2 * b2 + c2;
return __floats2half2_rn(r1, r2);
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return h2div(a, b);
+#else
+ return __h2div(a, b);
+#endif
+
+#else // EIGEN_CUDA_ARCH
+
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -202,9 +299,11 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -214,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
return __halves2half2(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -224,18 +323,34 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
return __halves2half2(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hadd(__low2half(a), __high2half(a));
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
- return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
+ return Eigen::half(__float2half(a1 + a2));
+#endif
+
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ __half first = __low2half(a);
+ __half second = __high2half(a);
+ return __hgt(first, second) ? first : second;
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
@@ -244,10 +359,20 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
float a2 = __high2float(a);
return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ __half first = __low2half(a);
+ __half second = __high2half(a);
+ return __hlt(first, second) ? first : second;
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
@@ -256,19 +381,29 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
float a2 = __high2float(a);
return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
+
+#endif
}
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hmul(__low2half(a), __high2half(a));
+
+#else // EIGEN_CUDA_ARCH
+
+#if EIGEN_CUDA_ARCH >= 530
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
- return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
+ return Eigen::half(__float2half(a1 * a2));
+#endif
+
#endif
}
-template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = log1pf(a1);
@@ -276,7 +411,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expm1f(a1);
@@ -284,31 +419,32 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {
return h2log(a);
}
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 pexp<half2>(const half2& a) {
return h2exp(a);
}
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 psqrt<half2>(const half2& a) {
return h2sqrt(a);
}
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 prsqrt<half2>(const half2& a) {
return h2rsqrt(a);
}
#else
-template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = logf(a1);
@@ -316,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expf(a1);
@@ -324,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = sqrtf(a1);
@@ -332,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = rsqrtf(a1);
@@ -361,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 0,
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -406,11 +542,30 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* fr
}
template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
- _mm256_store_si256((__m256i*)to, from.x);
+ // (void*) -> workaround clang warning:
+ // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+ _mm256_store_si256((__m256i*)(void*)to, from.x);
}
template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
- _mm256_storeu_si256((__m256i*)to, from.x);
+ // (void*) -> workaround clang warning:
+ // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+ _mm256_storeu_si256((__m256i*)(void*)to, from.x);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h
+ploaddup<Packet16h>(const Eigen::half* from) {
+ Packet16h result;
+ unsigned short a = from[0].x;
+ unsigned short b = from[1].x;
+ unsigned short c = from[2].x;
+ unsigned short d = from[3].x;
+ unsigned short e = from[4].x;
+ unsigned short f = from[5].x;
+ unsigned short g = from[6].x;
+ unsigned short h = from[7].x;
+ result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+ return result;
}
template<> EIGEN_STRONG_INLINE Packet16h
@@ -485,6 +640,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+ // FIXME we could do that with bit manipulation
+ Packet16f af = half2float(a);
+ Packet16f rf = pnegate(af);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
@@ -492,6 +654,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, con
return float2half(rf);
}
+template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = psub(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
@@ -504,6 +673,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
return half(predux(from_float));
}
+template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+ Packet16f from_float = half2float(from);
+ return half(predux_mul(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
+ Packet16f pf[16];
+ pf[0] = half2float(p[0]);
+ pf[1] = half2float(p[1]);
+ pf[2] = half2float(p[2]);
+ pf[3] = half2float(p[3]);
+ pf[4] = half2float(p[4]);
+ pf[5] = half2float(p[5]);
+ pf[6] = half2float(p[6]);
+ pf[7] = half2float(p[7]);
+ pf[8] = half2float(p[8]);
+ pf[9] = half2float(p[9]);
+ pf[10] = half2float(p[10]);
+ pf[11] = half2float(p[11]);
+ pf[12] = half2float(p[12]);
+ pf[13] = half2float(p[13]);
+ pf[14] = half2float(p[14]);
+ pf[15] = half2float(p[15]);
+ Packet16f reduced = preduxp<Packet16f>(pf);
+ return float2half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
+{
+ __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+ Packet16h res;
+ res.x = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
+ _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
+{
+ Packet16h res;
+ res.x = _mm256_insert_epi16(a.x,b.x,0);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
+{
+ Packet16h res;
+ res.x = _mm256_insert_epi16(a.x,b.x,15);
+ return res;
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
{
Packet16h result;
@@ -611,20 +831,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
// NOTE: no unpacklo/hi instr in this case, so using permute instr.
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
- __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
- __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
- __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
- __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
- __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
- __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
- __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
- __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
- __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
- __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
- __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
- __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
- __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
- __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+ __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+ __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+ __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+ __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+ __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+ __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+ __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+ __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+ __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+ __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+ __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+ __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+ __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+ __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
kernel.packet[0].x = a_p_0;
@@ -729,10 +949,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
AlignedOnScalar = 1,
size = 8,
HasHalfPacket = 0,
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -782,6 +1002,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const
}
template<> EIGEN_STRONG_INLINE Packet8h
+ploaddup<Packet8h>(const Eigen::half* from) {
+ Packet8h result;
+ unsigned short a = from[0].x;
+ unsigned short b = from[1].x;
+ unsigned short c = from[2].x;
+ unsigned short d = from[3].x;
+ result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
+ return result;
+}
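ploaddup reads the first size/2 scalars and duplicates each one; because _mm_set_epi16 takes its arguments from the highest lane down, passing (d,d,c,c,b,b,a,a) yields the memory-order layout a,a,b,b,c,c,d,d. A scalar reference of the intended semantics (illustrative only, not the Eigen API):

#include <cstdio>

// Scalar reference for the ploaddup semantics used above: read N/2 values and
// duplicate each one in place (here N = 8, matching Packet8h).
void ploaddup_ref(const unsigned short* from, unsigned short* out) {
  for (int i = 0; i < 4; ++i) {
    out[2 * i + 0] = from[i];
    out[2 * i + 1] = from[i];
  }
}

int main() {
  unsigned short in[4] = {1, 2, 3, 4}, out[8];
  ploaddup_ref(in, out);
  for (int i = 0; i < 8; ++i) std::printf("%d ", out[i]);  // 1 1 2 2 3 3 4 4
  std::printf("\n");
}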
+
+template<> EIGEN_STRONG_INLINE Packet8h
ploadquad<Packet8h>(const Eigen::half* from) {
Packet8h result;
unsigned short a = from[0].x;
@@ -834,6 +1065,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+ // FIXME we could do that with bit manipulation
+ Packet8f af = half2float(a);
+ Packet8f rf = pnegate(af);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
@@ -841,6 +1079,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const
return float2half(rf);
}
+template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = psub(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
@@ -893,6 +1138,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h&
return Eigen::half(reduced);
}
+template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
+ Packet8f pf[8];
+ pf[0] = half2float(p[0]);
+ pf[1] = half2float(p[1]);
+ pf[2] = half2float(p[2]);
+ pf[3] = half2float(p[3]);
+ pf[4] = half2float(p[4]);
+ pf[5] = half2float(p[5]);
+ pf[6] = half2float(p[6]);
+ pf[7] = half2float(p[7]);
+ Packet8f reduced = preduxp<Packet8f>(pf);
+ return float2half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
+{
+ __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+ Packet8h res;
+ res.x = _mm_shuffle_epi8(a.x,m);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
+{
+ Packet8h res;
+ res.x = _mm_insert_epi16(a.x,int(b.x),0);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
+{
+ Packet8h res;
+ res.x = _mm_insert_epi16(a.x,int(b.x),7);
+ return res;
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet8h>
+{
+ static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
+ {
+ if (Offset!=0)
+ first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
+ }
+};
+
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,8>& kernel) {
__m128i a = kernel.packet[0].x;
@@ -1129,4 +1420,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
}
}
-#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
+#endif // EIGEN_PACKET_MATH_HALF_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
index aa5fbce8e..57a55d08b 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_TYPE_CASTING_CUDA_H
-#define EIGEN_TYPE_CASTING_CUDA_H
+#ifndef EIGEN_TYPE_CASTING_GPU_H
+#define EIGEN_TYPE_CASTING_GPU_H
namespace Eigen {
@@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(a);
#else
return Eigen::half(a);
@@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(static_cast<float>(a));
#else
return Eigen::half(static_cast<float>(a));
@@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef float result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(a);
#else
return static_cast<float>(a);
@@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
template <>
struct type_casting_traits<Eigen::half, float> {
@@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f
} // end namespace Eigen
-#endif // EIGEN_TYPE_CASTING_CUDA_H
+#endif // EIGEN_TYPE_CASTING_GPU_H
diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
new file mode 100644
index 000000000..25375a0a4
--- /dev/null
+++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
@@ -0,0 +1,23 @@
+/*
+ * math_constants.h -
+ * HIP equivalent of the CUDA header of the same name
+ */
+
+#ifndef __MATH_CONSTANTS_H__
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants */
+
+#define HIPRT_INF_F __int_as_float(0x7f800000)
+#define HIPRT_NAN_F __int_as_float(0x7fffffff)
+#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
+#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
+#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
+#define HIPRT_ZERO_F 0.0f
+#define HIPRT_ONE_F 1.0f
+
+/* double precision constants */
+#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
+#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
+
+#endif
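The HIPRT_* values mirror CUDA's math_constants.h; the bit patterns decode to the intended IEEE-754 singles. A host-side check using a memcpy bit cast in place of the device-only __int_as_float (illustrative only):

#include <cstdint>
#include <cstring>
#include <cstdio>

static float bits_to_float(uint32_t u) {  // host-side stand-in for __int_as_float
  float f;
  std::memcpy(&f, &u, sizeof f);
  return f;
}

int main() {
  std::printf("%g\n", bits_to_float(0x7f800000u));  // inf          (HIPRT_INF_F)
  std::printf("%g\n", bits_to_float(0x7f7fffffu));  // 3.40282e+38  (HIPRT_MAX_NORMAL_F, FLT_MAX)
  std::printf("%g\n", bits_to_float(0x00000001u));  // 1.4013e-45   (HIPRT_MIN_DENORM_F)
  std::printf("%g\n", bits_to_float(0x80000000u));  // -0           (HIPRT_NEG_ZERO_F)
}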
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
new file mode 100644
index 000000000..9a45cf51e
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -0,0 +1,759 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_MSA_H
+#define EIGEN_COMPLEX_MSA_H
+
+#include <iostream>
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf {
+ EIGEN_STRONG_INLINE Packet2cf() {
+ }
+ EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
+ const std::complex<float>& b) {
+ Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
+ v = t;
+ }
+ EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
+ }
+ EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
+ v = b.v;
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
+ return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+ Packet4f v1, v2;
+
+ // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+ v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
+ // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
+ v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
+ // Multiply the real a with b
+ v1 = pmul(v1, b.v);
+ // Multiply the imag a with b
+ v2 = pmul(v2, b.v);
+ // Conjugate v2
+ v2 = Packet2cf(v2).conjugate().v;
+ // Swap real/imag elements in v2.
+ v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+ // Add and return the result
+ v = padd(v1, v2);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+ return Packet2cf(*this) *= b;
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+ v = padd(v, b.v);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+ return Packet2cf(*this) += b;
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+ v = psub(v, b.v);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+ return Packet2cf(*this) -= b;
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
+ *this *= b.conjugate();
+ Packet4f s = pmul<Packet4f>(b.v, b.v);
+ s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ v = pdiv(v, s);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
+ return Packet2cf(*this) /= b;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+ return Packet2cf(pnegate(v));
+ }
+
+ Packet4f v;
+};
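operator*= above implements the usual complex product by splitting a into its real and imaginary parts, multiplying each against b, conjugating and swapping the imaginary contribution, and adding. A scalar walk-through of the same decomposition, independent of the MSA intrinsics:

#include <complex>
#include <cstdio>

// Scalar version of the lane-wise steps in Packet2cf::operator*= for one element
// a = ar + i*ai, b = br + i*bi.
std::complex<float> packet_style_mul(std::complex<float> a, std::complex<float> b) {
  float ar = a.real(), ai = a.imag();
  float v1_re = ar * b.real(), v1_im = ar * b.imag();  // v1 = ar * b
  float v2_re = ai * b.real(), v2_im = ai * b.imag();  // v2 = ai * b
  v2_im = -v2_im;                                      // conjugate v2
  float t = v2_re; v2_re = v2_im; v2_im = t;           // swap real/imag of v2
  return std::complex<float>(v1_re + v2_re, v1_im + v2_im);  // v1 + v2
}

int main() {
  std::complex<float> a(1.f, 2.f), b(3.f, -4.f);
  std::complex<float> r = packet_style_mul(a, b);
  std::printf("(%g, %g) vs (%g, %g)\n", r.real(), r.imag(),
              (a * b).real(), (a * b).imag());  // both (11, 2)
}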
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
+ os << "[ (" << value.v[0] << ", " << value.v[1]
+ << "i),"
+ " ("
+ << value.v[2] << ", " << value.v[3] << "i) ]";
+ return os;
+}
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+ typedef Packet2cf type;
+ typedef Packet2cf half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 2,
+ HasHalfPacket = 0,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0,
+ HasBlend = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet2cf> {
+ typedef std::complex<float> type;
+ enum { size = 2, alignment = Aligned16 };
+ typedef Packet2cf half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+ EIGEN_MSA_DEBUG;
+
+ float f0 = from.real(), f1 = from.imag();
+ Packet4f v0 = { f0, f0, f0, f0 };
+ Packet4f v1 = { f1, f1, f1, f1 };
+ return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_MSA_DEBUG;
+
+ return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
+ const Packet2cf& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
+ const Packet2cf& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
+ const std::complex<float>* from, Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(from[0 * stride], from[1 * stride]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
+ const Packet2cf& from,
+ Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ *to = std::complex<float>(from.v[0], from.v[1]);
+ to += stride;
+ *to = std::complex<float>(from.v[2], from.v[3]);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+ EIGEN_MSA_DEBUG;
+
+ prefetch(reinterpret_cast<const float*>(addr));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return std::complex<float>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f value = (Packet4f)preverse((Packet2d)a.v);
+ value += a.v;
+ return std::complex<float>(value[0], value[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f sum1, sum2, sum;
+
+  // sum1 gathers the first complex value of vecs[0] and vecs[1] (their lower 64-bit halves),
+  // sum2 gathers their second complex values (upper halves); adding the two reduces each input,
+  // leaving the horizontal sum of vecs[0] in the first complex element and that of vecs[1] in the second.
+ sum1 = (Packet4f)__builtin_msa_ilvr_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
+ sum2 = (Packet4f)__builtin_msa_ilvl_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
+ sum = padd(sum1, sum2);
+
+ return Packet2cf(sum);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
+ (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
+}
+
+template <int Offset>
+struct palign_impl<Offset, Packet2cf> {
+ EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) {
+ if (Offset == 1) {
+ first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8);
+ }
+ }
+};
+
+template <>
+struct conj_helper<Packet2cf, Packet2cf, false, true> {
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+ const Packet2cf& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template <>
+struct conj_helper<Packet2cf, Packet2cf, true, false> {
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+ const Packet2cf& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template <>
+struct conj_helper<Packet2cf, Packet2cf, true, true> {
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+ const Packet2cf& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a / b;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
+ os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
+ return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f tmp =
+ (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+ kernel.packet[0].v =
+ (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+ kernel.packet[1].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+ const Packet2cf& elsePacket) {
+ return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
+ (Packet2d)elsePacket.v);
+}
+
+//---------- double ----------
+
+struct Packet1cd {
+ EIGEN_STRONG_INLINE Packet1cd() {
+ }
+ EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
+ v[0] = std::real(a);
+ v[1] = std::imag(a);
+ }
+ EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
+ }
+ EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
+ v = b.v;
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
+ static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
+ return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+ Packet2d v1, v2;
+
+ // Get the real values of a | a1_re | a1_re
+ v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
+ // Get the imag values of a | a1_im | a1_im
+ v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
+ // Multiply the real a with b
+ v1 = pmul(v1, b.v);
+ // Multiply the imag a with b
+ v2 = pmul(v2, b.v);
+ // Conjugate v2
+ v2 = Packet1cd(v2).conjugate().v;
+ // Swap real/imag elements in v2.
+ v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+ // Add and return the result
+ v = padd(v1, v2);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+ return Packet1cd(*this) *= b;
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+ v = padd(v, b.v);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+ return Packet1cd(*this) += b;
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+ v = psub(v, b.v);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+ return Packet1cd(*this) -= b;
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
+ *this *= b.conjugate();
+ Packet2d s = pmul<Packet2d>(b.v, b.v);
+ s = padd(s, preverse<Packet2d>(s));
+ v = pdiv(v, s);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
+ return Packet1cd(*this) /= b;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+ return Packet1cd(pnegate(v));
+ }
+
+ Packet2d v;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
+ os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
+ return os;
+}
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+ typedef Packet1cd type;
+ typedef Packet1cd half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 0,
+ size = 1,
+ HasHalfPacket = 0,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0
+ };
+};
+
+template <>
+struct unpacket_traits<Packet1cd> {
+ typedef std::complex<double> type;
+ enum { size = 1, alignment = Aligned16 };
+ typedef Packet1cd half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_MSA_DEBUG;
+
+ return pset1<Packet1cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
+ const Packet1cd& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
+ const Packet1cd& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+ EIGEN_MSA_DEBUG;
+
+ prefetch(reinterpret_cast<const double*>(addr));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
+ const std::complex<double>* from, Index stride __attribute__((unused))) {
+ EIGEN_MSA_DEBUG;
+
+ Packet1cd res;
+ res.v[0] = std::real(from[0]);
+ res.v[1] = std::imag(from[0]);
+ return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
+ const Packet1cd& from,
+ Index stride
+ __attribute__((unused))) {
+ EIGEN_MSA_DEBUG;
+
+ pstore(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return std::complex<double>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ return vecs[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return pfirst(a);
+}
+
+template <int Offset>
+struct palign_impl<Offset, Packet1cd> {
+ static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) {
+    // FIXME: Is it certain that we never have to align a Packet1cd?
+    // Even though a std::complex<double> occupies 16 bytes, it is not necessarily aligned on a
+    // 16-byte boundary...
+ }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, false, true> {
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+ const Packet1cd& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, true, false> {
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+ const Packet1cd& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, true, true> {
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+ const Packet1cd& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a / b;
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
+ os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
+ return os;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d v1, v2;
+
+  // Gather the real parts (element 0) of the two packets into v1 and their
+  // imaginary parts (element 1) into v2.
+  v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+  v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+
+ kernel.packet[0].v = v1;
+ kernel.packet[1].v = v2;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_MSA_H
diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
new file mode 100644
index 000000000..98e23e36f
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+/* The tanh function of this file is an adaptation of
+ * template<typename T> T generic_fast_tanh_float(const T&)
+ * from MathFunctionsImpl.h.
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
+#define EIGEN_MATH_FUNCTIONS_MSA_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+plog<Packet4f>(const Packet4f& _x) {
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  // Convert a negative argument into NAN (a negative quiet NAN, to be specific).
+ Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
+ Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
+ Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
+ Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN.
+ Packet4f x = non_neg_x_or_nan;
+
+ // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
+  // N.B. the exponent is one less than what frexpf() would return.
+ Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
+ // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
+ x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
+
+ /*
+ if (x < SQRTHF) {
+ x = x + x - 1.0;
+ } else {
+ e += 1;
+ x = x - 1.0;
+ }
+ */
+ Packet4f xx = padd(x, x);
+ Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
+ e_int = psub(e_int, ge_mask);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
+ x = psub(x, p4f_1);
+ Packet4f e = __builtin_msa_ffint_s_w(e_int);
+
+ Packet4f x2 = pmul(x, x);
+ Packet4f x3 = pmul(x2, x);
+
+ Packet4f y, y1, y2;
+ y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+ y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+ y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+ y = pmadd(y, x, p4f_cephes_log_p2);
+ y1 = pmadd(y1, x, p4f_cephes_log_p5);
+ y2 = pmadd(y2, x, p4f_cephes_log_p8);
+ y = pmadd(y, x3, y1);
+ y = pmadd(y, x3, y2);
+ y = pmul(y, x3);
+
+ y = pmadd(e, p4f_cephes_log_q1, y);
+ x = __builtin_msa_fmsub_w(x, x2, p4f_half);
+ x = padd(x, y);
+ x = pmadd(e, p4f_cephes_log_q2, x);
+
+ // x is now the logarithm result candidate. We still need to handle the
+ // extreme arguments of zero and positive infinity, though.
+ // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
+ // contain infinities of both signs (see the coefficients and code above).
+ // INFINITY - INFINITY is NAN.
+
+ // If the argument is +INFINITY, make it the new result candidate.
+ // To achieve that we choose the smaller of the result candidate and the
+ // argument.
+ // This is correct for all finite pairs of values (the logarithm is smaller
+ // than the argument).
+ // This is also correct in the special case when the argument is +INFINITY
+ // and the result candidate is NAN. This is because the fmin.df instruction
+ // prefers non-NANs to NANs.
+ x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
+
+ // If the argument is zero (including -0.0), the result becomes -INFINITY.
+ Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
+
+ return x;
+}
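+
+// In short: with _x = m * 2^e and m in [0.5, 1), the code above computes
+// log(_x) = e*log(2) + log(m), where log(m) is evaluated with the cephes polynomial and
+// log(2) is applied as cephes_log_q2 + cephes_log_q1 (0.693359375 - 2.12194440e-4), a split
+// that preserves extra precision in the e*log(2) term.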
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pexp<Packet4f>(const Packet4f& _x) {
+ // Limiting single-precision pexp's argument to [-128, +128] lets pexp
+ // reach 0 and INFINITY naturally.
+ static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
+ static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+ Packet4f x = _x;
+
+ // Clamp x.
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
+ (v16u8)p4f_exp_lo);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
+ (v16u8)p4f_exp_hi);
+
+ // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+ Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
+ Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
+ Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
+ Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
+
+ x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
+ x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
+
+ Packet4f z = pmul(x, x);
+
+ Packet4f y = p4f_cephes_exp_p0;
+ y = pmadd(y, x, p4f_cephes_exp_p1);
+ y = pmadd(y, x, p4f_cephes_exp_p2);
+ y = pmadd(y, x, p4f_cephes_exp_p3);
+ y = pmadd(y, x, p4f_cephes_exp_p4);
+ y = pmadd(y, x, p4f_cephes_exp_p5);
+ y = pmadd(y, z, x);
+ y = padd(y, p4f_1);
+
+ // y *= 2**exponent.
+ y = __builtin_msa_fexp2_w(y, x2_int);
+
+ return y;
+}
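+
+// In short: the argument is reduced as r = x - n*log(2) with n = round(x * log2(e)), where
+// log(2) is split into cephes_exp_C1 + cephes_exp_C2 for accuracy; exp(r) is then
+// approximated as 1 + r + r^2 * P(r) with a degree-5 polynomial P, and the result is
+// scaled by 2^n via fexp2.w.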
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& _x) {
+ static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
+ // The monomial coefficients of the numerator polynomial (odd).
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
+ // The monomial coefficients of the denominator polynomial (even).
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
+
+ Packet4f x = pabs(_x);
+ Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
+
+ // Clamp the inputs to the range [-9, 9] since anything outside
+ // this range is -/+1.0f in single-precision.
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
+ (v16u8)p4f_tanh_hi);
+
+ // Since the polynomials are odd/even, we need x**2.
+ Packet4f x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial p.
+ Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
+ p = pmadd(x2, p, p4f_alpha_9);
+ p = pmadd(x2, p, p4f_alpha_7);
+ p = pmadd(x2, p, p4f_alpha_5);
+ p = pmadd(x2, p, p4f_alpha_3);
+ p = pmadd(x2, p, p4f_alpha_1);
+ p = pmul(x, p);
+
+ // Evaluate the denominator polynomial q.
+ Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
+ q = pmadd(x2, q, p4f_beta_2);
+ q = pmadd(x2, q, p4f_beta_0);
+
+ // Divide the numerator by the denominator.
+ p = pdiv(p, q);
+
+ // Reinstate the sign.
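+  // (binsli.w with immediate 0 copies only the left-most bit, i.e. the sign bit, of _x into p.)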
+ p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
+
+ // When the argument is very small in magnitude it's more accurate to just return it.
+ p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
+
+ return p;
+}
+
+template <bool sine>
+Packet4f psincos_inner_msa_float(const Packet4f& _x) {
+ static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi).
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi.
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+ Packet4f x = pabs(_x);
+
+ // Translate infinite arguments into NANs.
+ Packet4f zero_or_nan_if_inf = psub(_x, _x);
+ x = padd(x, zero_or_nan_if_inf);
+ // Prevent sin/cos from generating values larger than 1.0 in magnitude
+ // for very large arguments by setting x to 0.0.
+ Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
+ x = pand(x, (Packet4f)small_or_nan_mask);
+
+ // Scale x by 4/Pi to find x's octant.
+ Packet4f y = pmul(x, p4f_cephes_FOPI);
+ // Get the octant. We'll reduce x by this number of octants or by one more than it.
+ Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
+ // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
+ // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
+ // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
+ Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
+ Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
+ y = __builtin_msa_ffint_s_w(y_int2);
+
+ // Compute the sign to apply to the polynomial.
+ Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
+ : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
+
+ // Get the polynomial selection mask.
+ // We'll calculate both (sin and cos) polynomials and then select from the two.
+ Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
+
+ // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
+ // The magic pass: "Extended precision modular arithmetic"
+ // x = ((x - y * DP1) - y * DP2) - y * DP3
+ Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
+ Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
+ Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
+ x = padd(x, tmp1);
+ x = padd(x, tmp2);
+ x = padd(x, tmp3);
+
+ // Evaluate the cos(x) polynomial.
+ y = p4f_coscof_p0;
+ Packet4f z = pmul(x, x);
+ y = pmadd(y, z, p4f_coscof_p1);
+ y = pmadd(y, z, p4f_coscof_p2);
+ y = pmul(y, z);
+ y = pmul(y, z);
+ y = __builtin_msa_fmsub_w(y, z, p4f_half);
+ y = padd(y, p4f_1);
+
+ // Evaluate the sin(x) polynomial.
+ Packet4f y2 = p4f_sincof_p0;
+ y2 = pmadd(y2, z, p4f_sincof_p1);
+ y2 = pmadd(y2, z, p4f_sincof_p2);
+ y2 = pmul(y2, z);
+ y2 = pmadd(y2, x, x);
+
+ // Select the correct result from the two polynomials.
+ y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
+ : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
+
+ // Update the sign.
+ sign_mask = pxor(sign_mask, (Packet4i)y);
+ y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
+ return y;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+psin<Packet4f>(const Packet4f& x) {
+ return psincos_inner_msa_float</* sine */ true>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pcos<Packet4f>(const Packet4f& x) {
+ return psincos_inner_msa_float</* sine */ false>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
+pexp<Packet2d>(const Packet2d& _x) {
+ // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
+ // reach 0 and INFINITY naturally.
+ static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
+ static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+ static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+ static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+ static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
+
+ Packet2d x = _x;
+
+ // Clamp x.
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
+ (v16u8)p2d_exp_lo);
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
+ (v16u8)p2d_exp_hi);
+
+ // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+ Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
+ Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
+ Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
+ Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
+
+ x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
+ x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
+
+ x2 = pmul(x, x);
+
+ Packet2d px = p2d_cephes_exp_p0;
+ px = pmadd(px, x2, p2d_cephes_exp_p1);
+ px = pmadd(px, x2, p2d_cephes_exp_p2);
+ px = pmul(px, x);
+
+ Packet2d qx = p2d_cephes_exp_q0;
+ qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+ qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+ qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+ x = pdiv(px, psub(qx, px));
+ x = pmadd(p2d_2, x, p2d_1);
+
+ // x *= 2**exponent.
+ x = __builtin_msa_fexp2_d(x, x2_long);
+
+ return x;
+}
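+
+// In short: as in the single-precision pexp above, the argument is reduced to
+// r = x - n*log(2) with n = round(x * log2(e)); exp(r) is then approximated by the
+// cephes-style rational form 1 + 2*r*P(r^2) / (Q(r^2) - r*P(r^2)) before scaling by 2^n.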
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_MSA_H
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
new file mode 100644
index 000000000..094c874ee
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -0,0 +1,1317 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_MSA_H
+#define EIGEN_PACKET_MATH_MSA_H
+
+#include <iostream>
+#include <string>
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+#if 0
+#define EIGEN_MSA_DEBUG \
+ static bool firstTime = true; \
+ do { \
+ if (firstTime) { \
+ std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
+ firstTime = false; \
+ } \
+ } while (0)
+#else
+#define EIGEN_MSA_DEBUG
+#endif
+
+#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
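+// Worked examples of the immediate encoding above (result lane i is taken from source
+// lane a, b, c or d for i = 0, 1, 2, 3 respectively):
+//   EIGEN_MSA_SHF_I8(2, 3, 0, 1) == 0x4E  // swaps the two 64-bit halves of a 4x32-bit vector
+//   EIGEN_MSA_SHF_I8(1, 0, 3, 2) == 0xB1  // swaps adjacent 32-bit lanes (e.g. real/imag pairs)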
+
+typedef v4f32 Packet4f;
+typedef v4i32 Packet4i;
+typedef v4u32 Packet4ui;
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
+#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
+ os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
+ os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
+ os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+ return os;
+}
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+ typedef Packet4f type;
+ typedef Packet4f half; // Packet2f intrinsics not implemented yet
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 4,
+ HasHalfPacket = 0, // Packet2f intrinsics not implemented yet
+ // FIXME check the Has*
+ HasDiv = 1,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasTanh = EIGEN_FAST_MATH,
+ HasLog = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasBlend = 1
+ };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+ typedef Packet4i type;
+ typedef Packet4i half; // Packet2i intrinsics not implemented yet
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 4,
+ HasHalfPacket = 0, // Packet2i intrinsics not implemented yet
+ // FIXME check the Has*
+ HasDiv = 1,
+ HasBlend = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet4f> {
+ typedef float type;
+ enum { size = 4, alignment = Aligned16 };
+ typedef Packet4f half;
+};
+
+template <>
+struct unpacket_traits<Packet4i> {
+ typedef int32_t type;
+ enum { size = 4, alignment = Aligned16 };
+ typedef Packet4i half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f v = { from, from, from, from };
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fill_w(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+ EIGEN_MSA_DEBUG;
+
+ float f = *from;
+ Packet4f v = { f, f, f, f };
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fill_w(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fadd_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_addv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+ EIGEN_MSA_DEBUG;
+
+ static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
+ return padd(pset1<Packet4f>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+ EIGEN_MSA_DEBUG;
+
+ static const Packet4i countdown = { 0, 1, 2, 3 };
+ return padd(pset1<Packet4i>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fsub_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_subv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
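+  // Negate by flipping the sign bit (bit 31) of each lane.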
+ return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
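+  // Negate via two's complement: ~a + 1.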
+ return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fmul_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_mulv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fdiv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_div_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fmadd_w(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+ EIGEN_MSA_DEBUG;
+
+ // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
+ Packet4i value = c;
+ __asm__("maddv.w %w[value], %w[a], %w[b]\n"
+ // Outputs
+ : [value] "+f"(value)
+ // Inputs
+ : [a] "f"(a), [b] "f"(b));
+ return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ // This prefers numbers to NaNs.
+ return __builtin_msa_fmin_w(a, b);
+#else
+ // This prefers NaNs to numbers.
+ Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+ Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
+ return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_min_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ // This prefers numbers to NaNs.
+ return __builtin_msa_fmax_w(a, b);
+#else
+ // This prefers NaNs to numbers.
+ Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+ Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
+ return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_max_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+ EIGEN_MSA_DEBUG;
+
+ float f0 = from[0], f1 = from[1];
+ Packet4f v0 = { f0, f0, f0, f0 };
+ Packet4f v1 = { f1, f1, f1, f1 };
+ return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+ EIGEN_MSA_DEBUG;
+
+ int32_t i0 = from[0], i1 = from[1];
+ Packet4i v0 = { i0, i0, i0, i0 };
+ Packet4i v1 = { i1, i1, i1, i1 };
+ return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ float f = *from;
+ Packet4f v = { f, f, f, f };
+ v[1] = from[stride];
+ v[2] = from[2 * stride];
+ v[3] = from[3 * stride];
+ return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ int32_t i = *from;
+ Packet4i v = { i, i, i, i };
+ v[1] = from[stride];
+ v[2] = from[2 * stride];
+ v[3] = from[3 * stride];
+ return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
+ Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ *to = from[0];
+ to += stride;
+ *to = from[1];
+ to += stride;
+ *to = from[2];
+ to += stride;
+ *to = from[3];
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+ Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ *to = from[0];
+ to += stride;
+ *to = from[1];
+ to += stride;
+ *to = from[2];
+ to += stride;
+ *to = from[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+ EIGEN_MSA_DEBUG;
+
+ __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+ EIGEN_MSA_DEBUG;
+
+ __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
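+  // Take the absolute value by clearing the sign bit (bit 31) of each lane.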
+ return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i zero = __builtin_msa_ldi_w(0);
+ return __builtin_msa_add_a_w(zero, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return s[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ v4i32 tmp1, tmp2, tmp3, tmp4;
+ Packet4f sum;
+
+ tmp1 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]);
+ tmp2 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]);
+ tmp3 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]);
+ tmp4 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]);
+
+ sum = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+ sum = padd(sum, (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1));
+ sum = padd(sum, (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3));
+ sum = padd(sum, (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3));
+
+ return sum;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ v4i32 tmp1, tmp2, tmp3, tmp4;
+ Packet4i sum;
+
+ tmp1 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]);
+ tmp2 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]);
+ tmp3 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]);
+ tmp4 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]);
+
+ sum = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+ sum = padd(sum, (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1));
+ sum = padd(sum, (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3));
+ sum = padd(sum, (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3));
+
+ return sum;
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return s[0];
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return p[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ // Swap 64-bit halves of a.
+ Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+ // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+ // masks of all zeroes/ones in low 64 bits.
+ v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+ // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+ unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+ // Continue with min computation.
+ Packet4f v = __builtin_msa_fmin_w(a, swapped);
+ v = __builtin_msa_fmin_w(
+ v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+ // Based on the mask select between v and 4 qNaNs.
+ v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+ v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+ return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return m[0];
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ // Swap 64-bit halves of a.
+ Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+ // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+ // masks of all zeroes/ones in low 64 bits.
+ v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+ // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+ unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+ // Continue with max computation.
+ Packet4f v = __builtin_msa_fmax_w(a, swapped);
+ v = __builtin_msa_fmax_w(
+ v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+ // Based on the mask select between v and 4 qNaNs.
+ v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+ v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+ return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return m[0];
+}
+
+#define PALIGN_MSA(Offset, Type, Command) \
+ template <> \
+ struct palign_impl<Offset, Type> { \
+ EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \
+ if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 4)); \
+ } \
+ };
+
+PALIGN_MSA(0, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(2, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(3, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(0, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(2, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(3, Packet4i, __builtin_msa_sldi_b)
+
+#undef PALIGN_MSA
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
+ os << "[ " << value.packet[0] << "," << std::endl
+ << " " << value.packet[1] << "," << std::endl
+ << " " << value.packet[2] << "," << std::endl
+ << " " << value.packet[3] << " ]";
+ return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ v4i32 tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+ tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+ tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+ tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+
+ kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+ kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+ kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+ kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
+ os << "[ " << value.packet[0] << "," << std::endl
+ << " " << value.packet[1] << "," << std::endl
+ << " " << value.packet[2] << "," << std::endl
+ << " " << value.packet[3] << " ]";
+ return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ v4i32 tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
+ tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
+ tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
+ tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
+
+ kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+ kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+ kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+ kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fsqrt_w(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ return __builtin_msa_frsqrt_w(a);
+#else
+ Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
+ return pdiv(ones, psqrt(a));
+#endif
+}
+
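+// pfloor, pceil and pround below temporarily switch the rounding mode in the MSA Control
+// and Status Register (MSACSR, MSA control register $1): its two low bits select
+// 0 = round to nearest (ties to even), 1 = toward zero, 2 = toward +INFINITY, 3 = toward
+// -INFINITY. frint.w then rounds according to that mode before the original mode is restored.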
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.w %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.w %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.w %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+ const Packet4f& elsePacket) {
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+ ifPacket.select[3] };
+ Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+ return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+ const Packet4i& elsePacket) {
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+ ifPacket.select[3] };
+ Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+ return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+//---------- double ----------
+
+typedef v2f64 Packet2d;
+typedef v2i64 Packet2l;
+typedef v2u64 Packet2ul;
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
+#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
+ os << "[ " << value[0] << ", " << value[1] << " ]";
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
+ os << "[ " << value[0] << ", " << value[1] << " ]";
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
+ os << "[ " << value[0] << ", " << value[1] << " ]";
+ return os;
+}
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+ typedef Packet2d type;
+ typedef Packet2d half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 2,
+ HasHalfPacket = 0,
+ // FIXME check the Has*
+ HasDiv = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasBlend = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet2d> {
+ typedef double type;
+ enum { size = 2, alignment = Aligned16 };
+ typedef Packet2d half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d value = { from, from };
+ return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fadd_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+ EIGEN_MSA_DEBUG;
+
+ static const Packet2d countdown = { 0.0, 1.0 };
+ return padd(pset1<Packet2d>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fsub_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fmul_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fdiv_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fmadd_d(c, a, b);
+}
+
+// Logical operations are not supported for double, so we have to reinterpret-cast to integer
+// vectors and use the MSA integer intrinsics.
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+ EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ // This prefers numbers to NaNs.
+ return __builtin_msa_fmin_d(a, b);
+#else
+ // This prefers NaNs to numbers.
+ v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+ v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
+ return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ // This prefers numbers to NaNs.
+ return __builtin_msa_fmax_d(a, b);
+#else
+ // This prefers NaNs to numbers.
+ v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+ v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
+ return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d value = { *from, *from };
+ return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d value;
+ value[0] = *from;
+ from += stride;
+ value[1] = *from;
+ return value;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
+ Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ *to = from[0];
+ to += stride;
+ *to = from[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+ EIGEN_MSA_DEBUG;
+
+ __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d s = padd(a, preverse(a));
+ return s[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d v0 = (Packet2d)__builtin_msa_ilvev_d((v2i64)vecs[1], (v2i64)vecs[0]);
+ Packet2d v1 = (Packet2d)__builtin_msa_ilvod_d((v2i64)vecs[1], (v2i64)vecs[0]);
+
+ return padd(v0, v1);
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d p = pmul(a, preverse(a));
+ return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+ Packet2d v = __builtin_msa_fmin_d(a, swapped);
+ return v[0];
+#else
+ double a0 = a[0], a1 = a[1];
+ return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
+#endif
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+ Packet2d v = __builtin_msa_fmax_d(a, swapped);
+ return v[0];
+#else
+ double a0 = a[0], a1 = a[1];
+ return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fsqrt_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ return __builtin_msa_frsqrt_d(a);
+#else
+ Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
+ return pdiv(ones, psqrt(a));
+#endif
+}
+
+#define PALIGN_MSA(Offset, Type, Command) \
+ template <> \
+ struct palign_impl<Offset, Type> { \
+ EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \
+ if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 8)); \
+ } \
+ };
+
+PALIGN_MSA(0, Packet2d, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet2d, __builtin_msa_sldi_b)
+
+#undef PALIGN_MSA
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
+ os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]";
+ return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+ Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+ kernel.packet[0] = trn1;
+ kernel.packet[1] = trn2;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+ const Packet2d& elsePacket) {
+ Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+ Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
+ return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_MSA_H
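
The pfloor/pceil/pround specializations above all follow the same pattern: read the MSA control/status register (MSACSR, register $1), overwrite its two rounding-mode bits, execute a single frint, then restore the saved mode. As a rough scalar analogue of that save/set/round/restore sequence, written against the portable <cfenv> API instead of MSACSR (an illustrative sketch only, not part of the patch; the function name is made up):

#include <cfenv>
#include <cmath>

// Scalar sketch of the save/set/round/restore pattern used by the MSA
// pfloor/pceil/pround above. 'mode' would be FE_DOWNWARD for pfloor,
// FE_UPWARD for pceil and FE_TONEAREST for pround.
static double round_with_mode(double v, int mode) {
  const int old_mode = std::fegetround();  // save the caller's rounding mode
  std::fesetround(mode);                   // switch to the requested mode
  v = std::nearbyint(v);                   // round according to the current mode
  std::fesetround(old_mode);               // restore the saved mode
  return v;
}

The MSA code performs the same three steps with cfcmsa/ctcmsa around frint.w or frint.d, but on a whole packet at a time.
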
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 57e9b431f..306a309be 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
float32x2_t r64;
- r64 = vld1_f32((float *)&from);
+ r64 = vld1_f32((const float *)&from);
return Packet2cf(vcombine_f32(r64, r64));
}
@@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for NEON
@@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
s = vmulq_f32(b.v, b.v);
rev_s = vrev64q_f32(s);
- return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+ return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
}
EIGEN_DEVICE_FUNC inline void
@@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
@@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for NEON
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index 6bb05bb92..c48c61023 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -84,6 +84,98 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
return y;
}
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x)
+{
+ Packet4f x = _x;
+ _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+ _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000);
+
+  /* natural logarithm computed for 4 simultaneous floats;
+     returns NaN for x <= 0
+  */
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+ x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+ Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
+
+ Packet4i ux = vreinterpretq_s32_f32(x);
+
+ Packet4i emm0 = vshrq_n_s32(ux, 23);
+
+ /* keep only the fractional part */
+ ux = vandq_s32(ux, p4i_inv_mant_mask);
+ ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half));
+ x = vreinterpretq_f32_s32(ux);
+
+ emm0 = vsubq_s32(emm0, p4i_0x7f);
+ Packet4f e = vcvtq_f32_s32(emm0);
+
+ e = vaddq_f32(e, p4f_1);
+
+ /* part2:
+ if( x < SQRTHF ) {
+ e -= 1;
+ x = x + x - 1.0;
+ } else { x = x - 1.0; }
+ */
+ Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF);
+ Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+ x = vsubq_f32(x, p4f_1);
+ e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask)));
+ x = vaddq_f32(x, tmp);
+
+ Packet4f z = vmulq_f32(x,x);
+
+ Packet4f y = p4f_cephes_log_p0;
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p1);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p2);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p3);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p4);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p5);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p6);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p7);
+ y = vmulq_f32(y, x);
+ y = vaddq_f32(y, p4f_cephes_log_p8);
+ y = vmulq_f32(y, x);
+
+ y = vmulq_f32(y, z);
+
+ tmp = vmulq_f32(e, p4f_cephes_log_q1);
+ y = vaddq_f32(y, tmp);
+
+
+ tmp = vmulq_f32(z, p4f_half);
+ y = vsubq_f32(y, tmp);
+
+ tmp = vmulq_f32(e, p4f_cephes_log_q2);
+ x = vaddq_f32(x, y);
+ x = vaddq_f32(x, tmp);
+ x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+ return x;
+}
+
} // end namespace internal
} // end namespace Eigen
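
The new NEON plog above is the usual Cephes-style kernel: it splits x into a mantissa m in [sqrt(0.5), sqrt(2)) and an exponent e, evaluates a degree-9 polynomial in m - 1, and reassembles roughly log(x) = log(m) + e*log(2), forcing NaN for non-positive inputs. A scalar sketch of that decomposition (illustrative only; the function name is made up and the truncated series below stands in for the full Cephes polynomial used by the vectorized code):

#include <cmath>
#include <limits>

// Scalar sketch of the decomposition the vectorized plog above follows.
// The real kernel keeps the exact Cephes coefficients and works on four
// lanes at once.
static float log_sketch(float x) {
  if (x <= 0.0f) return std::numeric_limits<float>::quiet_NaN();
  int e;
  float m = std::frexp(x, &e);              // x = m * 2^e with m in [0.5, 1)
  if (m < 0.707106781f) { m += m; --e; }    // renormalize m into [sqrt(0.5), sqrt(2))
  const float t = m - 1.0f;
  const float p = t - 0.5f * t * t + t * t * t / 3.0f;  // log(1 + t), truncated series
  return p + e * 0.693147181f;              // add back e * log(2)
}
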
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 84a56bdcc..010739380 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -36,12 +36,43 @@ namespace internal {
#endif
#endif
+#if EIGEN_COMP_MSVC
+
+// In MSVC's arm_neon.h header file, all NEON vector types
+// are aliases to the same underlying type __n128.
+// We thus have to wrap them to make them different C++ types.
+// (See also bug 1428)
+
+template<typename T,int unique_id>
+struct eigen_packet_wrapper
+{
+ operator T&() { return m_val; }
+ operator const T&() const { return m_val; }
+ eigen_packet_wrapper() {}
+ eigen_packet_wrapper(const T &v) : m_val(v) {}
+ eigen_packet_wrapper& operator=(const T &v) {
+ m_val = v;
+ return *this;
+ }
+
+ T m_val;
+};
+typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
+typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
+typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
+typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
+typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
+
+#else
+
typedef float32x2_t Packet2f;
typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i;
typedef int32x2_t Packet2i;
typedef uint32x4_t Packet4ui;
+#endif // EIGEN_COMP_MSVC
+
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@@ -51,14 +82,17 @@ typedef uint32x4_t Packet4ui;
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
-// which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#if EIGEN_ARCH_ARM64
+ // __builtin_prefetch tends to do nothing on ARM64 compilers because the
+ // prefetch instructions there are too detailed for __builtin_prefetch to map
+ // meaningfully to them.
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
- #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#elif EIGEN_ARCH_ARM32
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else
// by default no explicit prefetching
#define EIGEN_ARM_PREFETCH(ADDR)
@@ -78,7 +112,7 @@ template<> struct packet_traits<float> : default_packet_traits
// FIXME check the Has*
HasSin = 0,
HasCos = 0,
- HasLog = 0,
+ HasLog = 1,
HasExp = 1,
HasSqrt = 0
};
@@ -113,7 +147,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from)
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
- const float32_t f[] = {0, 1, 2, 3};
+ const float f[] = {0, 1, 2, 3};
Packet4f countdown = vld1q_f32(f);
return vaddq_f32(pset1<Packet4f>(a), countdown);
}
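
On the eigen_packet_wrapper hunk above: MSVC's arm_neon.h makes float32x4_t, int32x4_t, etc. all aliases of one underlying __n128 type, so template specializations and overloads keyed on the packet type would collide. Wrapping each alias in eigen_packet_wrapper<T, unique_id> restores distinct C++ types. A minimal illustration of the idea (stand-in type and function names are hypothetical, not Eigen code):

// Stand-in for MSVC's single underlying __n128 vector type.
struct n128 { unsigned char bytes[16]; };

// Same shape as the eigen_packet_wrapper introduced above: the otherwise
// unused 'unique_id' parameter is what makes each instantiation distinct.
template <typename T, int unique_id>
struct wrapper {
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
  T m_val;
};

typedef wrapper<n128, 0> PacketA;  // distinct types, even though both wrap n128
typedef wrapper<n128, 1> PacketB;

const char* which(const PacketA&) { return "A"; }  // overloads on PacketA/PacketB
const char* which(const PacketB&) { return "B"; }  // are legal; with plain typedefs
                                                   // of n128 they would be duplicates
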
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
new file mode 100644
index 000000000..95d1fd0e4
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_NEON_H
+#define EIGEN_TYPE_CASTING_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return vcvtq_s32_f32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return vcvtq_f32_s32(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_NEON_H
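
The new NEON TypeCasting.h relies on vcvtq_s32_f32 converting with truncation toward zero, matching the _mm_cvttps_epi32-based SSE cast. A hedged usage sketch (standalone NEON code with a made-up function name, not part of the patch):

#include <arm_neon.h>

// vcvtq_s32_f32 truncates toward zero, so the float->int pcast above maps
// 1.9f -> 1 and -1.9f -> -1, mirroring _mm_cvttps_epi32 on the SSE side.
int32x4_t float_to_int_example() {
  const float vals[4] = {1.9f, -1.9f, 0.5f, -0.5f};
  const float32x4_t f = vld1q_f32(vals);
  return vcvtq_s32_f32(f);  // lanes become {1, -1, 0, 0}
}
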
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 5607fe0ab..d075043ce 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
- { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
- { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
@@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
@@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
- { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
- { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 7b5f948e1..4af2c6cae 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -242,7 +242,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
}
-/* evaluation of 4 sines at onces, using SSE2 intrinsics.
+/* evaluation of 4 sines at once, using SSE2 intrinsics.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 03c8a2c13..2e815c0c5 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -461,10 +461,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}
+#if EIGEN_COMP_PGI
+typedef const void * SsePrefetchPtrType;
+#else
+typedef const char * SsePrefetchPtrType;
+#endif
+
#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
@@ -657,7 +663,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
- return (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+ return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
// min
@@ -928,4 +934,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
} // end namespace Eigen
+#if EIGEN_COMP_PGI
+// PGI++ does not define the following intrinsics in C++ mode.
+static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
+static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
+static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
+static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
+#endif
+
#endif // EIGEN_PACKET_MATH_SSE_H
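
Two PGI workarounds land in this file: _mm_prefetch expects a void const* under PGI (hence the SsePrefetchPtrType typedef), and the _mm_cast* intrinsics are missing from PGI's C++ mode, so the patch supplies them as reference reinterpretations. To spell out what those shims do bit-wise, here is a hedged alternative written as an explicit 128-bit copy (illustrative only, with a made-up name; not what the patch uses):

#include <emmintrin.h>
#include <cstring>

// Bit-for-bit equivalent of the reinterpret_cast shim above, expressed as a
// plain copy of the 128-bit value into the destination register type.
static inline __m128d castsi128_pd_sketch(__m128i x) {
  __m128d r;
  std::memcpy(&r, &x, sizeof(r));
  return r;
}
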
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index c84893230..c6ca8c716 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -14,6 +14,7 @@ namespace Eigen {
namespace internal {
+#ifndef EIGEN_VECTORIZE_AVX
template <>
struct type_casting_traits<float, int> {
enum {
@@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
- return _mm_cvttps_epi32(a);
-}
-
-
template <>
struct type_casting_traits<int, float> {
enum {
@@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
- return _mm_cvtepi32_ps(a);
-}
-
-
template <>
struct type_casting_traits<double, float> {
enum {
@@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
- return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
-}
-
template <>
struct type_casting_traits<float, double> {
enum {
@@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
TgtCoeffRatio = 2
};
};
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return _mm_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return _mm_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+ return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
// Simply discard the second half of the input
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
new file mode 100644
index 000000000..c1da40d14
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -0,0 +1,104 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * InteropHeaders.h
+ *
+ * \brief:
+ * InteropHeaders
+ *
+*****************************************************************/
+
+#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
+#define EIGEN_INTEROP_HEADERS_SYCL_H
+#if defined EIGEN_USE_SYCL
+namespace Eigen {
+
+namespace internal {
+#define SYCL_PACKET_TRAITS(packet_type, val, unpacket_type, lengths)\
+ template<> struct packet_traits<unpacket_type> : default_packet_traits\
+ {\
+ typedef packet_type type;\
+ typedef packet_type half;\
+ enum {\
+ Vectorizable = 1,\
+ AlignedOnScalar = 1,\
+ size=lengths,\
+ HasHalfPacket = 0,\
+ HasDiv = 1,\
+ HasLog = 1,\
+ HasExp = 1,\
+ HasSqrt = 1,\
+ HasRsqrt = 1,\
+ HasSin = 1,\
+ HasCos = 1,\
+ HasTan = 1,\
+ HasASin = 1,\
+ HasACos = 1,\
+ HasATan = 1,\
+ HasSinh = 1,\
+ HasCosh = 1,\
+ HasTanh = 1,\
+ HasLGamma = 0,\
+ HasDiGamma = 0,\
+ HasZeta = 0,\
+ HasPolygamma = 0,\
+ HasErf = 0,\
+ HasErfc = 0,\
+ HasIGamma = 0,\
+ HasIGammac = 0,\
+ HasBetaInc = 0,\
+ HasBlend = val,\
+ HasMax=1,\
+ HasMin=1,\
+ HasMul=1,\
+ HasAdd=1,\
+ HasFloor=1,\
+ HasRound=1,\
+ HasLog1p=1,\
+ HasExpm1=1,\
+ HasCeil=1,\
+ };\
+ };
+
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
+#undef SYCL_PACKET_TRAITS
+
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#define SYCL_ARITHMETIC(packet_type) template<> struct is_arithmetic<packet_type> { enum { value = true }; };
+SYCL_ARITHMETIC(cl::sycl::cl_float4)
+SYCL_ARITHMETIC(cl::sycl::cl_double2)
+#undef SYCL_ARITHMETIC
+
+#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
+template<> struct unpacket_traits<packet_type> {\
+ typedef unpacket_type type;\
+ enum {size=lengths, alignment=Aligned16};\
+ typedef packet_type half;\
+};
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
+
+#undef SYCL_UNPACKET_TRAITS
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_SYCL
+#endif // EIGEN_INTEROP_HEADERS_SYCL_H
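
For reference, this is roughly what SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4) above stamps out inside namespace Eigen::internal (an abridged sketch; only a few of the Has* flags set by the macro are repeated here):

template <>
struct packet_traits<float> : default_packet_traits {
  typedef cl::sycl::cl_float4 type;
  typedef cl::sycl::cl_float4 half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,              // the 'lengths' macro argument
    HasHalfPacket = 0,
    HasDiv = 1,
    HasBlend = 1,          // the 'val' macro argument; 0 for the cl_double2 case
    // ... remaining Has* flags exactly as listed in the macro body ...
  };
};
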
diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
new file mode 100644
index 000000000..422839c6c
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -0,0 +1,221 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * MathFunctions.h
+ *
+ * \brief:
+ * MathFunctions
+ *
+*****************************************************************/
+
+#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
+#define EIGEN_MATH_FUNCTIONS_SYCL_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+//#if defined(__SYCL_DEVICE_ONLY__) && defined(EIGEN_USE_SYCL)
+#define SYCL_PLOG(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type plog<packet_type>(const packet_type& a) { return cl::sycl::log(a); }
+
+SYCL_PLOG(cl::sycl::cl_float4)
+SYCL_PLOG(cl::sycl::cl_double2)
+#undef SYCL_PLOG
+
+#define SYCL_PLOG1P(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type plog1p<packet_type>(const packet_type& a) { return cl::sycl::log1p(a); }
+
+SYCL_PLOG1P(cl::sycl::cl_float4)
+SYCL_PLOG1P(cl::sycl::cl_double2)
+#undef SYCL_PLOG1P
+
+#define SYCL_PLOG10(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type plog10<packet_type>(const packet_type& a) { return cl::sycl::log10(a); }
+
+SYCL_PLOG10(cl::sycl::cl_float4)
+SYCL_PLOG10(cl::sycl::cl_double2)
+#undef SYCL_PLOG10
+
+#define SYCL_PEXP(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pexp<packet_type>(const packet_type& a) { return cl::sycl::exp(a); }
+
+SYCL_PEXP(cl::sycl::cl_float4)
+SYCL_PEXP(cl::sycl::cl_double2)
+#undef SYCL_PEXP
+
+#define SYCL_PEXPM1(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pexpm1<packet_type>(const packet_type& a) { return cl::sycl::expm1(a); }
+
+SYCL_PEXPM1(cl::sycl::cl_float4)
+SYCL_PEXPM1(cl::sycl::cl_double2)
+#undef SYCL_PEXPM1
+
+#define SYCL_PSQRT(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type psqrt<packet_type>(const packet_type& a) { return cl::sycl::sqrt(a); }
+
+SYCL_PSQRT(cl::sycl::cl_float4)
+SYCL_PSQRT(cl::sycl::cl_double2)
+#undef SYCL_PSQRT
+
+
+#define SYCL_PRSQRT(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type prsqrt<packet_type>(const packet_type& a) { return cl::sycl::rsqrt(a); }
+
+SYCL_PRSQRT(cl::sycl::cl_float4)
+SYCL_PRSQRT(cl::sycl::cl_double2)
+#undef SYCL_PRSQRT
+
+
+/** \internal \returns the sine of \a a (coeff-wise) */
+#define SYCL_PSIN(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type psin<packet_type>(const packet_type& a) { return cl::sycl::sin(a); }
+
+SYCL_PSIN(cl::sycl::cl_float4)
+SYCL_PSIN(cl::sycl::cl_double2)
+#undef SYCL_PSIN
+
+
+/** \internal \returns the cosine of \a a (coeff-wise) */
+#define SYCL_PCOS(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pcos<packet_type>(const packet_type& a) { return cl::sycl::cos(a); }
+
+SYCL_PCOS(cl::sycl::cl_float4)
+SYCL_PCOS(cl::sycl::cl_double2)
+#undef SYCL_PCOS
+
+/** \internal \returns the tangent of \a a (coeff-wise) */
+#define SYCL_PTAN(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type ptan<packet_type>(const packet_type& a) { return cl::sycl::tan(a); }
+
+SYCL_PTAN(cl::sycl::cl_float4)
+SYCL_PTAN(cl::sycl::cl_double2)
+#undef SYCL_PTAN
+
+/** \internal \returns the arc sine of \a a (coeff-wise) */
+#define SYCL_PASIN(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pasin<packet_type>(const packet_type& a) { return cl::sycl::asin(a); }
+
+SYCL_PASIN(cl::sycl::cl_float4)
+SYCL_PASIN(cl::sycl::cl_double2)
+#undef SYCL_PASIN
+
+
+/** \internal \returns the arc cosine of \a a (coeff-wise) */
+#define SYCL_PACOS(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pacos<packet_type>(const packet_type& a) { return cl::sycl::acos(a); }
+
+SYCL_PACOS(cl::sycl::cl_float4)
+SYCL_PACOS(cl::sycl::cl_double2)
+#undef SYCL_PACOS
+
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+#define SYCL_PATAN(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type patan<packet_type>(const packet_type& a) { return cl::sycl::atan(a); }
+
+SYCL_PATAN(cl::sycl::cl_float4)
+SYCL_PATAN(cl::sycl::cl_double2)
+#undef SYCL_PATAN
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PSINH(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type psinh<packet_type>(const packet_type& a) { return cl::sycl::sinh(a); }
+
+SYCL_PSINH(cl::sycl::cl_float4)
+SYCL_PSINH(cl::sycl::cl_double2)
+#undef SYCL_PSINH
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PCOSH(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pcosh<packet_type>(const packet_type& a) { return cl::sycl::cosh(a); }
+
+SYCL_PCOSH(cl::sycl::cl_float4)
+SYCL_PCOSH(cl::sycl::cl_double2)
+#undef SYCL_PCOSH
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PTANH(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type ptanh<packet_type>(const packet_type& a) { return cl::sycl::tanh(a); }
+
+SYCL_PTANH(cl::sycl::cl_float4)
+SYCL_PTANH(cl::sycl::cl_double2)
+#undef SYCL_PTANH
+
+#define SYCL_PCEIL(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pceil<packet_type>(const packet_type& a) { return cl::sycl::ceil(a); }
+
+SYCL_PCEIL(cl::sycl::cl_float4)
+SYCL_PCEIL(cl::sycl::cl_double2)
+#undef SYCL_PCEIL
+
+
+#define SYCL_PROUND(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pround<packet_type>(const packet_type& a) { return cl::sycl::round(a); }
+
+SYCL_PROUND(cl::sycl::cl_float4)
+SYCL_PROUND(cl::sycl::cl_double2)
+#undef SYCL_PROUND
+
+#define SYCL_FLOOR(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pfloor<packet_type>(const packet_type& a) { return cl::sycl::floor(a); }
+
+SYCL_FLOOR(cl::sycl::cl_float4)
+SYCL_FLOOR(cl::sycl::cl_double2)
+#undef SYCL_FLOOR
+
+
+#define SYCL_PMIN(packet_type, expr) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
+
+SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
+SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
+#undef SYCL_PMIN
+
+#define SYCL_PMAX(packet_type, expr) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
+
+SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
+SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
+#undef SYCL_PMAX
+
+//#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_SYCL_H
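
Each of the macros above generates a one-line forwarding specialization onto the corresponding cl::sycl built-in. For instance, SYCL_PLOG(cl::sycl::cl_float4) expands, formatting aside, to the following code inside namespace Eigen::internal:

template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
cl::sycl::cl_float4 plog<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
  return cl::sycl::log(a);  // element-wise natural log on the 4-lane vector type
}
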
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
new file mode 100644
index 000000000..820a83311
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -0,0 +1,458 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * PacketMath.h
+ *
+ * \brief:
+ * PacketMath
+ *
+*****************************************************************/
+
+#ifndef EIGEN_PACKET_MATH_SYCL_H
+#define EIGEN_PACKET_MATH_SYCL_H
+#include <type_traits>
+#if defined EIGEN_USE_SYCL
+namespace Eigen {
+
+namespace internal {
+
+#define SYCL_PLOADT_RO(address_space_target)\
+template<typename packet_type, int Alignment>\
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
+ ploadt_ro(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
+ cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
+ typedef typename unpacket_traits<packet_type>::type scalar;\
+ typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
+ auto res=packet_type(static_cast<typename unpacket_traits<packet_type>::type>(0));\
+ res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from)));\
+ return res;\
+}
+
+SYCL_PLOADT_RO(global_space)
+SYCL_PLOADT_RO(local_space)
+
+#undef SYCL_PLOADT_RO
+
+
+#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)\
+template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
+ pload##AlignedType(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
+ cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
+ return ploadt_ro<packet_type, Alignment>(from);\
+ }
+
+// global space
+SYCL_PLOAD(global_space, Unaligned, u)
+SYCL_PLOAD(global_space, Aligned, )
+
+// local space
+SYCL_PLOAD(local_space, Unaligned, u)
+SYCL_PLOAD(local_space, Aligned, )
+
+// private space
+//SYCL_PLOAD(private_space, Unaligned, u)
+//SYCL_PLOAD(private_space, Aligned, )
+
+#undef SYCL_PLOAD
+
+
+/** \internal \returns a packet version of \a *from.
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+#define SYCL_PLOADT(address_space_target)\
+template<typename packet_type, int Alignment>\
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(\
+ typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
+ cl::sycl::access::address_space::address_space_target>::pointer_t from)\
+{\
+ if(Alignment >= unpacket_traits<packet_type>::alignment)\
+ return pload<packet_type>(from);\
+ else\
+ return ploadu<packet_type>(from);\
+}
+
+// global space
+SYCL_PLOADT(global_space)
+// local space
+SYCL_PLOADT(local_space)
+
+//private_space
+// There is no need to specialise it for private space as it can use the GenericPacketMath version
+
+#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)\
+ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
+ ploadt_ro<packet_type, Alignment>(const typename unpacket_traits<packet_type>::type * from) { \
+ typedef typename unpacket_traits<packet_type>::type scalar;\
+ auto res=packet_type(static_cast<scalar>(0));\
+ res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
+ return res;\
+ }
+
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
+SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
+
+
+#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)\
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
+ pload##alignment_type(const typename unpacket_traits<packet_type>::type * from) { \
+ typedef typename unpacket_traits<packet_type>::type scalar;\
+ auto res=packet_type(static_cast<scalar>(0));\
+ res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
+ return res;\
+ }
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4,)
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2,)
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
+SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
+
+#undef SYCL_PLOAD_SPECIAL
+
+#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)\
+template<>\
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
+ typename cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target>::pointer_t to, \
+ const packet_type& from) {\
+ typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
+ from.store(0, multi_ptr(to));\
+}
+
+// global space
+SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
+
+SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
+
+SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
+
+
+#define SYCL_PSTORE_T(scalar, packet_type, Alignment)\
+template<>\
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret<scalar, packet_type, Alignment>(\
+ scalar* to,\
+ const packet_type& from) {\
+ if(Alignment)\
+ pstore(to, from);\
+ else\
+ pstoreu(to,from);\
+}
+
+
+SYCL_PSTORE_T(float, cl::sycl::cl_float4, Aligned)
+
+SYCL_PSTORE_T(float, cl::sycl::cl_float4, Unaligned)
+
+SYCL_PSTORE_T(double, cl::sycl::cl_double2, Aligned)
+
+SYCL_PSTORE_T(double, cl::sycl::cl_double2, Unaligned)
+
+
+#undef SYCL_PSTORE_T
+
+#define SYCL_PSET1(packet_type)\
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>(\
+ const typename unpacket_traits<packet_type>::type& from) {\
+ return packet_type(from);\
+}
+
+// global space
+SYCL_PSET1(cl::sycl::cl_float4)
+SYCL_PSET1(cl::sycl::cl_double2)
+
+#undef SYCL_PSET1
+
+
+template <typename packet_type> struct get_base_packet {
+template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer ) {}
+
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer , Index ) {}
+};
+
+template <> struct get_base_packet <cl::sycl::cl_float4> {
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
+ return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
+ }
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
+ return cl::sycl::cl_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+ }
+
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_float4& from, Index stride) {
+ auto tmp = stride;
+ to[0] = from.x();
+ to[tmp] = from.y();
+ to[tmp += stride] = from.z();
+ to[tmp += stride] = from.w();
+ }
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
+ return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a+1), static_cast<float>(a+2), static_cast<float>(a+3));
+ }
+};
+
+template <> struct get_base_packet <cl::sycl::cl_double2> {
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
+ return cl::sycl::cl_double2(from[0], from[0]);
+ }
+
+ template <typename sycl_multi_pointer, typename Index>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from, Index stride) {
+ return cl::sycl::cl_double2(from[0*stride], from[1*stride]);
+ }
+
+ template <typename sycl_multi_pointer>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_double2& from, Index stride) {
+ to[0] = from.x();
+ to[stride] = from.y();
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
+ return cl::sycl::cl_double2(static_cast<double>(a), static_cast<double>(a + 1));
+ }
+};
+
+#define SYCL_PLOAD_DUP(address_space_target)\
+template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
+ploaddup(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
+ cl::sycl::access::address_space::address_space_target>::pointer_t from)\
+{\
+ return get_base_packet<packet_type>::get_ploaddup(from); \
+}
+
+// global space
+SYCL_PLOAD_DUP(global_space)
+// local_space
+SYCL_PLOAD_DUP(local_space)
+// private_space
+//SYCL_PLOAD_DUP(private_space)
+#undef SYCL_PLOAD_DUP
+
+#define SYCL_PLOAD_DUP_SPECIALIZE(packet_type)\
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
+ploaddup<packet_type>(const typename unpacket_traits<packet_type>::type * from)\
+{ \
+ return get_base_packet<packet_type>::get_ploaddup(from); \
+}
+
+SYCL_PLOAD_DUP_SPECIALIZE(cl::sycl::cl_float4)
+SYCL_PLOAD_DUP_SPECIALIZE(cl::sycl::cl_double2)
+
+#undef SYCL_PLOAD_DUP_SPECIALIZE
+
+#define SYCL_PLSET(packet_type)\
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>(const typename unpacket_traits<packet_type>::type& a) {\
+ return get_base_packet<packet_type>::set_plset(a);\
+}
+
+SYCL_PLSET(cl::sycl::cl_float4)
+SYCL_PLSET(cl::sycl::cl_double2)
+
+#undef SYCL_PLSET
+
+
+#define SYCL_PGATHER(address_space_target)\
+template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline packet_type pgather(\
+ typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
+ cl::sycl::access::address_space::address_space_target>::pointer_t from, Index stride) {\
+ return get_base_packet<packet_type>::get_pgather(from, stride); \
+}
+
+// global space
+SYCL_PGATHER(global_space)
+// local space
+SYCL_PGATHER(local_space)
+// private space
+//SYCL_PGATHER(private_space)
+
+#undef SYCL_PGATHER
+
+
+#define SYCL_PGATHER_SPECIALIZE(scalar, packet_type)\
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
+pgather<scalar, packet_type>(const typename unpacket_traits<packet_type>::type * from, Index stride)\
+{ \
+ return get_base_packet<packet_type>::get_pgather(from, stride); \
+}
+
+SYCL_PGATHER_SPECIALIZE(float, cl::sycl::cl_float4)
+SYCL_PGATHER_SPECIALIZE(double, cl::sycl::cl_double2)
+
+#undef SYCL_PGATHER_SPECIALIZE
+
+#define SYCL_PSCATTER(address_space_target)\
+template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline void pscatter(\
+ typename cl::sycl::multi_ptr<typename unpacket_traits<packet_type>::type,\
+ cl::sycl::access::address_space::address_space_target>::pointer_t to,\
+ const packet_type& from, Index stride) {\
+ get_base_packet<packet_type>::set_pscatter(to, from, stride);\
+}
+
+// global space
+SYCL_PSCATTER(global_space)
+// local space
+SYCL_PSCATTER(local_space)
+// private space
+//SYCL_PSCATTER(private_space)
+
+#undef SYCL_PSCATTER
+
+
+
+#define SYCL_PSCATTER_SPECIALIZE(scalar, packet_type)\
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void \
+pscatter<scalar, packet_type>(typename unpacket_traits<packet_type>::type * to, const packet_type& from, Index stride)\
+{ \
+ get_base_packet<packet_type>::set_pscatter(to, from, stride);\
+}
+
+SYCL_PSCATTER_SPECIALIZE(float, cl::sycl::cl_float4)
+SYCL_PSCATTER_SPECIALIZE(double, cl::sycl::cl_double2)
+
+#undef SYCL_PSCATTER_SPECIALIZE
+
+
+#define SYCL_PMAD(packet_type)\
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( const packet_type& a,\
+ const packet_type& b, const packet_type& c){\
+ return cl::sycl::mad(a,b,c);\
+}
+
+SYCL_PMAD(cl::sycl::cl_float4)
+SYCL_PMAD(cl::sycl::cl_double2)
+#undef SYCL_PMAD
+
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return a.x();
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+ return a.x();
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return a.x() + a.y() + a.z() + a.w();
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+ return a.x() + a.y();
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w()));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+ return cl::sycl::fmax(a.x(), a.y());
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w()));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+ return cl::sycl::fmin(a.x(), a.y());
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return a.x() * a.y() * a.z() * a.w();
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+ return a.x() * a.y();
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+ return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
+}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
+ptranspose(PacketBlock<cl::sycl::cl_float4,4>& kernel) {
+ float tmp = kernel.packet[0].y();
+ kernel.packet[0].y() = kernel.packet[1].x();
+ kernel.packet[1].x() = tmp;
+// std::swap(kernel.packet[0].y(), kernel.packet[1].x());
+
+ tmp = kernel.packet[0].z();
+ kernel.packet[0].z() = kernel.packet[2].x();
+ kernel.packet[2].x() = tmp;
+ //std::swap(kernel.packet[0].z(), kernel.packet[2].x());
+
+ tmp = kernel.packet[0].w();
+ kernel.packet[0].w() = kernel.packet[3].x();
+ kernel.packet[3].x() = tmp;
+
+ //std::swap(kernel.packet[0].w(), kernel.packet[3].x());
+
+ tmp = kernel.packet[1].z();
+ kernel.packet[1].z() = kernel.packet[2].y();
+ kernel.packet[2].y() = tmp;
+// std::swap(kernel.packet[1].z(), kernel.packet[2].y());
+
+ tmp = kernel.packet[1].w();
+ kernel.packet[1].w() = kernel.packet[3].y();
+ kernel.packet[3].y() = tmp;
+// std::swap(kernel.packet[1].w(), kernel.packet[3].y());
+
+ tmp = kernel.packet[2].w();
+ kernel.packet[2].w() = kernel.packet[3].z();
+ kernel.packet[3].z() = tmp;
+// std::swap(kernel.packet[2].w(), kernel.packet[3].z());
+
+}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
+ptranspose(PacketBlock<cl::sycl::cl_double2,2>& kernel) {
+ double tmp = kernel.packet[0].y();
+ kernel.packet[0].y() = kernel.packet[1].x();
+ kernel.packet[1].x() = tmp;
+//std::swap(kernel.packet[0].y(), kernel.packet[1].x());
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
+pblend(const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
+ const cl::sycl::cl_float4& thenPacket, const cl::sycl::cl_float4& elsePacket) {
+ cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1,
+ ifPacket.select[1] ? 0 : -1,
+ ifPacket.select[2] ? 0 : -1,
+ ifPacket.select[3] ? 0 : -1);
+ return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
+template<> inline cl::sycl::cl_double2
+pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
+ const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
+ cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
+ ifPacket.select[1] ? 0 : -1);
+ return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_SYCL
+#endif // EIGEN_PACKET_MATH_SYCL_H
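
A small usage sketch for the cl_double2 ptranspose defined above (hypothetical values and function name; assumes a SYCL implementation, EIGEN_USE_SYCL and the Eigen headers are available): the two packets are treated as the rows of a 2x2 matrix and exchanged into columns.

#include <CL/sycl.hpp>

void transpose2x2_example() {
  Eigen::internal::PacketBlock<cl::sycl::cl_double2, 2> block;
  block.packet[0] = cl::sycl::cl_double2(1.0, 2.0);  // row 0: (1, 2)
  block.packet[1] = cl::sycl::cl_double2(3.0, 4.0);  // row 1: (3, 4)
  Eigen::internal::ptranspose(block);
  // block.packet[0] now holds (1, 3) and block.packet[1] holds (2, 4).
}
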
diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h
new file mode 100644
index 000000000..dedd5c84a
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h
@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TypeCasting.h
+ *
+ * \brief:
+ * TypeCasting
+ *
+*****************************************************************/
+
+#ifndef EIGEN_TYPE_CASTING_SYCL_H
+#define EIGEN_TYPE_CASTING_SYCL_H
+
+namespace Eigen {
+
+namespace internal {
+#ifdef __SYCL_DEVICE_ONLY__
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(const cl::sycl::cl_float4& a) {
+ return a.template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
+}
+
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(const cl::sycl::cl_int4& a) {
+ return a.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
+}
+
+template <>
+struct type_casting_traits<double, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 2,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) {
+ auto a1 = a.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
+ auto b1 = b.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
+ return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
+}
+
+template <>
+struct type_casting_traits<float, double> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 2
+ };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(const cl::sycl::cl_float4& a) {
+ // Simply discard the second half of the input
+ return cl::sycl::cl_double2(a.x(), a.y());
+}
+
+#endif
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SYCL_H
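
The SrcCoeffRatio/TgtCoeffRatio pairs above encode how many source packets feed one target packet: double-to-float consumes two cl_double2 per cl_float4 (SrcCoeffRatio = 2), while float-to-double keeps only the first two lanes of a cl_float4 (TgtCoeffRatio = 2), matching the two pcast overloads. A minimal scalar sketch of those lane mappings (the double2/float4 structs below are illustrative stand-ins, not SYCL types):

#include <cstdio>

struct double2 { double x, y; };
struct float4  { float x, y, z, w; };

// Two 2-lane double packets are needed to fill one 4-lane float packet.
static float4 cast_double2_to_float4(double2 a, double2 b) {
  return { float(a.x), float(a.y), float(b.x), float(b.y) };
}

// One 4-lane float packet yields one 2-lane double packet; the upper half is discarded.
static double2 cast_float4_to_double2(float4 a) {
  return { double(a.x), double(a.y) };
}

int main() {
  float4 f = cast_double2_to_float4({1.0, 2.0}, {3.0, 4.0});
  double2 d = cast_float4_to_double2(f);
  std::printf("%g %g %g %g | %g %g\n", f.x, f.y, f.z, f.w, d.x, d.y);
  return 0;
}
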
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index d39d2d105..95aba428f 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -15,6 +15,10 @@ namespace Eigen {
namespace internal {
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+#endif
+
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@@ -29,10 +33,14 @@ struct Packet2cf
{
EIGEN_STRONG_INLINE Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
union {
Packet4f v;
Packet1cd cd[2];
};
+#else
+ Packet4f v;
+#endif
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
@@ -89,63 +97,27 @@ template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type
/* Forward declaration */
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+/* complex<double> first */
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
- Packet2cf res;
- res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
- res.cd[1] = res.cd[0];
- return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
- std::complex<float> EIGEN_ALIGN16 af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- return pload<Packet2cf>(af);
-}
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
{
return pload<Packet1cd>(from);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
- std::complex<float> EIGEN_ALIGN16 af[2];
- pstore<std::complex<float> >((std::complex<float> *) af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
-}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
{
pstore<std::complex<double> >(to, from);
}
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
- Packet2cf res;
- res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
- res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
- return res;
-}
-
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
Packet2d a_re, a_im, v1, v2;
@@ -163,45 +135,182 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
return Packet1cd(v1 + v2);
}
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
+{
+ std::complex<double> EIGEN_ALIGN16 res;
+ pstore<std::complex<double> >(&res, a);
+
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+ return pfirst(a);
+}
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+ return vecs[0];
+}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+ return pfirst(a);
+}
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+ static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+ {
+    // FIXME: is it certain that we never have to align a Packet1cd?
+    // Even though a std::complex<double> occupies 16 bytes, it is not necessarily aligned on a 16-byte boundary...
+ }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO: optimize this for ZVector
+ Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+ Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
+ return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+ return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+ Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+ kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+ kernel.packet[0].v = tmp;
+}
+
+/* complex<float> follows */
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
+{
+ std::complex<float> EIGEN_ALIGN16 res[2];
+ pstore<std::complex<float> >(res, a);
+
+ return res[0];
+}
+
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
Packet2cf res;
- res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
- res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+ res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+ res.cd[1] = res.cd[0];
return res;
}
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+ Packet2cf res;
+ if((std::ptrdiff_t(&from) % 16) == 0)
+ res.v = pload<Packet4f>((const float *)&from);
+ else
+ res.v = ploadu<Packet4f>((const float *)&from);
+ res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+ return res;
+}
+#endif
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+ std::complex<float> EIGEN_ALIGN16 af[2];
+ af[0] = from[0*stride];
+ af[1] = from[1*stride];
+ return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+ std::complex<float> EIGEN_ALIGN16 af[2];
+ pstore<std::complex<float> >((std::complex<float> *) af, from);
+ to[0*stride] = af[0];
+ to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
- std::complex<double> EIGEN_ALIGN16 res;
- pstore<std::complex<double> >(&res, a);
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+ Packet2cf res;
+ res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+ res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
return res;
}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
- std::complex<float> EIGEN_ALIGN16 res[2];
- pstore<std::complex<float> >(res, a);
- return res[0];
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+ Packet2cf res;
+ res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+ res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+ return res;
}
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
{
Packet2cf res;
@@ -210,10 +319,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
return res;
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
std::complex<float> res;
@@ -222,10 +327,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
return res;
}
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
- return vecs[0];
-}
template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
PacketBlock<Packet2cf,2> transpose;
@@ -236,10 +337,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vec
return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]);
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
{
std::complex<float> res;
@@ -249,16 +346,6 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
}
template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
- static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
- {
- // FIXME is it sure we never have to align a Packet1cd?
- // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
- }
-};
-
-template<int Offset>
struct palign_impl<Offset,Packet2cf>
{
static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
@@ -270,39 +357,143 @@ struct palign_impl<Offset,Packet2cf>
}
};
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return internal::pmul(a, pconj(b));
}
};
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return internal::pmul(pconj(a), b);
}
};
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(pmul(x,y),c); }
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
{
return pconj(internal::pmul(a, b));
}
};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO: optimize this for ZVector
+ Packet2cf res;
+ res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
+ res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
+ return res;
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+ Packet2cf res;
+ res.cd[0] = pcplxflip(x.cd[0]);
+ res.cd[1] = pcplxflip(x.cd[1]);
+ return res;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+ Packet1cd tmp = kernel.packet[0].cd[1];
+ kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
+ kernel.packet[1].cd[0] = tmp;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+ Packet2cf result;
+ const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+ result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+ return result;
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+ Packet4f a_re, a_im, prod, prod_im;
+
+  // Get the real parts of a; they are multiplied by b below
+ a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+
+ // Get the imaginary parts of a
+ a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+
+ // multiply a_im * b and get the conjugate result
+ prod_im = a_im * b.v;
+ prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR));
+ // permute back to a proper order
+ prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
+
+ // multiply a_re * b, add prod_im
+ prod = pmadd<Packet4f>(a_re, b.v, prod_im);
+
+ return Packet2cf(prod);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+ Packet4f rev_a;
+ rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
+ return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+ Packet4f b;
+ b = vec_sld(a.v, a.v, 8);
+ b = padd<Packet4f>(a.v, b);
+ return pfirst<Packet2cf>(Packet2cf(b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+ Packet4f b1, b2;
+ b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
+ b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
+ b2 = vec_sld(b2, b2, 8);
+ b2 = padd<Packet4f>(b1, b2);
+
+ return Packet2cf(b2);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+ Packet4f b;
+ Packet2cf prod;
+ b = vec_sld(a.v, a.v, 8);
+ prod = pmul<Packet2cf>(a, Packet2cf(b));
+
+ return pfirst<Packet2cf>(prod);
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+ static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+ {
+ if (Offset==1)
+ {
+ first.v = vec_sld(first.v, second.v, 8);
+ }
+ }
+};
+
template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
@@ -336,56 +527,34 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- // TODO optimize it for AltiVec
- Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
- Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
- return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
-}
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for AltiVec
- Packet2cf res;
- res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
- res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
- return res;
+ Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
+ Packet4f s = pmul<Packet4f>(b.v, b.v);
+ return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
}
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
{
- return Packet1cd(preverse(Packet2d(x.v)));
+ return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
}
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
-{
- Packet2cf res;
- res.cd[0] = pcplxflip(x.cd[0]);
- res.cd[1] = pcplxflip(x.cd[1]);
- return res;
-}
-
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
- Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+ Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
kernel.packet[0].v = tmp;
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
- Packet1cd tmp = kernel.packet[0].cd[1];
- kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
- kernel.packet[1].cd[0] = tmp;
-}
-
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
Packet2cf result;
- const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
- result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+ result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
return result;
}
+#endif
} // end namespace internal
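
Both pdiv specializations above rely on the usual complex-division identity a/b = a*conj(b) / (re(b)^2 + im(b)^2): the numerator comes from conj_helper<...,false,true>::pmul(a,b) and the denominator from b*b followed by a permute that adds the squared real and imaginary parts lane-wise. A minimal scalar sketch of that computation (plain std::complex, for reference only; not part of Eigen):

#include <complex>
#include <cstdio>

static std::complex<double> pdiv_scalar(std::complex<double> a, std::complex<double> b) {
  std::complex<double> num = a * std::conj(b);           // conj_helper pmul(a, pconj(b))
  double s = b.real() * b.real() + b.imag() * b.imag();  // s + vec_perm(s, s, REVERSE)
  return std::complex<double>(num.real() / s, num.imag() / s);  // lane-wise pdiv
}

int main() {
  std::complex<double> a(1.0, 2.0), b(3.0, -4.0);
  std::complex<double> r = pdiv_scalar(a, b), ref = a / b;
  std::printf("(%g, %g) vs (%g, %g)\n", r.real(), r.imag(), ref.real(), ref.imag());
  return 0;
}
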
diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 5c7aa7256..ff33a975f 100644
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -20,6 +20,50 @@ namespace Eigen {
namespace internal {
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non-denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
+
+/* natural logarithm computed for 4 simultaneous floats;
+   returns NaN for x <= 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+#endif
+
static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
@@ -93,40 +137,91 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& x)
+Packet4f pexp<Packet4f>(const Packet4f& _x)
{
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+/*
+ Packet4f x = _x;
+
+ Packet4f tmp, fx;
+ Packet4i emm0;
+
+ // clamp x
+ x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+ // express exp(x) as exp(g + n*log(2))
+ fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+ fx = pfloor(fx);
+
+ tmp = pmul(fx, p4f_cephes_exp_C1);
+ Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+ x = psub(x, tmp);
+ x = psub(x, z);
+
+ z = pmul(x,x);
+
+ Packet4f y = p4f_cephes_exp_p0;
+ y = pmadd(y, x, p4f_cephes_exp_p1);
+ y = pmadd(y, x, p4f_cephes_exp_p2);
+ y = pmadd(y, x, p4f_cephes_exp_p3);
+ y = pmadd(y, x, p4f_cephes_exp_p4);
+ y = pmadd(y, x, p4f_cephes_exp_p5);
+ y = pmadd(y, z, x);
+ y = padd(y, p4f_1);
+
+ // build 2^n
+ emm0 = vec_cts(fx, 0);
+ emm0 = emm0 + p4i_0x7f;
+ emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
+
+  // Altivec's max & min operators silently drop NaNs. Check for NaNs in
+  // the inputs and return them unmodified.
+ Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
+ return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
+ isnumber_mask);*/
+ return _x;
+#else
Packet4f res;
- res.v4f[0] = pexp<Packet2d>(x.v4f[0]);
- res.v4f[1] = pexp<Packet2d>(x.v4f[1]);
+ res.v4f[0] = pexp<Packet2d>(_x.v4f[0]);
+ res.v4f[1] = pexp<Packet2d>(_x.v4f[1]);
return res;
+#endif
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d psqrt<Packet2d>(const Packet2d& x)
{
- return __builtin_s390_vfsqdb(x);
+ return vec_sqrt(x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psqrt<Packet4f>(const Packet4f& x)
{
Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+ res = vec_sqrt(x);
+#else
res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+#endif
return res;
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d prsqrt<Packet2d>(const Packet2d& x) {
- // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& x) {
Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+ res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
+#else
res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+#endif
return res;
}
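
The commented-out pexp<Packet4f> body above follows the classic Cephes recipe: clamp x, split it as x = n*ln(2) + g with n = round(x*log2(e)), evaluate a degree-5 polynomial for exp(g), and scale the result by 2^n. A minimal scalar sketch using the same constants (plain C++, for reference only; the helper name cephes_expf is made up):

#include <cmath>
#include <cstdio>

static float cephes_expf(float x) {
  const float exp_hi =  88.3762626647950f;
  const float exp_lo = -88.3762626647949f;
  const float LOG2EF =   1.44269504088896341f;
  const float C1 = 0.693359375f;       // ln(2) split into two terms
  const float C2 = -2.12194440e-4f;    // for extra precision in the reduction
  const float p0 = 1.9875691500E-4f, p1 = 1.3981999507E-3f, p2 = 8.3334519073E-3f,
              p3 = 4.1665795894E-2f, p4 = 1.6666665459E-1f, p5 = 5.0000001201E-1f;

  x = std::fmax(std::fmin(x, exp_hi), exp_lo);  // clamp, as the vector code does
  float fx = std::floor(x * LOG2EF + 0.5f);     // n = round(x / ln 2)
  x -= fx * C1;                                 // g = x - n*ln 2, in two steps
  x -= fx * C2;
  float z = x * x;
  float y = p0;                                 // Horner evaluation of the polynomial
  y = y * x + p1; y = y * x + p2; y = y * x + p3;
  y = y * x + p4; y = y * x + p5;
  y = y * z + x + 1.0f;                         // exp(g) ~= 1 + g + g^2*poly(g)
  return std::ldexp(y, static_cast<int>(fx));   // scale by 2^n
}

int main() {
  std::printf("%g vs %g\n", cephes_expf(1.0f), std::exp(1.0f));
  return 0;
}
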
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 57b01fc63..0b37f4992 100755
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -17,7 +17,7 @@ namespace Eigen {
namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
@@ -29,7 +29,7 @@ namespace internal {
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
typedef __vector int Packet4i;
@@ -41,9 +41,14 @@ typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;
+// Z14 has builtin support for float vectors
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+typedef __vector float Packet4f;
+#else
typedef struct {
Packet2d v4f[2];
} Packet4f;
+#endif
typedef union {
int32_t i[4];
@@ -51,11 +56,15 @@ typedef union {
int64_t l[2];
uint64_t ul[2];
double d[2];
+ float f[4];
Packet4i v4i;
Packet4ui v4ui;
Packet2l v2l;
Packet2ul v2ul;
Packet2d v2d;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+ Packet4f v4f;
+#endif
} Packet;
// We don't want to write the same code all the time, but we need to reuse the constants
@@ -80,7 +89,7 @@ typedef union {
Packet2l p2l_##NAME = pset1<Packet2l>(X)
// These constants are endian-agnostic
-//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
@@ -90,6 +99,21 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
+ Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+ Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+ const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
+#endif
+
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
@@ -120,9 +144,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0
static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
-//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
-//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -169,7 +193,11 @@ template<> struct packet_traits<float> : default_packet_traits
HasSin = 0,
HasCos = 0,
HasLog = 0,
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+ HasExp = 0,
+#else
HasExp = 1,
+#endif
HasSqrt = 1,
HasRsqrt = 1,
HasRound = 1,
@@ -258,31 +286,16 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
return s;
}
-/* Helper function to simulate a vec_splat_packet4f
- */
-template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
- Packet4f splat;
- switch (element) {
- case 0:
- splat.v4f[0] = vec_splat(from.v4f[0], 0);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 1:
- splat.v4f[0] = vec_splat(from.v4f[0], 1);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 2:
- splat.v4f[0] = vec_splat(from.v4f[1], 0);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 3:
- splat.v4f[0] = vec_splat(from.v4f[1], 1);
- splat.v4f[1] = splat.v4f[0];
- break;
- }
- return splat;
+ Packet vt;
+ vt.v4f = v;
+ s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
+ return s;
}
+#endif
+
template<int Offset>
struct palign_impl<Offset,Packet4i>
@@ -300,31 +313,6 @@ struct palign_impl<Offset,Packet4i>
}
};
-/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double
- */
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
- {
- switch (Offset % 4) {
- case 1:
- first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
- first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
- break;
- case 2:
- first.v4f[0] = first.v4f[1];
- first.v4f[1] = second.v4f[0];
- break;
- case 3:
- first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
- first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
- break;
- }
- }
-};
-
-
template<int Offset>
struct palign_impl<Offset,Packet2d>
{
@@ -344,16 +332,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
return vfrom->v4i;
}
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{
- // FIXME: No intrinsic yet
- EIGEN_DEBUG_ALIGNED_LOAD
- Packet4f vfrom;
- vfrom.v4f[0] = vec_ld2f(&from[0]);
- vfrom.v4f[1] = vec_ld2f(&from[2]);
- return vfrom;
-}
-
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
{
// FIXME: No intrinsic yet
@@ -372,15 +350,6 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
vto->v4i = from;
}
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{
- // FIXME: No intrinsic yet
- EIGEN_DEBUG_ALIGNED_STORE
- vec_st2f(from.v4f[0], &to[0]);
- vec_st2f(from.v4f[1], &to[2]);
-}
-
-
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
// FIXME: No intrinsic yet
@@ -397,13 +366,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
return vec_splats(from);
}
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
-{
- Packet4f to;
- to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
- to.v4f[1] = to.v4f[0];
- return to;
-}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
@@ -417,17 +379,6 @@ pbroadcast4<Packet4i>(const int *a,
}
template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
- a3 = pload<Packet4f>(a);
- a0 = vec_splat_packet4f<0>(a3);
- a1 = vec_splat_packet4f<1>(a3);
- a2 = vec_splat_packet4f<2>(a3);
- a3 = vec_splat_packet4f<3>(a3);
-}
-
-template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
@@ -449,16 +400,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
return pload<Packet4i>(ai);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
- float EIGEN_ALIGN16 ai[4];
- ai[0] = from[0*stride];
- ai[1] = from[1*stride];
- ai[2] = from[2*stride];
- ai[3] = from[3*stride];
- return pload<Packet4f>(ai);
-}
-
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
double EIGEN_ALIGN16 af[2];
@@ -477,16 +418,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
to[3*stride] = ai[3];
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
- float EIGEN_ALIGN16 ai[4];
- pstore<float>((float *)ai, from);
- to[0*stride] = ai[0];
- to[1*stride] = ai[1];
- to[2*stride] = ai[2];
- to[3*stride] = ai[3];
-}
-
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
double EIGEN_ALIGN16 af[2];
@@ -496,160 +427,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to,
}
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f c;
- c.v4f[0] = a.v4f[0] + b.v4f[0];
- c.v4f[1] = a.v4f[1] + b.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f c;
- c.v4f[0] = a.v4f[0] - b.v4f[0];
- c.v4f[1] = a.v4f[1] - b.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f c;
- c.v4f[0] = a.v4f[0] * b.v4f[0];
- c.v4f[1] = a.v4f[1] * b.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f c;
- c.v4f[0] = a.v4f[0] / b.v4f[0];
- c.v4f[1] = a.v4f[1] / b.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
- Packet4f c;
- c.v4f[0] = -a.v4f[0];
- c.v4f[1] = -a.v4f[1];
- return c;
-}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{
- Packet4f res;
- res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
- res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- Packet4f res;
- res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
- res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
- return res;
-}
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
- res.v4f[0] = vec_round(a.v4f[0]);
- res.v4f[1] = vec_round(a.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
- res.v4f[0] = vec_ceil(a.v4f[0]);
- res.v4f[1] = vec_ceil(a.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
- res.v4f[0] = vec_floor(a.v4f[0]);
- res.v4f[1] = vec_floor(a.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
@@ -659,14 +482,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
- Packet4f p = pload<Packet4f>(from);
- p.v4f[1] = vec_splat(p.v4f[0], 1);
- p.v4f[0] = vec_splat(p.v4f[0], 0);
- return p;
-}
-
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
Packet2d p = pload<Packet2d>(from);
@@ -674,15 +489,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
}
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
@@ -695,23 +507,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
}
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
- Packet4f rev;
- rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
- rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
- return rev;
-}
-
template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
- res.v4f[0] = pabs(a.v4f[0]);
- res.v4f[1] = pabs(a.v4f[1]);
- return res;
-}
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
@@ -730,13 +527,6 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
sum = padd<Packet2d>(a, b);
return pfirst(sum);
}
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet2d sum;
- sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
- double first = predux<Packet2d>(sum);
- return static_cast<float>(first);
-}
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
@@ -777,21 +567,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
return sum;
}
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
- PacketBlock<Packet4f,4> transpose;
- transpose.packet[0] = vecs[0];
- transpose.packet[1] = vecs[1];
- transpose.packet[2] = vecs[2];
- transpose.packet[3] = vecs[3];
- ptranspose(transpose);
-
- Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
- sum = padd(sum, transpose.packet[2]);
- sum = padd(sum, transpose.packet[3]);
- return sum;
-}
-
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
@@ -806,12 +581,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
- // Return predux_mul<Packet2d> of the subvectors product
- return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
-}
-
// min
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
@@ -826,14 +595,6 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
- Packet2d b, res;
- b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
- res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
- return static_cast<float>(pfirst(res));
-}
-
// max
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
@@ -849,14 +610,6 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
- Packet2d b, res;
- b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
- res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
- return static_cast<float>(pfirst(res));
-}
-
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
@@ -877,6 +630,321 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
kernel.packet[1] = t1;
}
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+ Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+ return vec_sel(elsePacket, thenPacket, mask);
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+ Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+ Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+ return vec_sel(elsePacket, thenPacket, mask);
+}
+
+/* z13 has no vector float support, so we emulate it with double;
+   z14 has proper vector float support.
+*/
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+/* Helper function to simulate vec_splat on the emulated (two-Packet2d) Packet4f
+ */
+template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
+{
+ Packet4f splat;
+ switch (element) {
+ case 0:
+ splat.v4f[0] = vec_splat(from.v4f[0], 0);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 1:
+ splat.v4f[0] = vec_splat(from.v4f[0], 1);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 2:
+ splat.v4f[0] = vec_splat(from.v4f[1], 0);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 3:
+ splat.v4f[0] = vec_splat(from.v4f[1], 1);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ }
+ return splat;
+}
+
+/* This is a tricky one: we have to translate a float alignment offset into vector elements of sizeof(double)
+ */
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+ static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+ {
+ switch (Offset % 4) {
+ case 1:
+ first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);
+ first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);
+ break;
+ case 2:
+ first.v4f[0] = first.v4f[1];
+ first.v4f[1] = second.v4f[0];
+ break;
+ case 3:
+ first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8);
+ first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);
+ break;
+ }
+ }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+ // FIXME: No intrinsic yet
+ EIGEN_DEBUG_ALIGNED_LOAD
+ Packet4f vfrom;
+ vfrom.v4f[0] = vec_ld2f(&from[0]);
+ vfrom.v4f[1] = vec_ld2f(&from[2]);
+ return vfrom;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+ // FIXME: No intrinsic yet
+ EIGEN_DEBUG_ALIGNED_STORE
+ vec_st2f(from.v4f[0], &to[0]);
+ vec_st2f(from.v4f[1], &to[2]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
+{
+ Packet4f to;
+ to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+ to.v4f[1] = to.v4f[0];
+ return to;
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+ a3 = pload<Packet4f>(a);
+ a0 = vec_splat_packet4f<0>(a3);
+ a1 = vec_splat_packet4f<1>(a3);
+ a2 = vec_splat_packet4f<2>(a3);
+ a3 = vec_splat_packet4f<3>(a3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+ float EIGEN_ALIGN16 ai[4];
+ ai[0] = from[0*stride];
+ ai[1] = from[1*stride];
+ ai[2] = from[2*stride];
+ ai[3] = from[3*stride];
+ return pload<Packet4f>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+ float EIGEN_ALIGN16 ai[4];
+ pstore<float>((float *)ai, from);
+ to[0*stride] = ai[0];
+ to[1*stride] = ai[1];
+ to[2*stride] = ai[2];
+ to[3*stride] = ai[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f c;
+ c.v4f[0] = a.v4f[0] + b.v4f[0];
+ c.v4f[1] = a.v4f[1] + b.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f c;
+ c.v4f[0] = a.v4f[0] - b.v4f[0];
+ c.v4f[1] = a.v4f[1] - b.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f c;
+ c.v4f[0] = a.v4f[0] * b.v4f[0];
+ c.v4f[1] = a.v4f[1] * b.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f c;
+ c.v4f[0] = a.v4f[0] / b.v4f[0];
+ c.v4f[1] = a.v4f[1] / b.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+ Packet4f c;
+ c.v4f[0] = -a.v4f[0];
+ c.v4f[1] = -a.v4f[1];
+ return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+ Packet4f res;
+ res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+ res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ Packet4f res;
+ res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+ res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+ Packet4f res;
+ res.v4f[0] = vec_round(a.v4f[0]);
+ res.v4f[1] = vec_round(a.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+ Packet4f res;
+ res.v4f[0] = vec_ceil(a.v4f[0]);
+ res.v4f[1] = vec_ceil(a.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+ Packet4f res;
+ res.v4f[0] = vec_floor(a.v4f[0]);
+ res.v4f[1] = vec_floor(a.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+ Packet4f p = pload<Packet4f>(from);
+ p.v4f[1] = vec_splat(p.v4f[0], 1);
+ p.v4f[0] = vec_splat(p.v4f[0], 0);
+ return p;
+}
+
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+ Packet4f rev;
+ rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+ rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+ return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+ Packet4f res;
+ res.v4f[0] = pabs(a.v4f[0]);
+ res.v4f[1] = pabs(a.v4f[1]);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+ Packet2d sum;
+ sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+ double first = predux<Packet2d>(sum);
+ return static_cast<float>(first);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+ PacketBlock<Packet4f,4> transpose;
+ transpose.packet[0] = vecs[0];
+ transpose.packet[1] = vecs[1];
+ transpose.packet[2] = vecs[2];
+ transpose.packet[3] = vecs[3];
+ ptranspose(transpose);
+
+ Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
+ sum = padd(sum, transpose.packet[2]);
+ sum = padd(sum, transpose.packet[3]);
+ return sum;
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+ // Multiply the two Packet2d halves together and reduce the result with predux_mul<Packet2d>
+ return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+ Packet2d b, res;
+ b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+ res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+ return static_cast<float>(pfirst(res));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+ Packet2d b, res;
+ b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+ res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+ return static_cast<float>(pfirst(res));
+}
+
/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
*/
EIGEN_DEVICE_FUNC inline void
@@ -915,12 +983,6 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
kernel.packet[3].v4f[1] = t3.packet[1];
}
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
- Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
- return vec_sel(elsePacket, thenPacket, mask);
-}
-
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
@@ -931,13 +993,197 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
return result;
}
+#else
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+ static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+ {
+ switch (Offset % 4) {
+ case 1:
+ first = vec_sld(first, second, 4); break;
+ case 2:
+ first = vec_sld(first, second, 8); break;
+ case 3:
+ first = vec_sld(first, second, 12); break;
+ }
+ }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
+{
+ // FIXME: No intrinsic yet
+ EIGEN_DEBUG_ALIGNED_LOAD
+ Packet *vfrom;
+ vfrom = (Packet *) from;
+ return vfrom->v4f;
+}
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
- Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
- Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+ // FIXME: No intrinsic yet
+ EIGEN_DEBUG_ALIGNED_STORE
+ Packet *vto;
+ vto = (Packet *) to;
+ vto->v4f = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
+{
+ return vec_splats(from);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+ a3 = pload<Packet4f>(a);
+ a0 = vec_splat(a3, 0);
+ a1 = vec_splat(a3, 1);
+ a2 = vec_splat(a3, 2);
+ a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+ float EIGEN_ALIGN16 af[4];
+ af[0] = from[0*stride];
+ af[1] = from[1*stride];
+ af[2] = from[2*stride];
+ af[3] = from[3*stride];
+ return pload<Packet4f>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+ float EIGEN_ALIGN16 af[4];
+ pstore<float>((float*)af, from);
+ to[0*stride] = af[0];
+ to[1*stride] = af[1];
+ to[2*stride] = af[2];
+ to[3*stride] = af[3];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f> (const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f> (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f> (const Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f> (const Packet4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+ Packet4f p = pload<Packet4f>(from);
+ return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+ return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+ Packet4f b, sum;
+ b = vec_sld(a, a, 8);
+ sum = padd<Packet4f>(a, b);
+ b = vec_sld(sum, sum, 4);
+ sum = padd<Packet4f>(sum, b);
+ return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+ Packet4f v[4], sum[4];
+
+ // It's easier and faster to transpose then add as columns
+ // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+ // Do the transpose, first set of moves
+ v[0] = vec_mergeh(vecs[0], vecs[2]);
+ v[1] = vec_mergel(vecs[0], vecs[2]);
+ v[2] = vec_mergeh(vecs[1], vecs[3]);
+ v[3] = vec_mergel(vecs[1], vecs[3]);
+ // Get the resulting vectors
+ sum[0] = vec_mergeh(v[0], v[2]);
+ sum[1] = vec_mergel(v[0], v[2]);
+ sum[2] = vec_mergeh(v[1], v[3]);
+ sum[3] = vec_mergel(v[1], v[3]);
+
+ // Now do the summation:
+ // Lines 0+1
+ sum[0] = padd<Packet4f>(sum[0], sum[1]);
+ // Lines 2+3
+ sum[1] = padd<Packet4f>(sum[2], sum[3]);
+ // Add the results
+ sum[0] = padd<Packet4f>(sum[0], sum[1]);
+
+ return sum[0];
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+ Packet4f prod;
+ prod = pmul(a, vec_sld(a, a, 8));
+ return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+ Packet4f b, res;
+ b = pmin<Packet4f>(a, vec_sld(a, a, 8));
+ res = pmin<Packet4f>(b, vec_sld(b, b, 4));
+ return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+ Packet4f b, res;
+ b = pmax<Packet4f>(a, vec_sld(a, a, 8));
+ res = pmax<Packet4f>(b, vec_sld(b, b, 4));
+ return pfirst(res);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+ Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+ Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+ Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+ Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+ kernel.packet[0] = vec_mergeh(t0, t2);
+ kernel.packet[1] = vec_mergel(t0, t2);
+ kernel.packet[2] = vec_mergeh(t1, t3);
+ kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+ Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
return vec_sel(elsePacket, thenPacket, mask);
}
+#endif
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f> (const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+
} // end namespace internal
} // end namespace Eigen
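
To make the structure of the z13 path above easier to follow, here is a stand-alone sketch (illustrative only, with made-up names) of the emulation idea: a 4-float packet is stored as two 2-double halves, and every packet operation is simply applied to each half.

    // Hypothetical stand-alone model of the z13 "floats as pairs of doubles" layout.
    struct EmuPacket2d { double d[2]; };
    struct EmuPacket4f { EmuPacket2d v4f[2]; };  // four float lanes widened to 2x2 doubles

    inline EmuPacket2d emu_padd(const EmuPacket2d& a, const EmuPacket2d& b) {
      EmuPacket2d c;
      c.d[0] = a.d[0] + b.d[0];
      c.d[1] = a.d[1] + b.d[1];
      return c;
    }

    inline EmuPacket4f emu_padd(const EmuPacket4f& a, const EmuPacket4f& b) {
      EmuPacket4f c;
      c.v4f[0] = emu_padd(a.v4f[0], b.v4f[0]);  // lanes 0-1
      c.v4f[1] = emu_padd(a.v4f[1], b.v4f[1]);  // lanes 2-3
      return c;
    }
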
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 4153b877c..9765cc763 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
{
-#ifdef __CUDACC__
+#ifdef EIGEN_GPUCC
// FIXME is there some kind of cuda::swap?
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
#else
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 96747bac7..401d597d8 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -255,7 +255,7 @@ struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,Rh
/** \internal
- * \brief Template functor to compute the hypot of two scalars
+ * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
*
* \sa MatrixBase::stableNorm(), class Redux
*/
@@ -263,22 +263,15 @@ template<typename Scalar>
struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
{
EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-// typedef typename NumTraits<Scalar>::Real result_type;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
{
- EIGEN_USING_STD_MATH(sqrt)
- Scalar p, qp;
- if(_x>_y)
- {
- p = _x;
- qp = _y / p;
- }
- else
- {
- p = _y;
- qp = _x / p;
- }
- return p * sqrt(Scalar(1) + qp*qp);
+ // This functor is only used by hypotNorm, for which it is faster to first apply abs
+ // to all coefficients prior to the reduction through hypot.
+ // This way we avoid calling abs again on entries that are already positive and real,
+ // and it also lets us handle complex values seamlessly; otherwise we would have to
+ // handle both real and complex entries through the same functor...
+ return internal::positive_real_hypot(x,y);
}
};
template<typename Scalar>
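
The rewritten scalar_hypot_op above delegates to internal::positive_real_hypot, which lives in MathFunctionsImpl.h (see the diffstat). As an assumption about what that helper computes, the usual overflow-safe formulation for two positive real scalars looks roughly like this sketch (illustrative only, not the actual Eigen implementation):

    #include <cmath>

    // Overflow-safe hypot for positive, real inputs: scale by the larger value so the
    // intermediate square cannot overflow. Stand-alone sketch only.
    template <typename Scalar>
    Scalar positive_real_hypot_sketch(Scalar x, Scalar y) {
      Scalar p = (x < y) ? y : x;  // larger of the two
      Scalar q = (x < y) ? x : y;  // smaller of the two
      if (p == Scalar(0)) return Scalar(0);
      Scalar r = q / p;
      return p * std::sqrt(Scalar(1) + r * r);
    }
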
@@ -443,7 +436,7 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
- bind1st_op(const first_argument_type &val) : m_value(val) {}
+ EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
@@ -462,7 +455,7 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
- bind2nd_op(const second_argument_type &val) : m_value(val) {}
+ EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index 6a30466fb..b03be0269 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -44,16 +44,16 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
{
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
- m_interPacket(plset<Packet>(0)),
m_flip(numext::abs(high)<numext::abs(low))
{}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
+ typedef typename NumTraits<Scalar>::Real RealScalar;
if(m_flip)
- return (i==0)? m_low : (m_high - (m_size1-i)*m_step);
+ return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
else
- return (i==m_size1)? m_high : (m_low + i*m_step);
+ return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
}
template<typename IndexType>
@@ -63,7 +63,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
if(m_flip)
{
- Packet pi = padd(pset1<Packet>(Scalar(i-m_size1)),m_interPacket);
+ Packet pi = plset<Packet>(Scalar(i-m_size1));
Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
if(i==0)
res = pinsertfirst(res, m_low);
@@ -71,7 +71,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
}
else
{
- Packet pi = padd(pset1<Packet>(Scalar(i)),m_interPacket);
+ Packet pi = plset<Packet>(Scalar(i));
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if(i==m_size1-unpacket_traits<Packet>::size+1)
res = pinsertlast(res, m_high);
@@ -83,7 +83,6 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
const Scalar m_high;
const Index m_size1;
const Scalar m_step;
- const Packet m_interPacket;
const bool m_flip;
};
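
The logic of the non-integer linspaced functor above, including the exact endpoint pinning and the "flip" variant that walks down from high when |high| < |low|, can be summarized by this stand-alone sketch (names are illustrative only):

    #include <cmath>
    #include <vector>

    template <typename Scalar>
    std::vector<Scalar> linspaced_sketch(Scalar low, Scalar high, int num_steps) {
      std::vector<Scalar> out(num_steps);
      if (num_steps == 1) { out[0] = low; return out; }
      const int size1 = num_steps - 1;
      const Scalar step = (high - low) / Scalar(size1);
      const bool flip = std::abs(high) < std::abs(low);
      for (int i = 0; i < num_steps; ++i) {
        if (flip) out[i] = (i == 0) ? low : high - Scalar(size1 - i) * step;  // walk down from high
        else      out[i] = (i == size1) ? high : low + Scalar(i) * step;      // walk up from low
      }
      return out;
    }
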
diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h
index 6df3fa501..9c1d75850 100644
--- a/Eigen/src/Core/functors/StlFunctors.h
+++ b/Eigen/src/Core/functors/StlFunctors.h
@@ -83,13 +83,17 @@ struct functor_traits<std::binder1st<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
#endif
+#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910)
+// std::unary_negate is deprecated since C++17 and will be removed in C++20
template<typename T>
struct functor_traits<std::unary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+// std::binary_negate is deprecated since C++17 and will be removed in C++20
template<typename T>
struct functor_traits<std::binary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+#endif
#ifdef EIGEN_STDEXT_SUPPORT
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index bfc046556..c1cc2ab3b 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -701,7 +701,7 @@ template<typename Scalar> struct scalar_isnan_op {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
#if defined(__SYCL_DEVICE_ONLY__)
return numext::isnan(a);
-#else
+#else
return (numext::isnan)(a);
#endif
}
@@ -815,7 +815,7 @@ struct scalar_sign_op<Scalar,true> {
template<typename Scalar>
struct functor_traits<scalar_sign_op<Scalar> >
{ enum {
- Cost =
+ Cost =
NumTraits<Scalar>::IsComplex
? ( 8*NumTraits<Scalar>::MulCost ) // roughly
: ( 3*NumTraits<Scalar>::AddCost),
@@ -823,6 +823,34 @@ struct functor_traits<scalar_sign_op<Scalar> >
};
};
+/** \internal
+ * \brief Template functor to compute the logistic function of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::logistic()
+ */
+template <typename T>
+struct scalar_logistic_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+ const T one = T(1);
+ return one / (one + numext::exp(-x));
+ }
+
+ template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(const Packet& x) const {
+ const Packet one = pset1<Packet>(T(1));
+ return pdiv(one, padd(one, pexp(pnegate(x))));
+ }
+};
+template <typename T>
+struct functor_traits<scalar_logistic_op<T> > {
+ enum {
+ Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
+ PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+ packet_traits<T>::HasNegate && packet_traits<T>::HasExp
+ };
+};
+
+
} // end namespace internal
} // end namespace Eigen
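
A small usage sketch for the new logistic functor, assuming an Eigen build that contains this revision; it relies only on the functor itself through unaryExpr, ArrayBase::logistic() being merely referenced in the \sa tag above:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(5, -2.f, 2.f);
      // Apply 1 / (1 + exp(-x)) element-wise via the new functor.
      Eigen::ArrayXf y = x.unaryExpr(Eigen::internal::scalar_logistic_op<float>());
      std::cout << y.transpose() << "\n";  // values in (0, 1), 0.5 at x = 0
      return 0;
    }
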
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 45230bce5..b012691c1 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -390,6 +390,7 @@ public:
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+ typedef LhsPacket LhsPacket4Packing;
typedef ResPacket AccPacket;
@@ -496,6 +497,7 @@ public:
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+ typedef LhsPacket LhsPacket4Packing;
typedef ResPacket AccPacket;
@@ -580,7 +582,7 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
}
template<typename Packet>
-const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
+const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet> &a)
{
return a;
}
@@ -626,6 +628,7 @@ public:
typedef typename packet_traits<Scalar>::type ScalarPacket;
typedef DoublePacket<RealPacket> DoublePacketType;
+ typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
@@ -777,6 +780,7 @@ public:
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+ typedef LhsPacket LhsPacket4Packing;
typedef ResPacket AccPacket;
@@ -972,7 +976,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
internal::prefetch(blA+(3*K+16)*LhsProgress); \
- if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
@@ -1025,9 +1029,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
@@ -1035,9 +1039,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);
- R0 = r1.loadPacket(0 * Traits::ResPacketSize);
- R1 = r1.loadPacket(1 * Traits::ResPacketSize);
- R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C1, alphav, R0);
traits.acc(C5, alphav, R1);
traits.acc(C9, alphav, R2);
@@ -1045,9 +1049,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
r1.storePacket(1 * Traits::ResPacketSize, R1);
r1.storePacket(2 * Traits::ResPacketSize, R2);
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
- R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C10, alphav, R2);
@@ -1055,9 +1059,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
r2.storePacket(1 * Traits::ResPacketSize, R1);
r2.storePacket(2 * Traits::ResPacketSize, R2);
- R0 = r3.loadPacket(0 * Traits::ResPacketSize);
- R1 = r3.loadPacket(1 * Traits::ResPacketSize);
- R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C3, alphav, R0);
traits.acc(C7, alphav, R1);
traits.acc(C11, alphav, R2);
@@ -1134,9 +1138,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
- R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
@@ -1244,10 +1248,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1, R2, R3;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
- R2 = r1.loadPacket(0 * Traits::ResPacketSize);
- R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C1, alphav, R2);
@@ -1257,10 +1261,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
r1.storePacket(0 * Traits::ResPacketSize, R2);
r1.storePacket(1 * Traits::ResPacketSize, R3);
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
- R1 = r2.loadPacket(1 * Traits::ResPacketSize);
- R2 = r3.loadPacket(0 * Traits::ResPacketSize);
- R3 = r3.loadPacket(1 * Traits::ResPacketSize);
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+ R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C3, alphav, R2);
@@ -1337,8 +1341,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
r0.storePacket(0 * Traits::ResPacketSize, R0);
@@ -1431,15 +1435,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
- R1 = r1.loadPacket(0 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(0 * Traits::ResPacketSize, R1);
- R0 = r2.loadPacket(0 * Traits::ResPacketSize);
- R1 = r3.loadPacket(0 * Traits::ResPacketSize);
+ R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+ R1 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
r2.storePacket(0 * Traits::ResPacketSize, R0);
@@ -1504,7 +1508,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0;
ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+ R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
r0.storePacket(0 * Traits::ResPacketSize, R0);
}
@@ -1523,13 +1527,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
- // The following piece of code wont work for 512 bit registers
+ // The following piece of code won't work for 512 bit registers
// Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
// as nr (which is currently 4) for the return type.
- typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
+ const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
if ((SwappedTraits::LhsProgress % 4) == 0 &&
(SwappedTraits::LhsProgress <= 8) &&
- (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
+ (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
{
SAccPacket C0, C1, C2, C3;
straits.initAcc(C0);
@@ -1596,13 +1600,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
SRhsPacketHalf b0;
straits.loadLhsUnaligned(blB, a0);
straits.loadRhs(blA, b0);
- SAccPacketHalf c0 = predux_downto4(C0);
+ SAccPacketHalf c0 = predux_half_dowto4(C0);
straits.madd(a0,b0,c0,b0);
straits.acc(c0, alphav, R);
}
else
{
- straits.acc(predux_downto4(C0), alphav, R);
+ straits.acc(predux_half_dowto4(C0), alphav, R);
}
res.scatterPacket(i, j2, R);
}
@@ -1685,19 +1689,18 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
//
// 32 33 34 35 ...
// 36 37 38 39 ...
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
- typedef typename packet_traits<Scalar>::type Packet;
- enum { PacketSize = packet_traits<Scalar>::size };
+ enum { PacketSize = unpacket_traits<Packet>::size };
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
EIGEN_UNUSED_VARIABLE(stride);
@@ -1725,9 +1728,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
for(Index k=0; k<depth; k++)
{
Packet A, B, C;
- A = lhs.loadPacket(i+0*PacketSize, k);
- B = lhs.loadPacket(i+1*PacketSize, k);
- C = lhs.loadPacket(i+2*PacketSize, k);
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+ B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+ C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1745,8 +1748,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
for(Index k=0; k<depth; k++)
{
Packet A, B;
- A = lhs.loadPacket(i+0*PacketSize, k);
- B = lhs.loadPacket(i+1*PacketSize, k);
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+ B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
}
@@ -1763,7 +1766,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
for(Index k=0; k<depth; k++)
{
Packet A;
- A = lhs.loadPacket(i+0*PacketSize, k);
+ A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
pstore(blockA+count, cj.pconj(A));
count+=PacketSize;
}
@@ -1793,19 +1796,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
}
}
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
- typedef typename packet_traits<Scalar>::type Packet;
- enum { PacketSize = packet_traits<Scalar>::size };
+ enum { PacketSize = unpacket_traits<Packet>::size };
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
EIGEN_UNUSED_VARIABLE(stride);
@@ -1837,7 +1839,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
for (Index m = 0; m < pack; m += PacketSize)
{
PacketBlock<Packet> kernel;
- for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
+ for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
ptranspose(kernel);
for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
}
@@ -1924,7 +1926,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
// Index k=0;
-// if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
+// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
// {
// for(; k<peeled_k; k+=PacketSize) {
// PacketBlock<Packet> kernel;
@@ -1971,10 +1973,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
{
for(; k<peeled_k; k+=PacketSize) {
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
- kernel.packet[0] = dm0.loadPacket(k);
- kernel.packet[1%PacketSize] = dm1.loadPacket(k);
- kernel.packet[2%PacketSize] = dm2.loadPacket(k);
- kernel.packet[3%PacketSize] = dm3.loadPacket(k);
+ kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
+ kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
+ kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
+ kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
ptranspose(kernel);
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
@@ -2075,7 +2077,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Co
for(Index k=0; k<depth; k++)
{
if (PacketSize==4) {
- Packet A = rhs.loadPacket(k, j2);
+ Packet A = rhs.template loadPacket<Packet>(k, j2);
pstoreu(blockB+count, cj.pconj(A));
count += PacketSize;
} else {
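
The mechanical `loadPacket` -> `template loadPacket<Packet>` rewrites throughout this file are the standard C++ spelling for invoking a member template through a dependent type: without the `template` keyword the `<` would be parsed as a less-than operator. A minimal illustration with a made-up Mapper type:

    // Mapper and use() are hypothetical; only the ".template" syntax point matters here.
    struct Mapper {
      template <typename Packet> Packet loadPacket(int i) const { return Packet(i); }
    };

    template <typename MapperT>
    int use(const MapperT& m) {
      return m.template loadPacket<int>(3);  // "template" required: MapperT is dependent
    }

    int main() { Mapper m; return use(m) == 3 ? 0 : 1; }
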
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 6440e1d09..f49abcad5 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -75,7 +75,7 @@ static void run(Index rows, Index cols, Index depth,
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
- gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
@@ -108,7 +108,7 @@ static void run(Index rows, Index cols, Index depth,
// i.e., we test that info[tid].users equals 0.
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {}
- info[tid].users += threads;
+ info[tid].users = threads;
pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
@@ -146,7 +146,9 @@ static void run(Index rows, Index cols, Index depth,
// Release all the sub blocks A'_i of A' for the current thread,
// i.e., we simply decrement the number of users by 1
for(Index i=0; i<threads; ++i)
+#if !EIGEN_HAS_CXX11_ATOMIC
#pragma omp atomic
+#endif
info[i].users -= 1;
}
}
@@ -427,8 +429,14 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
- if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
- lazyproduct::evalTo(dst, lhs, rhs);
+ // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
+ // to determine the following heuristic.
+ // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
+ // unless it has been specialized by the user or for a given architecture.
+ // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
+ // I'm not sure it is still required.
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
else
{
dst.setZero();
@@ -439,8 +447,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
- if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
- lazyproduct::addTo(dst, lhs, rhs);
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
}
@@ -448,8 +456,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
- if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
- lazyproduct::subTo(dst, lhs, rhs);
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
}
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 7122efa60..ec2825bf0 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -84,7 +84,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
- gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
@@ -110,7 +110,6 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0);
-
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper)
@@ -160,7 +159,7 @@ struct tribb_kernel
if(UpLo==Upper)
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
-
+
// selfadjoint micro block
{
Index i = j;
@@ -168,6 +167,7 @@ struct tribb_kernel
// 1 - apply the kernel on the temporary buffer
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
+
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
@@ -269,10 +269,13 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
enum {
IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
- RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0
+ RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0,
+ SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0
};
Index size = mat.cols();
+ if(SkipDiag)
+ size--;
Index depth = actualLhs.cols();
typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
@@ -283,21 +286,23 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
internal::general_matrix_matrix_triangular_product<Index,
typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
- IsRowMajor ? RowMajor : ColMajor, UpLo>
+ IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)>
::run(size, depth,
- &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
- mat.data(), mat.outerStride(), actualAlpha, blocking);
+ &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
+ &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
+ mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking);
}
};
template<typename MatrixType, unsigned int UpLo>
template<typename ProductType>
-TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
+EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
{
+ EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
-
+
general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
-
+
return derived();
}
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 5b7c15cca..49565c070 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -37,7 +37,7 @@ namespace Eigen {
namespace internal {
-template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
struct general_matrix_matrix_rankupdate :
general_matrix_matrix_triangular_product<
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
@@ -52,7 +52,7 @@ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,Con
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
{ \
- if (lhs==rhs) { \
+ if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
} else { \
@@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
EIGTYPE beta(1); \
- BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
+ BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
} \
};
@@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
} \
};
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
+#else
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
+#endif
// TODO handle complex cases
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
index 7a3bdbf20..b0f6b0d5b 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -46,7 +46,7 @@ namespace internal {
// gemm specialization
-#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
template< \
typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
@@ -100,13 +100,20 @@ static void run(Index rows, Index cols, Index depth, \
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
- BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
}};
-GEMM_SPECIALIZATION(double, d, double, d)
-GEMM_SPECIALIZATION(float, f, float, s)
-GEMM_SPECIALIZATION(dcomplex, cd, double, z)
-GEMM_SPECIALIZATION(scomplex, cf, float, c)
+#ifdef EIGEN_USE_MKL
+GEMM_SPECIALIZATION(double, d, double, dgemm)
+GEMM_SPECIALIZATION(float, f, float, sgemm)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
+#else
+GEMM_SPECIALIZATION(double, d, double, dgemm_)
+GEMM_SPECIALIZATION(float, f, float, sgemm_)
+GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
+GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
+#endif
} // end namespace internal
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 41d8242e1..767feb99d 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -48,7 +48,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-EIGEN_DONT_INLINE static void run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
@@ -57,7 +57,7 @@ EIGEN_DONT_INLINE static void run(
};
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsMapper& alhs,
const RhsMapper& rhs,
@@ -201,7 +201,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
}
/* Optimized row-major matrix * vector product:
- * This algorithm processes 4 rows at onces that allows to both reduce
+ * This algorithm processes 4 rows at once, which allows us both to reduce
* the number of load/stores of the result by a factor 4 and to reduce
* the instruction dependency. Moreover, we know that all bands have the
* same alignment pattern.
@@ -231,7 +231,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-EIGEN_DONT_INLINE static void run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
@@ -240,7 +240,7 @@ EIGEN_DONT_INLINE static void run(
};
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsMapper& alhs,
const RhsMapper& rhs,
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
index e3a5d5892..6e36c2b3c 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
-#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \
@@ -113,14 +113,21 @@ static void run( \
x_ptr=x_tmp.data(); \
incx=1; \
} else x_ptr=rhs; \
- BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+ BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
-EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
-EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
-EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
-EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
+#else
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
+#endif
} // end namespace internal
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index c2f084c82..92e9b0d9f 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -10,6 +10,10 @@
#ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H
+#if EIGEN_HAS_CXX11_ATOMIC
+#include <atomic>
+#endif
+
namespace Eigen {
namespace internal {
@@ -75,8 +79,17 @@ template<typename Index> struct GemmParallelInfo
{
GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+ // volatile is not enough on all architectures (see bug 1572) to guarantee that
+ // when thread A tells thread B that it is done packing a block, all of A's writes
+ // have actually been carried out... the C++11 memory model together with std::atomic
+ // guarantees this.
+#if EIGEN_HAS_CXX11_ATOMIC
+ std::atomic<Index> sync;
+ std::atomic<int> users;
+#else
Index volatile sync;
int volatile users;
+#endif
Index lhs_start;
Index lhs_length;
@@ -87,11 +100,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
{
// TODO when EIGEN_USE_BLAS is defined,
// we should still enable OMP for other scalar types
-#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
+ // Without C++11, we have to disable GEMM's parallelization on
+ // non-x86 architectures because volatile is not enough for our purpose there.
+ // See bug 1572.
+#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
// FIXME the transpose variable is only needed to properly split
// the matrix product when multithreading is enabled. This is a temporary
// fix to support row-major destination matrices. This whole
- // parallelizer mechanism has to be redisigned anyway.
+ // parallelizer mechanism has to be redesigned anyway.
EIGEN_UNUSED_VARIABLE(depth);
EIGEN_UNUSED_VARIABLE(transpose);
func(0,rows, 0,cols);
@@ -117,7 +133,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
// compute the number of threads we are going to use
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
- // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session,
+ // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
// then abort multi-threading
// FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
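
For reference, the `nbThreads()` used above is driven by Eigen's public thread-count knob; a hedged usage sketch (compile with OpenMP, e.g. `-fopenmp`, for the parallel path to be taken):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::setNbThreads(4);            // cap Eigen's GEMM parallelism at 4 threads
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(512, 512);
      Eigen::MatrixXd C = A * B;         // parallelized when OpenMP is enabled and sizes are large enough
      std::cout << "threads: " << Eigen::nbThreads() << ", C(0,0)=" << C(0,0) << std::endl;
      return 0;
    }
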
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index da6f82abc..c84c71609 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -352,7 +352,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
- gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+ gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
@@ -387,7 +387,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
- gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
@@ -437,7 +437,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
index a45238d69..9a5318507 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -40,7 +40,7 @@ namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
-#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -81,13 +81,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
- BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -144,20 +144,26 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
- BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-EIGEN_BLAS_SYMM_L(double, double, d, d)
-EIGEN_BLAS_SYMM_L(float, float, f, s)
-EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
-EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
+#endif
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
-#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -197,13 +203,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _lhs; \
\
- BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -259,15 +265,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
- BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+ BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
} \
};
-EIGEN_BLAS_SYMM_R(double, double, d, d)
-EIGEN_BLAS_SYMM_R(float, float, f, s)
-EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
-EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
+#endif
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 3fd180e6c..d38fd72b2 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -15,7 +15,7 @@ namespace Eigen {
namespace internal {
/* Optimized selfadjoint matrix * vector product:
- * This algorithm processes 2 columns at onces that allows to both reduce
+ * This algorithm processes 2 columns at once that allows to both reduce
* the number of load/stores of the result by a factor 2 and to reduce
* the instruction dependency.
*/
@@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
struct selfadjoint_matrix_vector_product
{
-static EIGEN_DONT_INLINE void run(
+static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void run(
Index size,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs,
@@ -36,7 +37,8 @@ static EIGEN_DONT_INLINE void run(
};
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
Index size,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs,
@@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
-
- Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+ Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
if (FirstTriangular)
bound = size - bound;
@@ -175,7 +176,8 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
enum { LhsUpLo = LhsMode&(Upper|Lower) };
template<typename Dest>
- static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC
+ void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
{
typedef typename Dest::Scalar ResScalar;
typedef typename Rhs::Scalar RhsScalar;
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
index 38f23accf..1238345e3 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
@@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
x_tmp=map_x.conjugate(); \
x_ptr=x_tmp.data(); \
} else x_ptr=_rhs; \
- BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+ BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
+#else
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
+#endif
} // end namespace internal
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index f038d686f..39c5b59ff 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -120,7 +120,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
template<typename MatrixType, unsigned int UpLo>
template<typename DerivedU>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
{
selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index 2ae364111..09209f733 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
template<typename Scalar, typename Index, typename UType, typename VType>
struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
{
- static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC
+ void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
{
const Index size = u.size();
for (Index i=0; i<size; ++i)
@@ -57,7 +58,7 @@ template<bool Cond, typename T> struct conj_expr_if
template<typename MatrixType, unsigned int UpLo>
template<typename DerivedU, typename DerivedV>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
{
typedef internal::blas_traits<DerivedU> UBlasTraits;
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 6ec5a8a0b..85fd744b9 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
- Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
+ // To work around an "error: member reference base type 'Matrix<...>
+ // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
+ // not a structure or union" compilation error in nvcc (tested V8.0.61),
+ // create a dummy internal::constructor_without_unaligned_array_assert
+ // object to pass to the Matrix constructor.
+ internal::constructor_without_unaligned_array_assert a;
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
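
The workaround above replaces a temporary `constructor_without_unaligned_array_assert` argument with a named object. A hedged, simplified illustration of the pattern (the types are placeholders, not Eigen's):

    struct Tag {};                          // stands in for constructor_without_unaligned_array_assert
    struct Buffer { explicit Buffer(Tag) {} };

    void before() { Buffer b((Tag())); }    // passing a temporary: fine for most compilers,
                                            // but rejected by nvcc 8.0.61 in the Eigen case above
    void after()  { Tag a; Buffer b(a); }   // the workaround: construct a named dummy first
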
@@ -145,7 +151,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0;
@@ -216,7 +222,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,end)-i2;
- gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
@@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
- Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
+ internal::constructor_without_unaligned_array_assert a;
+ Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
@@ -292,7 +299,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
- gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
@@ -393,7 +400,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
{
template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
{
- typedef typename Dest::Scalar Scalar;
+ typedef typename Lhs::Scalar LhsScalar;
+ typedef typename Rhs::Scalar RhsScalar;
+ typedef typename Dest::Scalar Scalar;
typedef internal::blas_traits<Lhs> LhsBlasTraits;
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
@@ -405,8 +414,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
- Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
- * RhsBlasTraits::extractScalarFactor(a_rhs);
+ LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs);
+ RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs);
+ Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
@@ -431,6 +441,21 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
&dst.coeffRef(0,0), dst.outerStride(), // result info
actualAlpha, blocking
);
+
+ // Apply correction if the diagonal is unit and a scalar factor was nested:
+ if ((Mode&UnitDiag)==UnitDiag)
+ {
+ if (LhsIsTriangular && lhs_alpha!=LhsScalar(1))
+ {
+ Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+ dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize);
+ }
+ else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1))
+ {
+ Index diagSize = (std::min)(rhs.rows(),rhs.cols());
+ dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize);
+ }
+ }
}
};
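
The new correction handles expressions like `(s*A).triangularView<UnitLower>() * b`: the scalar `s` is extracted by `blas_traits`, so the kernel effectively computes `s*(unitLower(A)*b)`, scaling the implicit unit diagonal along with everything else; subtracting `(s-1)*b` over the diagonal block restores the intended result. A hedged user-level check (not the internal kernel):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix3d A = Eigen::Matrix3d::Random();
      Eigen::Vector3d b = Eigen::Vector3d::Random();
      double s = 2.0;

      Eigen::Matrix3d T = (s * A).eval();
      T.diagonal().setOnes();                                               // the unit diagonal is NOT scaled by s
      Eigen::Vector3d ref = T.triangularView<Eigen::Lower>() * b;           // reference result
      Eigen::Vector3d opt = (s * A).triangularView<Eigen::UnitLower>() * b; // optimized path

      std::cout << (ref - opt).norm() << std::endl;                         // ~0 once the correction is applied
      return 0;
    }
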
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
index aecded6bb..a25197ab0 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
// implements col-major += alpha * op(triangular) * op(general)
-#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
- BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
};
-EIGEN_BLAS_TRMM_L(double, double, d, d)
-EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMM_L(float, float, f, s)
-EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm)
+EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
+#endif
// implements col-major += alpha * op(general) * op(triangular)
-#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
- BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
};
-EIGEN_BLAS_TRMM_R(double, double, d, d)
-EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMM_R(float, float, f, s)
-EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm)
+EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
+#endif
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index 4b292e74d..76bfa159c 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -221,8 +221,9 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
- ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
- * RhsBlasTraits::extractScalarFactor(rhs);
+ LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+ RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+ ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
enum {
// FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -274,6 +275,12 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
else
dest = MappedDest(actualDestPtr, dest.size());
}
+
+ if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+ {
+ Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+ dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+ }
}
};
@@ -295,8 +302,9 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
- ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
- * RhsBlasTraits::extractScalarFactor(rhs);
+ LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+ RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+ ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
enum {
DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
@@ -326,6 +334,12 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
actualRhsPtr,1,
dest.data(),dest.innerStride(),
actualAlpha);
+
+ if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) )
+ {
+ Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+ dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize);
+ }
}
};
diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
index 07bf26ce5..3d47a2b94 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
// implements col-major: res += alpha * op(triangular) * vector
-#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
enum { \
@@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
- BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
- BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
- BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+ BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
-EIGEN_BLAS_TRMV_CM(double, double, d, d)
-EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMV_CM(float, float, f, s)
-EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_CM(double, double, d, d,)
+EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_CM(float, float, f, s,)
+EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
+EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
+EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
+#endif
// implements row-major: res += alpha * op(triangular) * vector
-#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
enum { \
@@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
- BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+ BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
- BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+ BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
- BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+ BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
-EIGEN_BLAS_TRMV_RM(double, double, d, d)
-EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMV_RM(float, float, f, s)
-EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double, double, d, d,)
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,)
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
+#endif
} // end namespace internal
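
The extra BLASPOSTFIX parameter keeps the ?trmv/?axpy/?gemv calls working for both naming conventions via token pasting. A hedged, minimal sketch of the mechanism (the macro name is illustrative only):

    #define MAKE_BLAS_NAME(PREFIX, POSTFIX) PREFIX##gemv##POSTFIX
    // MAKE_BLAS_NAME(d, _)  expands to dgemv_   (reference BLAS / Fortran naming)
    // MAKE_BLAS_NAME(d, )   expands to dgemv    (MKL naming, empty postfix)
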
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 223c38b86..8ff2e9d9d 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
conj_if<Conjugate> conj;
gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
- gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+ gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
// the goal here is to subdivise the Rhs panels such that we keep some cache
@@ -229,7 +229,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
- gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+ gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size;
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
index 88c0fb794..f0775116a 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -38,7 +38,7 @@ namespace Eigen {
namespace internal {
// implements LeftSide op(triangular)^-1 * general
-#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
{ \
@@ -80,18 +80,24 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
- BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
} \
};
-EIGEN_BLAS_TRSM_L(double, double, d)
-EIGEN_BLAS_TRSM_L(dcomplex, double, z)
-EIGEN_BLAS_TRSM_L(float, float, s)
-EIGEN_BLAS_TRSM_L(scomplex, float, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_L(double, double, dtrsm)
+EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_L(float, float, strsm)
+EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_L(float, float, strsm_)
+EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
+#endif
// implements RightSide general * op(triangular)^-1
-#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
{ \
@@ -133,16 +139,22 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
- BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+ BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
/*std::cout << "TRMS_L specialization!\n";*/ \
} \
};
-EIGEN_BLAS_TRSM_R(double, double, d)
-EIGEN_BLAS_TRSM_R(dcomplex, double, z)
-EIGEN_BLAS_TRSM_R(float, float, s)
-EIGEN_BLAS_TRSM_R(scomplex, float, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_R(double, double, dtrsm)
+EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_R(float, float, strsm)
+EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_R(float, float, strsm_)
+EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
+#endif
} // end namespace internal
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index b994759b2..647317016 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
{
// let's directly call the low level product function because:
// 1 - it is faster to compile
- // 2 - it is slighlty faster at runtime
+ // 2 - it is slightly faster at runtime
Index startRow = IsLower ? pi : pi-actualPanelWidth;
Index startCol = IsLower ? 0 : pi;
@@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
if (k>0)
rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
- if(!(Mode & UnitDiag))
+ if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
rhs[i] /= cjLhs(i,i);
}
}
@@ -114,20 +114,23 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
for(Index k=0; k<actualPanelWidth; ++k)
{
Index i = IsLower ? pi+k : pi-k-1;
- if(!(Mode & UnitDiag))
- rhs[i] /= cjLhs.coeff(i,i);
-
- Index r = actualPanelWidth - k - 1; // remaining size
- Index s = IsLower ? i+1 : i-r;
- if (r>0)
- Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+ if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+ {
+ if(!(Mode & UnitDiag))
+ rhs[i] /= cjLhs.coeff(i,i);
+
+ Index r = actualPanelWidth - k - 1; // remaining size
+ Index s = IsLower ? i+1 : i-r;
+ if (r>0)
+ Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+ }
}
Index r = IsLower ? size - endBlock : startBlock; // remaining size
if (r > 0)
{
// let's directly call the low level product function because:
// 1 - it is faster to compile
- // 2 - it is slighlty faster at runtime
+ // 2 - it is slightly faster at runtime
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
r, actualPanelWidth,
LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
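
The guard added above skips both the division and the rank-1 update when `rhs[i]` is exactly zero, which saves work on mostly-zero right-hand sides and avoids generating NaN/Inf from `0/0` or `0*inf`. A hedged stand-alone sketch of the same idea for a column-major lower-triangular solve (not Eigen's kernel):

    #include <cstddef>

    // Solve L*x = rhs in place; L is lower triangular, column-major, optionally unit-diagonal.
    void trsv_lower_colmajor(const double* L, std::size_t n, double* rhs, bool unitDiag) {
      for (std::size_t i = 0; i < n; ++i) {
        if (rhs[i] == 0.0) continue;             // nothing to propagate; also skips a useless division
        if (!unitDiag) rhs[i] /= L[i + i * n];
        for (std::size_t r = i + 1; r < n; ++r)
          rhs[r] -= rhs[i] * L[r + i * n];       // update the remaining entries with column i
      }
    }
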
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index b1791fb3a..a32630ed7 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -24,7 +24,7 @@ struct gebp_kernel;
template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
struct gemm_pack_rhs;
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
struct gemm_pack_lhs;
template<
@@ -156,11 +156,9 @@ class BlasVectorMapper {
};
template<typename Scalar, typename Index, int AlignmentType>
-class BlasLinearMapper {
- public:
- typedef typename packet_traits<Scalar>::type Packet;
- typedef typename packet_traits<Scalar>::half HalfPacket;
-
+class BlasLinearMapper
+{
+public:
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
@@ -171,29 +169,25 @@ class BlasLinearMapper {
return m_data[i];
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
- return ploadt<Packet, AlignmentType>(m_data + i);
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+ return ploadt<PacketType, AlignmentType>(m_data + i);
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
- return ploadt<HalfPacket, AlignmentType>(m_data + i);
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
+ pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
- pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
- }
-
- protected:
+protected:
Scalar *m_data;
};
// Lightweight helper class to access matrix coefficients.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
-class blas_data_mapper {
- public:
- typedef typename packet_traits<Scalar>::type Packet;
- typedef typename packet_traits<Scalar>::half HalfPacket;
-
+class blas_data_mapper
+{
+public:
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
typedef BlasVectorMapper<Scalar, Index> VectorMapper;
@@ -218,8 +212,9 @@ class blas_data_mapper {
return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
- return ploadt<Packet, AlignmentType>(&operator()(i, j));
+ template<typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+ return ploadt<PacketType, AlignmentType>(&operator()(i, j));
}
template <typename PacketT, int AlignmentT>
@@ -227,10 +222,6 @@ class blas_data_mapper {
return ploadt<PacketT, AlignmentT>(&operator()(i, j));
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
- return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
- }
-
template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
@@ -251,7 +242,7 @@ class blas_data_mapper {
return internal::first_default_aligned(m_data, size);
}
- protected:
+protected:
Scalar* EIGEN_RESTRICT m_data;
const Index m_stride;
};
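
Making `loadPacket` a member template (instead of fixing `Packet`/`HalfPacket` at class level) lets one mapper serve any packet width, which the new `Packet` parameter of gemm_pack_lhs relies on. A hedged stand-alone sketch of the pattern, using `std::array` as a stand-in for SIMD packets:

    #include <array>
    #include <cstddef>
    #include <cstring>

    template <typename Scalar>
    struct LinearMapperSketch {
      explicit LinearMapperSketch(const Scalar* data) : m_data(data) {}

      // The caller picks the packet width, e.g. loadPacket< std::array<float,8> >(i),
      // mirroring loadPacket<PacketType>(i) above, where ploadt issues the real SIMD load.
      template <typename PacketType>
      PacketType loadPacket(std::ptrdiff_t i) const {
        PacketType p;
        std::memcpy(&p, m_data + i, sizeof(PacketType));
        return p;
      }

      const Scalar* m_data;
    };
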
@@ -289,8 +280,8 @@ template<typename XprType> struct blas_traits
ExtractType,
typename _ExtractType::PlainObject
>::type DirectLinearAccessType;
- static inline ExtractType extract(const XprType& x) { return x; }
- static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+ static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+ static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
};
// pop conjugate
@@ -318,8 +309,8 @@ struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp
typedef blas_traits<NestedXpr> Base;
typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
typedef typename Base::ExtractType ExtractType;
- static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
- static inline Scalar extractScalarFactor(const XprType& x)
+ static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+ static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
{ return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
};
template<typename Scalar, typename NestedXpr, typename Plain>
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
new file mode 100644
index 000000000..e75c7d89e
--- /dev/null
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
+#define EIGEN_CONFIGURE_VECTORIZATION_H
+
+// FIXME: not sure why this is needed, perhaps it is not needed anymore.
+#ifdef __NVCC__
+ #ifndef EIGEN_DONT_VECTORIZE
+ #define EIGEN_DONT_VECTORIZE
+ #endif
+#endif
+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+//
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be aligned, respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+//
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+ * so that vectorization doesn't affect binary compatibility.
+ *
+ * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
+ * vectorized and non-vectorized code.
+ */
+#if (defined EIGEN_CUDACC)
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+ #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+ #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_MSVC
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
+ #define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_COMP_SUNCC
+ // FIXME not sure about this one:
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+ #define EIGEN_ALIGNOF(x) __alignof(x)
+#else
+ #error Please tell me what is the equivalent of __attribute__((aligned(n))) and __alignof(x) for your compiler
+#endif
+
+// If the user explicitly disable vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+#elif defined(__AVX512F__)
+ // 64 bytes static alignment is preferred only if really required
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
+#elif defined(__AVX__)
+ // 32 bytes static alignment is preferred only if really required
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+ #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+ #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+ #endif
+ #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
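
A hedged usage sketch of how a user is expected to drive this block: define the macro before the first Eigen include; the deprecated `EIGEN_DONT_ALIGN_STATICALLY`/`EIGEN_DONT_ALIGN` are mapped to the same effect just above.

    // In user code, before any Eigen header:
    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0   // disable static (stack) alignment of fixed-size objects
    #include <Eigen/Dense>
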
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+ // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+
+ // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+ // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+ // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+ // certain common platform (compiler+architecture combinations) to avoid these problems.
+ // Only static alignment is really problematic (relies on nonstandard compiler extensions),
+ // try to keep heap alignment even when we have to disable static alignment.
+ #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
+ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+ #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+ // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+ // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+ // 4.8 and newer seem definitely unaffected.
+ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+ #else
+ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+ #endif
+
+ // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+ #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+ && !EIGEN_GCC3_OR_OLDER \
+ && !EIGEN_COMP_SUNCC \
+ && !EIGEN_OS_QNX
+ #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+ #else
+ #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+ #endif
+
+ #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+ #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+ #else
+ #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+ #endif
+
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+ #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
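
A hedged example of the shortcuts just defined; on a typical AVX build `EIGEN_ALIGN_MAX` expands to `EIGEN_ALIGN_TO_BOUNDARY(32)`, and to nothing when `EIGEN_MAX_STATIC_ALIGN_BYTES` is 0.

    EIGEN_ALIGN16   float  buf16[4];    // 16-byte aligned: enough for one SSE packet
    EIGEN_ALIGN_MAX double bufMax[8];   // aligned to EIGEN_MAX_STATIC_ALIGN_BYTES (no attribute if it is 0)
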
+
+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+ #ifdef EIGEN_MAX_ALIGN_BYTES
+ #undef EIGEN_MAX_ALIGN_BYTES
+ #endif
+ #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+ #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+//----------------------------------------------------------------------
+
+
+
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
+ #ifndef EIGEN_DONT_VECTORIZE
+ #define EIGEN_DONT_VECTORIZE
+ #endif
+#endif
+
+
+// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
+// removed as gcc 4.1 and msvc 2008 are not supported anyways.
+#if EIGEN_COMP_MSVC
+ #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+ #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
+ // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+ #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+ #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+ #endif
+ #endif
+#else
+ #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
+ #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+ #endif
+#endif
+
+
+#ifndef EIGEN_DONT_VECTORIZE
+
+ #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+ // Defines symbols for compile-time detection of which instructions are
+ // used.
+ // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_SSE
+ #define EIGEN_VECTORIZE_SSE2
+
+ // Detect sse3/ssse3/sse4:
+ // gcc and icc defines __SSE3__, ...
+ // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+ // want to force the use of those instructions with msvc.
+ #ifdef __SSE3__
+ #define EIGEN_VECTORIZE_SSE3
+ #endif
+ #ifdef __SSSE3__
+ #define EIGEN_VECTORIZE_SSSE3
+ #endif
+ #ifdef __SSE4_1__
+ #define EIGEN_VECTORIZE_SSE4_1
+ #endif
+ #ifdef __SSE4_2__
+ #define EIGEN_VECTORIZE_SSE4_2
+ #endif
+ #ifdef __AVX__
+ #define EIGEN_VECTORIZE_AVX
+ #define EIGEN_VECTORIZE_SSE3
+ #define EIGEN_VECTORIZE_SSSE3
+ #define EIGEN_VECTORIZE_SSE4_1
+ #define EIGEN_VECTORIZE_SSE4_2
+ #endif
+ #ifdef __AVX2__
+ #define EIGEN_VECTORIZE_AVX2
+ #define EIGEN_VECTORIZE_AVX
+ #define EIGEN_VECTORIZE_SSE3
+ #define EIGEN_VECTORIZE_SSSE3
+ #define EIGEN_VECTORIZE_SSE4_1
+ #define EIGEN_VECTORIZE_SSE4_2
+ #endif
+ #ifdef __FMA__
+ #define EIGEN_VECTORIZE_FMA
+ #endif
+ #if defined(__AVX512F__)
+ #define EIGEN_VECTORIZE_AVX512
+ #define EIGEN_VECTORIZE_AVX2
+ #define EIGEN_VECTORIZE_AVX
+ #define EIGEN_VECTORIZE_FMA
+ #define EIGEN_VECTORIZE_SSE3
+ #define EIGEN_VECTORIZE_SSSE3
+ #define EIGEN_VECTORIZE_SSE4_1
+ #define EIGEN_VECTORIZE_SSE4_2
+ #ifdef __AVX512DQ__
+ #define EIGEN_VECTORIZE_AVX512DQ
+ #endif
+ #ifdef __AVX512ER__
+ #define EIGEN_VECTORIZE_AVX512ER
+ #endif
+ #endif
+
+ // include files
+
+ // This extern "C" works around a MINGW-w64 compilation issue
+ // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+ // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+ // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+ // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+ // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+ // notice that since these are C headers, the extern "C" is theoretically needed anyways.
+ extern "C" {
+ // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+    // Doing so triggers some issues with ICC. However old gcc versions seem to not have this file, thus:
+ #if EIGEN_COMP_ICC >= 1110
+ #include <immintrin.h>
+ #else
+ #include <mmintrin.h>
+ #include <emmintrin.h>
+ #include <xmmintrin.h>
+ #ifdef EIGEN_VECTORIZE_SSE3
+ #include <pmmintrin.h>
+ #endif
+ #ifdef EIGEN_VECTORIZE_SSSE3
+ #include <tmmintrin.h>
+ #endif
+ #ifdef EIGEN_VECTORIZE_SSE4_1
+ #include <smmintrin.h>
+ #endif
+ #ifdef EIGEN_VECTORIZE_SSE4_2
+ #include <nmmintrin.h>
+ #endif
+ #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+ #include <immintrin.h>
+ #endif
+ #endif
+ } // end extern "C"
+
+ #elif defined __VSX__
+
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_VSX
+ #include <altivec.h>
+ // We need to #undef all these ugly tokens defined in <altivec.h>
+ // => use __vector instead of vector
+ #undef bool
+ #undef vector
+ #undef pixel
+
+ #elif defined __ALTIVEC__
+
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_ALTIVEC
+ #include <altivec.h>
+ // We need to #undef all these ugly tokens defined in <altivec.h>
+ // => use __vector instead of vector
+ #undef bool
+ #undef vector
+ #undef pixel
+
+ #elif (defined __ARM_NEON) || (defined __ARM_NEON__)
+
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_NEON
+ #include <arm_neon.h>
+
+ #elif (defined __s390x__ && defined __VEC__)
+
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_ZVECTOR
+ #include <vecintrin.h>
+
+ #elif defined __mips_msa
+
+ // Limit MSA optimizations to little-endian CPUs for now.
+ // TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
+ #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #if defined(__LP64__)
+ #define EIGEN_MIPS_64
+ #else
+ #define EIGEN_MIPS_32
+ #endif
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_MSA
+ #include <msa.h>
+ #endif
+
+ #endif
+#endif
+
+#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
+ // We can use the optimized fp16 to float and float to fp16 conversion routines
+ #define EIGEN_HAS_FP16_C
+#endif
+
+#if defined EIGEN_CUDACC
+ #define EIGEN_VECTORIZE_GPU
+ #include <vector_types.h>
+ #if EIGEN_CUDACC_VER >= 70500
+ #define EIGEN_HAS_CUDA_FP16
+ #endif
+#endif
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+ #include <host_defines.h>
+ #include <cuda_fp16.h>
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ #define EIGEN_VECTORIZE_GPU
+ #include <hip/hip_vector_types.h>
+
+ #define EIGEN_HAS_HIP_FP16
+ #include <hip/hip_fp16.h>
+
+ #define HIP_PATCH_WITH_NEW_FP16 18215
+ #if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16)
+ #define EIGEN_HAS_OLD_HIP_FP16
+    // Old HIP implementation does not have an explicit typedef for "half2"
+ typedef __half2 half2;
+ #endif
+
+#endif
+
+
+/** \brief Namespace containing all symbols from the %Eigen library. */
+namespace Eigen {
+
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_AVX512)
+ return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+ return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
+ return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+ return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+ return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+ return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+ return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+ return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+ return "VSX";
+#elif defined(EIGEN_VECTORIZE_NEON)
+ return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+ return "S390X ZVECTOR";
+#elif defined(EIGEN_VECTORIZE_MSA)
+ return "MIPS MSA";
+#else
+ return "None";
+#endif
+}
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_CONFIGURE_VECTORIZATION_H
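For reference, the string returned by SimdInstructionSetsInUse() can be queried directly from user code; a minimal sketch, assuming only that Eigen/Core is on the include path:

    #include <iostream>
    #include <Eigen/Core>

    int main() {
      // Prints e.g. "SSE, SSE2" or "AVX512, FMA, AVX2, AVX, SSE, ..." depending on the build flags.
      std::cout << "SIMD instruction sets in use: " << Eigen::SimdInstructionSetsInUse() << "\n";
      return 0;
    }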
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 612dbf5e8..a5f63a9b5 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -470,6 +470,7 @@ namespace Architecture
AltiVec = 0x2,
VSX = 0x3,
NEON = 0x4,
+ MSA = 0x5,
#if defined EIGEN_VECTORIZE_SSE
Target = SSE
#elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -478,6 +479,8 @@ namespace Architecture
Target = VSX
#elif defined EIGEN_VECTORIZE_NEON
Target = NEON
+#elif defined EIGEN_VECTORIZE_MSA
+ Target = MSA
#else
Target = Generic
#endif
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 4431f2fc4..6e93bbc0f 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -45,16 +45,25 @@
#pragma clang diagnostic ignored "-Wabsolute-value"
#endif
-#elif defined __GNUC__ && __GNUC__>=6
+#elif defined __GNUC__
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
#pragma GCC diagnostic push
#endif
- #pragma GCC diagnostic ignored "-Wignored-attributes"
+ // g++ warns about local variables shadowing member functions, which is too strict
+ #pragma GCC diagnostic ignored "-Wshadow"
+ #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+ // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+ #pragma GCC diagnostic ignored "-Wtype-limits"
+ #endif
+ #if __GNUC__>=6
+ #pragma GCC diagnostic ignored "-Wignored-attributes"
+ #endif
#endif
#if defined __NVCC__
+ #pragma diag_suppress boolean_controlling_expr_is_constant
// Disable the "statement is unreachable" message
#pragma diag_suppress code_is_unreachable
// Disable the "dynamic initialization in unreachable code" message
@@ -72,6 +81,15 @@
#pragma diag_suppress 2671
#pragma diag_suppress 2735
#pragma diag_suppress 2737
+ #pragma diag_suppress 2739
#endif
+#else
+// warnings already disabled:
+# ifndef EIGEN_WARNINGS_DISABLED_2
+# define EIGEN_WARNINGS_DISABLED_2
+# elif defined(EIGEN_INTERNAL_DEBUGGING)
+# error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
+# endif
+
#endif // not EIGEN_WARNINGS_DISABLED
diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h
index ab01c857f..40e16fdb4 100644
--- a/Eigen/src/Core/util/IndexedViewHelper.h
+++ b/Eigen/src/Core/util/IndexedViewHelper.h
@@ -13,13 +13,6 @@
namespace Eigen {
-/** \namespace Eigen::placeholders
- * \ingroup Core_Module
- *
- * Namespace containing symbolic placeholder and identifiers
- */
-namespace placeholders {
-
namespace internal {
struct symbolic_last_tag {};
}
@@ -35,36 +28,35 @@ struct symbolic_last_tag {};
* A typical usage example would be:
* \code
* using namespace Eigen;
- * using Eigen::placeholders::last;
+ * using Eigen::last;
* VectorXd v(n);
* v(seq(2,last-2)).setOnes();
* \endcode
*
* \sa end
*/
-static const Symbolic::SymbolExpr<internal::symbolic_last_tag> last;
+static const symbolic::SymbolExpr<internal::symbolic_last_tag> last; // PLEASE use Eigen::last instead of Eigen::placeholders::last
-/** \var end
+/** \var lastp1
* \ingroup Core_Module
*
- * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last+1 element/row/columns
- * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
+ * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically
+ * reference the last+1 element/row/column of the underlying vector or matrix once
+ * passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
*
 * This symbolic placeholder supports standard arithmetic operations.
- * It is essentially an alias to last+1
+ * It is essentially an alias to last+fix<1>.
*
* \sa last
*/
#ifdef EIGEN_PARSED_BY_DOXYGEN
-static const auto end = last+1;
+static const auto lastp1 = last+fix<1>;
#else
// Using a FixedExpr<1> expression is important here to make sure the compiler
// can fully optimize the computation starting indices with zero overhead.
-static const Symbolic::AddExpr<Symbolic::SymbolExpr<internal::symbolic_last_tag>,Symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > end(last+fix<1>());
+static const symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > lastp1(last+fix<1>());
#endif
-} // end namespace placeholders
-
namespace internal {
// Replace symbolic last/end "keywords" by their true runtime value
@@ -74,9 +66,9 @@ template<int N>
FixedInt<N> eval_expr_given_size(FixedInt<N> x, Index /*size*/) { return x; }
template<typename Derived>
-Index eval_expr_given_size(const Symbolic::BaseExpr<Derived> &x, Index size)
+Index eval_expr_given_size(const symbolic::BaseExpr<Derived> &x, Index size)
{
- return x.derived().eval(placeholders::last=size-1);
+ return x.derived().eval(last=size-1);
}
// Extract increment/step at compile time
@@ -117,7 +109,7 @@ template<> struct get_compile_time_incr<SingleRange> {
enum { value = 1 }; // 1 or 0 ??
};
-// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operatro[](int) methods)
+// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods)
template<typename T, int XprSize>
struct IndexedViewCompatibleType<T,XprSize,typename internal::enable_if<internal::is_integral<T>::value>::type> {
// Here we could simply use Array, but maybe it's less work for the compiler to use
@@ -127,13 +119,13 @@ struct IndexedViewCompatibleType<T,XprSize,typename internal::enable_if<internal
};
template<typename T, int XprSize>
-struct IndexedViewCompatibleType<T, XprSize, typename enable_if<Symbolic::is_symbolic<T>::value>::type> {
+struct IndexedViewCompatibleType<T, XprSize, typename enable_if<symbolic::is_symbolic<T>::value>::type> {
typedef SingleRange type;
};
template<typename T>
-typename enable_if<Symbolic::is_symbolic<T>::value,SingleRange>::type
+typename enable_if<symbolic::is_symbolic<T>::value,SingleRange>::type
makeIndexedViewCompatible(const T& id, Index size, SpecializedType) {
return eval_expr_given_size(id,size);
}
@@ -172,14 +164,21 @@ template<int Size> struct get_compile_time_incr<AllRange<Size> > {
} // end namespace internal
-namespace placeholders {
-
/** \var all
* \ingroup Core_Module
* Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns
*/
-static const Eigen::internal::all_t all;
+static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all
+
+
+namespace placeholders {
+ typedef symbolic::SymbolExpr<internal::symbolic_last_tag> last_t;
+ typedef symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > end_t;
+ typedef Eigen::internal::all_t all_t;
+ EIGEN_DEPRECATED static const all_t all = Eigen::all; // PLEASE use Eigen::all instead of Eigen::placeholders::all
+ EIGEN_DEPRECATED static const last_t last = Eigen::last; // PLEASE use Eigen::last instead of Eigen::placeholders::last
+ EIGEN_DEPRECATED static const end_t end = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end
}
} // end namespace Eigen
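A minimal usage sketch of the renamed top-level symbols, assuming a C++11 build where the indexed-view API is available:

    #include <Eigen/Dense>
    using namespace Eigen;

    void indexed_view_demo() {
      VectorXd v = VectorXd::LinSpaced(10, 0, 9);
      v(seq(2, last - 2)).setZero();     // zero out elements 2..7
      double tail = v(lastp1 - 1);       // same coefficient as v(last)
      MatrixXd m = MatrixXd::Random(4, 4);
      m(all, 0).setOnes();               // first column, all rows
      (void)tail;
    }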
diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index c7d3b1c06..bf99cd8ab 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -151,9 +151,9 @@ struct get_fixed_value<variable_if_dynamic<T,N>,Default> {
static const int value = N;
};
-template<typename T> Index get_runtime_value(const T &x) { return x; }
+template<typename T> EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; }
#if !EIGEN_HAS_CXX14
-template<int N> Index get_runtime_value(FixedInt<N> (*)()) { return N; }
+template<int N> EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt<N> (*)()) { return N; }
#endif
// Cleanup integer/FixedInt/VariableAndFixedInt/etc types:
diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h
index 26b59669e..17963fad4 100755
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h
@@ -49,12 +49,17 @@
#define EIGEN_USE_LAPACKE
#endif
-#if defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
#define EIGEN_USE_MKL
#endif
+
#if defined EIGEN_USE_MKL
-# include <mkl.h>
+# if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL)
+# define MKL_DIRECT_CALL
+# define MKL_DIRECT_CALL_JUST_SET
+# endif
+# include <mkl.h>
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
# ifndef INTEL_MKL_VERSION
# undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
@@ -68,6 +73,9 @@
# undef EIGEN_USE_MKL_VML
# undef EIGEN_USE_LAPACKE_STRICT
# undef EIGEN_USE_LAPACKE
+# ifdef MKL_DIRECT_CALL_JUST_SET
+# undef MKL_DIRECT_CALL
+# endif
# endif
#endif
@@ -108,6 +116,10 @@
#endif
#endif
+#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
+#include "../../misc/blas.h"
+#endif
+
namespace Eigen {
typedef std::complex<double> dcomplex;
@@ -121,8 +133,5 @@ typedef int BlasIndex;
} // end namespace Eigen
-#if defined(EIGEN_USE_BLAS)
-#include "../../misc/blas.h"
-#endif
#endif // EIGEN_MKL_SUPPORT_H
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 29c796647..3af6c4e37 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -11,6 +11,10 @@
#ifndef EIGEN_MACROS_H
#define EIGEN_MACROS_H
+//------------------------------------------------------------------------------------------
+// Eigen version and basic defaults
+//------------------------------------------------------------------------------------------
+
#define EIGEN_WORLD_VERSION 3
#define EIGEN_MAJOR_VERSION 3
#define EIGEN_MINOR_VERSION 90
@@ -19,7 +23,40 @@
(EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
EIGEN_MINOR_VERSION>=z))))
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#else
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
+// Upper bound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
+/** Allows disabling some optimizations which might affect the accuracy of the result.
+ * Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them.
+ * They currently include:
+ * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
+ */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
+#endif
+
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#endif
+
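These defaults only apply when the corresponding macro is not already defined, so they can be overridden before including any Eigen header. A hedged sketch, with purely illustrative values:

    #define EIGEN_MAX_CPP_VER 14                 // do not rely on features beyond C++14
    #define EIGEN_FAST_MATH 0                    // favour accuracy over speed in vectorized sin()/cos()
    #define EIGEN_STACK_ALLOCATION_LIMIT 16384   // cap temporary stack buffers at 16 KB
    #include <Eigen/Core>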
+//------------------------------------------------------------------------------------------
// Compiler identification, EIGEN_COMP_*
+//------------------------------------------------------------------------------------------
/// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
#ifdef __GNUC__
@@ -73,12 +110,17 @@
// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
// name ver MSC_VER
-// 2008 9 1500
-// 2010 10 1600
-// 2012 11 1700
-// 2013 12 1800
-// 2015 14 1900
-// "15" 15 1900
+// 2008 9 1500
+// 2010 10 1600
+// 2012 11 1700
+// 2013 12 1800
+// 2015 14 1900
+// "15" 15 1900
+// 2017-14.1 15.0 1910
+// 2017-14.11 15.3 1911
+// 2017-14.12 15.5 1912
+// 2017-14.13 15.6 1913
+// 2017-14.14 15.7 1914
/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC or clang-cl
#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
@@ -142,7 +184,11 @@
#endif
+
+//------------------------------------------------------------------------------------------
// Architecture identification, EIGEN_ARCH_*
+//------------------------------------------------------------------------------------------
+
#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define EIGEN_ARCH_x86_64 1
@@ -212,7 +258,9 @@
+//------------------------------------------------------------------------------------------
// Operating system identification, EIGEN_OS_*
+//------------------------------------------------------------------------------------------
/// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
#if defined(__unix__) || defined(__unix)
@@ -314,26 +362,114 @@
#endif
+//------------------------------------------------------------------------------------------
+// Detect GPU compilers and architectures
+//------------------------------------------------------------------------------------------
-#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
- // see bug 89
- #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
-#else
- #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+ #error "NVCC as the target platform for HIPCC is currently not supported."
#endif
-// This macro can be used to prevent from macro expansion, e.g.:
-// std::max EIGEN_NOT_A_MACRO(a,b)
-#define EIGEN_NOT_A_MACRO
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+ // Means the compiler is either nvcc or clang with CUDA enabled
+ #define EIGEN_CUDACC __CUDACC__
+#endif
-#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+ // Means we are generating code for the device
+ #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+ #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+ #define EIGEN_CUDACC_VER __CUDACC_VER__
#else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+ #define EIGEN_CUDACC_VER 0
#endif
-#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+ // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
+ #define EIGEN_HIPCC __HIPCC__
+
+ // We need to include hip_runtime.h here because it pulls in
+ // ++ hip_common.h which contains the define for __HIP_DEVICE_COMPILE__
+ // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+ #include <hip/hip_runtime.h>
+
+ #if defined(__HIP_DEVICE_COMPILE__)
+ // analogous to EIGEN_CUDA_ARCH, but for HIP
+ #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+ #endif
+#endif
+
+// Unify CUDA/HIPCC
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
+//
+#define EIGEN_GPUCC
+//
+// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
+// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
+//
+// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
+// For those cases, the corresponding code should be guarded with
+// #if defined(EIGEN_GPUCC)
+// instead of
+// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+// #if defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+// #if defined(EIGEN_CUDACC)
+//
+#endif
+
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
+//
+#define EIGEN_GPU_COMPILE_PHASE
+//
+// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
+// + one to compile the source for the "host" (i.e. CPU)
+// + another to compile the source for the "device" (i.e. GPU)
+//
+// Code that needs to be enabled only during either the "host" or "device" compilation phase
+// needs to be guarded with a macro that indicates the current compilation phase
+//
+// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
+// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
+//
+// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
+// For those cases, the code should be guarded with
+// #if defined(EIGEN_GPU_COMPILE_PHASE)
+// instead of
+// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+// #if defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+// #if defined(EIGEN_CUDA_ARCH)
+//
+#endif
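A short sketch of how client code is expected to use these unified guards; the helper function below is hypothetical:

    #include <Eigen/Core>

    #if defined(EIGEN_GPUCC)
      // seen in both the host and the device pass whenever nvcc or hipcc compiles this file
    #endif

    float scale_on_host_or_device(float x) {
    #if defined(EIGEN_GPU_COMPILE_PHASE)
      return x * 0.5f;   // emitted only while generating device code
    #else
      return x / 2.0f;   // emitted for the host pass and for plain CPU builds
    #endif
    }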
+
+//------------------------------------------------------------------------------------------
+// Detect Compiler/Architecture/OS specific features
+//------------------------------------------------------------------------------------------
+
+#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
+ // see bug 89
+ #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+#else
+ #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
#endif
// Cross compiler wrapper around LLVM's __has_builtin
@@ -357,13 +493,6 @@
#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
#endif
-// Upperbound on the C++ version to use.
-// Expected values are 03, 11, 14, 17, etc.
-// By default, let's use an arbitrarily large C++ version.
-#ifndef EIGEN_MAX_CPP_VER
-#define EIGEN_MAX_CPP_VER 99
-#endif
-
#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
#define EIGEN_HAS_CXX11 1
#else
@@ -389,6 +518,8 @@
#endif
// Does the compiler support C99?
+// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
+#include <cmath>
#ifndef EIGEN_HAS_C99_MATH
#if EIGEN_MAX_CPP_VER>=11 && \
((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \
@@ -402,18 +533,35 @@
#endif
// Does the compiler support result_of?
+// It's likely that MSVC 2013 supports result_of but I could not find a good source for that,
+// so let's be conservative.
#ifndef EIGEN_HAS_STD_RESULT_OF
-#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)))
+#if EIGEN_MAX_CPP_VER>=11 && \
+ (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
#define EIGEN_HAS_STD_RESULT_OF 1
#else
#define EIGEN_HAS_STD_RESULT_OF 0
#endif
#endif
+// Does the compiler support type_traits?
+// - full support of type traits was added only to GCC 5.1.0.
+// - 20150626 corresponds to the last release of 4.x libstdc++
+#ifndef EIGEN_HAS_TYPE_TRAITS
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \
+ && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \
+ && ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626)
+#define EIGEN_HAS_TYPE_TRAITS 1
+#define EIGEN_INCLUDE_TYPE_TRAITS
+#else
+#define EIGEN_HAS_TYPE_TRAITS 0
+#endif
+#endif
+
// Does the compiler support variadic templates?
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
- && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
+ && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
// this prevents nvcc from crashing when compiling Eigen on Tegra X1
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
@@ -426,23 +574,22 @@
// Does the compiler fully support const expressions? (as in c++14)
#ifndef EIGEN_HAS_CONSTEXPR
+ #if defined(EIGEN_CUDACC)
+ // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+ #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
+ #define EIGEN_HAS_CONSTEXPR 1
+ #endif
+ #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
+ (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) || \
+ (EIGEN_COMP_CLANG >= 306 && (__cplusplus > 199711L)))
+ #define EIGEN_HAS_CONSTEXPR 1
+ #endif
-#if defined(__CUDACC__)
-// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
- #define EIGEN_HAS_CONSTEXPR 1
-#endif
-#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
- (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) || \
- (EIGEN_COMP_CLANG >= 306 && (__cplusplus > 199711L)))
-#define EIGEN_HAS_CONSTEXPR 1
-#endif
-
-#ifndef EIGEN_HAS_CONSTEXPR
-#define EIGEN_HAS_CONSTEXPR 0
-#endif
+ #ifndef EIGEN_HAS_CONSTEXPR
+ #define EIGEN_HAS_CONSTEXPR 0
+ #endif
-#endif
+#endif // EIGEN_HAS_CONSTEXPR
// Does the compiler support C++11 math?
// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
@@ -480,15 +627,42 @@
#endif
#endif
-/** Allows to disable some optimizations which might affect the accuracy of the result.
- * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
- * They currently include:
- * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
- */
-#ifndef EIGEN_FAST_MATH
-#define EIGEN_FAST_MATH 1
+#ifndef EIGEN_HAS_CXX11_ATOMIC
+ #if EIGEN_MAX_CPP_VER>=11 && \
+ (__has_feature(cxx_atomic) \
+ || (__cplusplus > 201103L) \
+ || ((__cplusplus >= 201103L) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700)))
+ #define EIGEN_HAS_CXX11_ATOMIC 1
+ #else
+ #define EIGEN_HAS_CXX11_ATOMIC 0
+ #endif
+#endif
+
+#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR
+ // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+ #if defined(__NVCC__)
+ // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+ #ifdef __CUDACC_RELAXED_CONSTEXPR__
+ #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+ #endif
+ // See bug 1580: clang/CUDA fails to make the following calls
+ // to constexpr bool std::equal_to::operator() even when
+ // EIGEN_CONSTEXPR_ARE_DEVICE_FUNC is defined in c++14 only.
+ // #elif defined(__clang__) && defined(__CUDA__) && EIGEN_HAS_CONSTEXPR == 1
+ // // clang++ always considers constexpr functions as implicitly __host__ __device__
+ // #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+ #endif
#endif
+
+//------------------------------------------------------------------------------------------
+// Preprocessor programming helpers
+//------------------------------------------------------------------------------------------
+
+// This macro can be used to prevent from macro expansion, e.g.:
+// std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
#define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
// concatenate two tokens
@@ -504,11 +678,13 @@
// EIGEN_STRONG_INLINE is a stronger version of the inline keyword, using __forceinline on MSVC,
// but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
// but GCC is still doing fine with just inline.
+#ifndef EIGEN_STRONG_INLINE
#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
#define EIGEN_STRONG_INLINE __forceinline
#else
#define EIGEN_STRONG_INLINE inline
#endif
+#endif
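Since the definition is now guarded by #ifndef, a build can pre-define it; for instance, a hypothetical configuration forcing plain inline everywhere:

    #define EIGEN_STRONG_INLINE inline
    #include <Eigen/Core>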
// EIGEN_ALWAYS_INLINE is the strongest; it has the effect of making the function inline and adding every possible
// attribute to maximize inlining. This should only be used when really necessary: in particular,
@@ -538,12 +714,42 @@
#define EIGEN_PERMISSIVE_EXPR
#endif
+// GPU stuff
+
+// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
+#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC)
+ // Do not try asserts on device code
+ #ifndef EIGEN_NO_DEBUG
+ #define EIGEN_NO_DEBUG
+ #endif
+
+ #ifdef EIGEN_INTERNAL_DEBUGGING
+ #undef EIGEN_INTERNAL_DEBUGGING
+ #endif
+
+ #ifdef EIGEN_EXCEPTIONS
+ #undef EIGEN_EXCEPTIONS
+ #endif
+#endif
+
+// All functions callable from CUDA/HIP code must be qualified with __device__
+#ifdef EIGEN_GPUCC
+ #ifndef EIGEN_DONT_VECTORIZE
+ #define EIGEN_DONT_VECTORIZE
+ #endif
+
+ #define EIGEN_DEVICE_FUNC __host__ __device__
+#else
+ #define EIGEN_DEVICE_FUNC
+#endif
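A sketch of marking a user helper callable from both host and CUDA/HIP device code (clamp01 is a made-up name):

    #include <Eigen/Core>

    // Expands to __host__ __device__ under nvcc/hipcc, and to nothing in a plain CPU build.
    EIGEN_DEVICE_FUNC inline float clamp01(float x) {
      return x < 0.f ? 0.f : (x > 1.f ? 1.f : x);
    }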
+
+
// this macro allows getting rid of linking errors about multiply defined functions.
// - static is not very good because it prevents definitions from different object files from being merged.
// So static causes the resulting linked executable to be bloated with multiple copies of the same function.
// - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline
+#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline
#ifdef NDEBUG
# ifndef EIGEN_NO_DEBUG
@@ -649,169 +855,6 @@ namespace Eigen {
# define EIGEN_CONST_CONDITIONAL(cond) cond
#endif
-//------------------------------------------------------------------------------------------
-// Static and dynamic alignment control
-//
-// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
-// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
-// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
-// a default value is automatically computed based on architecture, compiler, and OS.
-//
-// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
-// to be used to declare statically aligned buffers.
-//------------------------------------------------------------------------------------------
-
-
-/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
- * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
- * so that vectorization doesn't affect binary compatibility.
- *
- * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
- * vectorized and non-vectorized code.
- */
-#if (defined __CUDACC__)
- #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
-#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
- #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#elif EIGEN_COMP_MSVC
- #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
-#elif EIGEN_COMP_SUNCC
- // FIXME not sure about this one:
- #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#else
- #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
-#endif
-
-// If the user explicitly disable vectorization, then we also disable alignment
-#if defined(EIGEN_DONT_VECTORIZE)
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
-#elif defined(EIGEN_VECTORIZE_AVX512)
- // 64 bytes static alignmeent is preferred only if really required
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
-#elif defined(__AVX__)
- // 32 bytes static alignmeent is preferred only if really required
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
-#else
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
-#endif
-
-
-// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
-#define EIGEN_MIN_ALIGN_BYTES 16
-
-// Defined the boundary (in bytes) on which the data needs to be aligned. Note
-// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
-// aligned at all regardless of the value of this #define.
-
-#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
-#endif
-
-// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated
-// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
-#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
- #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
- #undef EIGEN_MAX_STATIC_ALIGN_BYTES
- #endif
- #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
-#endif
-
-#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
-
- // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
-
- // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
- // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
- // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
- // certain common platform (compiler+architecture combinations) to avoid these problems.
- // Only static alignment is really problematic (relies on nonstandard compiler extensions),
- // try to keep heap alignment even when we have to disable static alignment.
- #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
- #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
- #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
- // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
- // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
- // 4.8 and newer seem definitely unaffected.
- #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
- #else
- #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
- #endif
-
- // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
- #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
- && !EIGEN_GCC3_OR_OLDER \
- && !EIGEN_COMP_SUNCC \
- && !EIGEN_OS_QNX
- #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
- #else
- #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
- #endif
-
- #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
- #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
- #else
- #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
- #endif
-
-#endif
-
-// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES
-#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
-#undef EIGEN_MAX_STATIC_ALIGN_BYTES
-#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
- #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-#endif
-
-// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
-// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES)
-// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
-// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
-
-
-// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
-#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
-#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
-#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
-#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
-#else
-#define EIGEN_ALIGN_MAX
-#endif
-
-
-// Dynamic alignment control
-
-#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
-#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
-#endif
-
-#ifdef EIGEN_DONT_ALIGN
- #ifdef EIGEN_MAX_ALIGN_BYTES
- #undef EIGEN_MAX_ALIGN_BYTES
- #endif
- #define EIGEN_MAX_ALIGN_BYTES 0
-#elif !defined(EIGEN_MAX_ALIGN_BYTES)
- #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#else
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-
-#ifndef EIGEN_UNALIGNED_VECTORIZE
-#define EIGEN_UNALIGNED_VECTORIZE 1
-#endif
-
-//----------------------------------------------------------------------
-
-
#ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
#define EIGEN_RESTRICT
#endif
@@ -819,10 +862,6 @@ namespace Eigen {
#define EIGEN_RESTRICT __restrict
#endif
-#ifndef EIGEN_STACK_ALLOCATION_LIMIT
-// 131072 == 128 KB
-#define EIGEN_STACK_ALLOCATION_LIMIT 131072
-#endif
#ifndef EIGEN_DEFAULT_IO_FORMAT
#ifdef EIGEN_MAKING_DOCS
@@ -837,7 +876,20 @@ namespace Eigen {
// just an empty macro !
#define EIGEN_EMPTY
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
+
+// When compiling CUDA/HIP device code with NVCC or HIPCC
+// pull in math functions from the global namespace.
+// In host mode, and when device code is compiled with clang,
+// use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+ #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
+#else
+ #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+#endif
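A sketch of the intended usage pattern in Eigen-style math code (hypotenuse is a hypothetical helper):

    #include <Eigen/Core>

    template<typename Scalar>
    EIGEN_DEVICE_FUNC Scalar hypotenuse(Scalar a, Scalar b) {
      EIGEN_USING_STD_MATH(sqrt)   // ::sqrt in NVCC/HIPCC device code, std::sqrt otherwise
      return sqrt(a * a + b * b);
    }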
+
+
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
+ // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -955,7 +1007,7 @@ namespace Eigen {
const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
// Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
-#if EIGEN_COMP_MSVC_STRICT<=1600
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
#else
#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
@@ -984,15 +1036,23 @@ namespace Eigen {
EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+ #define EIGEN_EXCEPTIONS
+#endif
+
+
#ifdef EIGEN_EXCEPTIONS
# define EIGEN_THROW_X(X) throw X
# define EIGEN_THROW throw
# define EIGEN_TRY try
# define EIGEN_CATCH(X) catch (X)
#else
-# ifdef __CUDA_ARCH__
+# if defined(EIGEN_CUDA_ARCH)
# define EIGEN_THROW_X(X) asm("trap;")
# define EIGEN_THROW asm("trap;")
+# elif defined(EIGEN_HIP_DEVICE_COMPILE)
+# define EIGEN_THROW_X(X) asm("s_trap 0")
+# define EIGEN_THROW asm("s_trap 0")
# else
# define EIGEN_THROW_X(X) std::abort()
# define EIGEN_THROW std::abort()
@@ -1012,7 +1072,38 @@ namespace Eigen {
# define EIGEN_NOEXCEPT
# define EIGEN_NOEXCEPT_IF(x)
# define EIGEN_NO_THROW throw()
-# define EIGEN_EXCEPTION_SPEC(X) throw(X)
+# if EIGEN_COMP_MSVC
+ // MSVC does not support exception specifications (warning C4290),
+ // and they are deprecated in c++11 anyway.
+# define EIGEN_EXCEPTION_SPEC(X) throw()
+# else
+# define EIGEN_EXCEPTION_SPEC(X) throw(X)
+# endif
+#endif
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
+namespace Eigen {
+namespace internal {
+
+inline bool all(){ return true; }
+
+template<typename T, typename ...Ts>
+bool all(T t, Ts ... ts){ return t && all(ts...); }
+
+}
+}
+#endif
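For illustration, this is how the predicate folds a parameter pack; the wrapper below is a made-up example, not Eigen's actual eigen_assert, and it requires a C++11 build:

    #include <cassert>
    #include <Eigen/Core>

    template<typename... Ts>
    void check_sizes_positive(Ts... sizes) {
      // true only if every argument converts to true
      assert(Eigen::internal::all((sizes > 0)...));
    }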
+
+// Wrapping #pragma unroll in a macro since it is required for SYCL
+#if defined(__SYCL_DEVICE_ONLY__)
+ #if defined(_MSC_VER)
+ #define EIGEN_UNROLL_LOOP __pragma(unroll)
+ #else
+ #define EIGEN_UNROLL_LOOP _Pragma("unroll")
+ #endif
+#else
+ #define EIGEN_UNROLL_LOOP
#endif
#endif // EIGEN_MACROS_H
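Usage of EIGEN_UNROLL_LOOP is simply to place it in front of the loop to be unrolled on SYCL devices; elsewhere it expands to nothing. A sketch with a hypothetical helper:

    #include <Eigen/Core>

    inline float sum4(const float* data) {
      float acc = 0.f;
      EIGEN_UNROLL_LOOP
      for (int k = 0; k < 4; ++k)
        acc += data[k];
      return acc;
    }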
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 7d9053496..9dd2e0252 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -70,7 +70,20 @@ inline void throw_std_bad_alloc()
throw std::bad_alloc();
#else
std::size_t huge = static_cast<std::size_t>(-1);
+ #if defined(EIGEN_HIPCC)
+ //
+ // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
+ // and as a consequence the code in the #else block triggers the hipcc warning :
+ // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+ //
+ // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+ // the same on "operator new"
+ // Reverting code back to the old version in this #if block for the hipcc compiler
+ //
new int[huge];
+ #else
+ ::operator new(huge);
+ #endif
#endif
}
@@ -83,11 +96,12 @@ inline void throw_std_bad_alloc()
/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
* Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
*/
-inline void* handmade_aligned_malloc(std::size_t size)
+inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
{
- void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES);
+ eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2");
+ void *original = std::malloc(size+alignment);
if (original == 0) return 0;
- void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
+ void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
*(reinterpret_cast<void**>(aligned) - 1) = original;
return aligned;
}
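To see why the mask-and-add expression yields a suitably aligned address, here is a standalone numeric check with illustrative values only:

    #include <cassert>
    #include <cstdint>

    int main() {
      const std::uintptr_t alignment = 32;
      std::uintptr_t original = 1000;  // stand-in for an address returned by std::malloc
      std::uintptr_t aligned  = (original & ~(alignment - 1)) + alignment;  // 992 + 32 = 1024
      assert(aligned % alignment == 0 && aligned > original);
      // Because malloc's result is itself at least pointer-aligned, the gap below `aligned`
      // is wide enough to stash the original pointer for the matching free().
      return 0;
    }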
@@ -156,9 +170,15 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
void *result;
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ result = ::malloc(size);
+ #else
result = std::malloc(size);
+ #endif
+
#if EIGEN_DEFAULT_ALIGN_BYTES==16
- eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator.");
+ eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator.");
#endif
#else
result = handmade_aligned_malloc(size);
@@ -174,7 +194,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
{
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ ::free(ptr);
+ #else
std::free(ptr);
+ #endif
+
#else
handmade_aligned_free(ptr);
#endif
@@ -218,7 +244,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
{
check_that_malloc_is_allowed();
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ void *result = ::malloc(size);
+ #else
void *result = std::malloc(size);
+ #endif
+
if(!result && size)
throw_std_bad_alloc();
return result;
@@ -232,7 +263,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
{
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ ::free(ptr);
+ #else
std::free(ptr);
+ #endif
}
template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@@ -493,7 +528,11 @@ template<typename T> struct smart_copy_helper<T,true> {
IntPtr size = IntPtr(end)-IntPtr(start);
if(size==0) return;
eigen_internal_assert(start!=0 && end!=0 && target!=0);
- memcpy(target, start, size);
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ ::memcpy(target, start, size);
+ #else
+ std::memcpy(target, start, size);
+ #endif
}
};
@@ -542,7 +581,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
// you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
// to the appropriate stack allocation function
-#ifndef EIGEN_ALLOCA
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
#if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
#define EIGEN_ALLOCA alloca
#elif EIGEN_COMP_MSVC
@@ -550,6 +589,15 @@ template<typename T> struct smart_memmove_helper<T,false> {
#endif
#endif
+// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
+// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because
+// the compiler still emits bad code, as stack allocation checks use "<=".
+// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
+// is fixed.
+#if defined(__clang__) && defined(__thumb__)
+ #undef EIGEN_ALLOCA
+#endif
+
// This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
// at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
template<typename T> class aligned_stack_memory_handler : noncopyable
@@ -561,12 +609,14 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
* In this case, the buffer elements will also be destructed when this handler will be destructed.
* Finally, if \a dealloc is true, then the pointer \a ptr is freed.
**/
+ EIGEN_DEVICE_FUNC
aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
: m_ptr(ptr), m_size(size), m_deallocate(dealloc)
{
if(NumTraits<T>::RequireInitialization && m_ptr)
Eigen::internal::construct_elements_of_array(m_ptr, size);
}
+ EIGEN_DEVICE_FUNC
~aligned_stack_memory_handler()
{
if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -580,6 +630,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
bool m_deallocate;
};
+#ifdef EIGEN_ALLOCA
+
+template<typename Xpr, int NbEvaluations,
+ bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+ >
+struct local_nested_eval_wrapper
+{
+ static const bool NeedExternalBuffer = false;
+ typedef typename Xpr::Scalar Scalar;
+ typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+ ObjectType object;
+
+ EIGEN_DEVICE_FUNC
+ local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+ {
+ EIGEN_UNUSED_VARIABLE(ptr);
+ eigen_internal_assert(ptr==0);
+ }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+ static const bool NeedExternalBuffer = true;
+ typedef typename Xpr::Scalar Scalar;
+ typedef typename plain_object_eval<Xpr>::type PlainObject;
+ typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+ ObjectType object;
+
+ EIGEN_DEVICE_FUNC
+ local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+ : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+ m_deallocate(ptr==0)
+ {
+ if(NumTraits<Scalar>::RequireInitialization && object.data())
+ Eigen::internal::construct_elements_of_array(object.data(), object.size());
+ object = xpr;
+ }
+
+ EIGEN_DEVICE_FUNC
+ ~local_nested_eval_wrapper()
+ {
+ if(NumTraits<Scalar>::RequireInitialization && object.data())
+ Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+ if(m_deallocate)
+ Eigen::internal::aligned_free(object.data());
+ }
+
+private:
+ bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
template<typename T> class scoped_array : noncopyable
{
T* m_ptr;
@@ -607,9 +711,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
} // end namespace internal
/** \internal
- * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
- * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
- * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+ *
+ * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+ * and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+ * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+ * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
* The allocated buffer is automatically deleted when exiting the scope of this declaration.
* If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
* Here is an example:
@@ -620,6 +726,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
* }
* \endcode
* The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
+ *
+ * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogous to
+ * \code
+ * typename internal::nested_eval<XPR_T,N>::type NAME(XPR);
+ * \endcode
+ * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+ * This is accomplished through alloca if the latter is supported and if the required number of bytes
+ * is below EIGEN_STACK_ALLOCATION_LIMIT.
*/
#ifdef EIGEN_ALLOCA
@@ -639,6 +753,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
: Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
+
+ #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+ Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+ ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+ ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+ typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
#else
#define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -646,6 +767,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR)
+
#endif
@@ -688,15 +812,27 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
#endif
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
- EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0)))
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
+ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool( \
+ ((Size)!=Eigen::Dynamic) && \
+ (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES )==0)) || \
+ ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) || \
+ ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0)) )))
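As a client-side sketch of what the relaxed test changes: a 16-byte fixed-size member now enables the aligned operator new even when EIGEN_MAX_ALIGN_BYTES is 32 (AVX) or 64 (AVX512). The struct below is hypothetical:

    #include <Eigen/Dense>

    struct Camera {
      Eigen::Vector4f intrinsics;   // sizeof == 16
      EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(float, 4)
    };

    // Camera* c = new Camera;   // heap storage honours the required alignment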
/****************************************************************************/
/** \class aligned_allocator
* \ingroup Core_Module
*
-* \brief STL compatible allocator to use with with 16 byte aligned types
+* \brief STL compatible allocator to use with types requiring a non-standard alignment.
+*
+* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
+* By default, it will thus provide at least 16-byte alignment, and more in the following cases:
+* - 32-byte alignment if AVX is enabled.
+* - 64-byte alignment if AVX512 is enabled.
+*
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* \link TopicPreprocessorDirectivesPerformance there \endlink.
*
* Example:
* \code
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 90eda6e70..1e4f95581 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -11,9 +11,18 @@
#ifndef EIGEN_META_H
#define EIGEN_META_H
-#if defined(__CUDA_ARCH__)
-#include <cfloat>
-#include <math_constants.h>
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+ #include <cfloat>
+
+ #if defined(EIGEN_CUDA_ARCH)
+ #include <math_constants.h>
+ #endif
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+ #endif
+
#endif
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
@@ -98,6 +107,8 @@ template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
#if EIGEN_HAS_CXX11
+template<> struct is_arithmetic<signed long long> { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
using std::is_integral;
#else
template<typename T> struct is_integral { enum { value = false }; };
@@ -111,8 +122,33 @@ template<> struct is_integral<signed int> { enum { value = true }; }
template<> struct is_integral<unsigned int> { enum { value = true }; };
template<> struct is_integral<signed long> { enum { value = true }; };
template<> struct is_integral<unsigned long> { enum { value = true }; };
+#if EIGEN_COMP_MSVC
+template<> struct is_integral<signed __int64> { enum { value = true }; };
+template<> struct is_integral<unsigned __int64> { enum { value = true }; };
+#endif
#endif
+#if EIGEN_HAS_CXX11
+using std::make_unsigned;
+#else
+// TODO: Possibly improve this implementation of make_unsigned.
+// It is currently used only by
+// template<typename Scalar> struct random_default_impl<Scalar, false, true>.
+template<typename> struct make_unsigned;
+template<> struct make_unsigned<char> { typedef unsigned char type; };
+template<> struct make_unsigned<signed char> { typedef unsigned char type; };
+template<> struct make_unsigned<unsigned char> { typedef unsigned char type; };
+template<> struct make_unsigned<signed short> { typedef unsigned short type; };
+template<> struct make_unsigned<unsigned short> { typedef unsigned short type; };
+template<> struct make_unsigned<signed int> { typedef unsigned int type; };
+template<> struct make_unsigned<unsigned int> { typedef unsigned int type; };
+template<> struct make_unsigned<signed long> { typedef unsigned long type; };
+template<> struct make_unsigned<unsigned long> { typedef unsigned long type; };
+#if EIGEN_COMP_MSVC
+template<> struct make_unsigned<signed __int64> { typedef unsigned __int64 type; };
+template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
+#endif
+#endif
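The fallback mirrors the std interface; an illustrative use:

    #include <Eigen/Core>

    // unsigned long on every supported compiler, with or without C++11
    typedef Eigen::internal::make_unsigned<long>::type unsigned_long_t;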
template <typename T> struct add_const { typedef const T type; };
template <typename T> struct add_const<T&> { typedef T& type; };
@@ -139,16 +175,19 @@ private:
struct yes {int a[1];};
struct no {int a[2];};
- static yes test(const To&, int);
+ template<typename T>
+ static yes test(T, int);
+
+ template<typename T>
static no test(any_conversion, ...);
public:
- static From ms_from;
+ static typename internal::remove_reference<From>::type* ms_from;
#ifdef __INTEL_COMPILER
#pragma warning push
#pragma warning ( disable : 2259 )
#endif
- enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
+ enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
#ifdef __INTEL_COMPILER
#pragma warning pop
#endif
@@ -157,8 +196,7 @@ public:
template<typename From, typename To>
struct is_convertible
{
- enum { value = is_convertible_impl<typename remove_all<From>::type,
- typename remove_all<To >::type>::value };
+ enum { value = is_convertible_impl<From,To>::value };
};
/** \internal Allows to enable/disable an overload
@@ -169,7 +207,7 @@ template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T>
{ typedef T type; };
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON
@@ -191,13 +229,31 @@ template<> struct numeric_limits<float>
EIGEN_DEVICE_FUNC
static float epsilon() { return __FLT_EPSILON__; }
EIGEN_DEVICE_FUNC
- static float (max)() { return CUDART_MAX_NORMAL_F; }
+ static float (max)() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_MAX_NORMAL_F;
+ #else
+ return HIPRT_MAX_NORMAL_F;
+ #endif
+ }
EIGEN_DEVICE_FUNC
static float (min)() { return FLT_MIN; }
EIGEN_DEVICE_FUNC
- static float infinity() { return CUDART_INF_F; }
+ static float infinity() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_INF_F;
+ #else
+ return HIPRT_INF_F;
+ #endif
+ }
EIGEN_DEVICE_FUNC
- static float quiet_NaN() { return CUDART_NAN_F; }
+ static float quiet_NaN() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_NAN_F;
+ #else
+ return HIPRT_NAN_F;
+ #endif
+ }
};
template<> struct numeric_limits<double>
{
@@ -208,9 +264,21 @@ template<> struct numeric_limits<double>
EIGEN_DEVICE_FUNC
static double (min)() { return DBL_MIN; }
EIGEN_DEVICE_FUNC
- static double infinity() { return CUDART_INF; }
+ static double infinity() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_INF;
+ #else
+ return HIPRT_INF;
+ #endif
+ }
EIGEN_DEVICE_FUNC
- static double quiet_NaN() { return CUDART_NAN; }
+ static double quiet_NaN() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_NAN;
+ #else
+ return HIPRT_NAN;
+ #endif
+ }
};
template<> struct numeric_limits<int>
{
@@ -272,7 +340,7 @@ template<> struct numeric_limits<unsigned long long>
#endif
/** \internal
- * A base class do disable default copy ctor and copy assignement operator.
+ * A base class to disable the default copy ctor and copy assignment operator.
*/
class noncopyable
{
@@ -433,10 +501,10 @@ struct meta_no { char a[2]; };
template <typename T>
struct has_ReturnType
{
- template <typename C> static meta_yes testFunctor(typename C::ReturnType const *);
- template <typename C> static meta_no testFunctor(...);
+ template <typename C> static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0);
+ template <typename C> static meta_no testFunctor(...);
- enum { value = sizeof(testFunctor<T>(0)) == sizeof(meta_yes) };
+ enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
};
template<typename T> const T* return_ptr();
@@ -522,14 +590,14 @@ template<typename T, typename U> struct scalar_product_traits
} // end namespace internal
namespace numext {
-
-#if defined(__CUDA_ARCH__)
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
using internal::device::numeric_limits;
#else
using std::numeric_limits;
@@ -538,11 +606,36 @@ using std::numeric_limits;
// Integer division with rounding up.
// T is assumed to be an integer type with a>=0, and b>0
template<typename T>
+EIGEN_DEVICE_FUNC
T div_ceil(const T &a, const T &b)
{
return (a+b-1) / b;
}
+// The aim of the following functions is to bypass -Wfloat-equal warnings
+// when we really want a strict equality comparison on floating points.
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const X& x,const Y& y) { return x == y; }
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
+
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const double& x,const double& y) { return std::equal_to<double>()(x,y); }
+#endif
+
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const X& x,const Y& y) { return x != y; }
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
+
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
+#endif
+
} // end namespace numext
} // end namespace Eigen
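For orientation, the helpers added above are easy to exercise in isolation. The following stand-alone sketch (the *_sketch names are made up for the example and are not Eigen code) shows the std::equal_to trick that keeps an exact floating-point comparison while silencing -Wfloat-equal, together with the rounding-up integer division:

    #include <functional>
    #include <cassert>

    // Exact floating-point comparison routed through std::equal_to,
    // so -Wfloat-equal does not fire at the call site.
    template<typename X, typename Y>
    bool equal_strict_sketch(const X& x, const Y& y) { return x == y; }

    template<>
    bool equal_strict_sketch(const float& x, const float& y) { return std::equal_to<float>()(x, y); }

    // Integer division rounding up, assuming a >= 0 and b > 0.
    template<typename T>
    T div_ceil_sketch(const T& a, const T& b) { return (a + b - 1) / b; }

    int main()
    {
      assert( equal_strict_sketch(1.0f, 1.0f));
      assert(!equal_strict_sketch(1.0f, 1.0f + 1e-6f));
      assert( div_ceil_sketch(7, 3) == 3 );   // (7+3-1)/3 == 3
    }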
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 86b60f52f..e23a128d1 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -1,4 +1,8 @@
-#ifdef EIGEN_WARNINGS_DISABLED
+#ifdef EIGEN_WARNINGS_DISABLED_2
+// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet!
+# undef EIGEN_WARNINGS_DISABLED_2
+
+#elif defined(EIGEN_WARNINGS_DISABLED)
#undef EIGEN_WARNINGS_DISABLED
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
@@ -8,7 +12,7 @@
#pragma warning pop
#elif defined __clang__
#pragma clang diagnostic pop
- #elif defined __GNUC__ && __GNUC__>=6
+ #elif defined __GNUC__
#pragma GCC diagnostic pop
#endif
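The EIGEN_WARNINGS_DISABLED_2 token simply remembers one level of nested inclusion of the disable/reenable pair. A minimal stand-alone sketch of the same pattern, with illustrative macro names rather than the Eigen ones:

    /* disable_warnings.h (sketch) */
    #ifndef WARNINGS_DISABLED
    #  define WARNINGS_DISABLED
       /* push + disable pragmas here */
    #else
    #  define WARNINGS_DISABLED_2   /* nested inclusion: note the extra level */
    #endif

    /* reenable_warnings.h (sketch) */
    #ifdef WARNINGS_DISABLED_2
    #  undef WARNINGS_DISABLED_2    /* inner re-enable: keep warnings disabled */
    #elif defined(WARNINGS_DISABLED)
    #  undef WARNINGS_DISABLED
       /* pop pragmas here */
    #endif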
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 983361a45..500e47792 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -24,6 +24,7 @@
*
*/
+#ifndef EIGEN_STATIC_ASSERT
#ifndef EIGEN_NO_STATIC_ASSERT
#if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600))
@@ -44,64 +45,65 @@
struct static_assertion<true>
{
enum {
- YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX,
- YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES,
- YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES,
- THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE,
- THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE,
- THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE,
- OUT_OF_RANGE_ACCESS,
- YOU_MADE_A_PROGRAMMING_MISTAKE,
- EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT,
- EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE,
- YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR,
- YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR,
- UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC,
- THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES,
- FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED,
- NUMERIC_TYPE_MUST_BE_REAL,
- COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED,
- WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED,
- THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE,
- INVALID_MATRIX_PRODUCT,
- INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS,
- INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION,
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY,
- THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES,
- THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES,
- INVALID_MATRIX_TEMPLATE_PARAMETERS,
- INVALID_MATRIXBASE_TEMPLATE_PARAMETERS,
- BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER,
- THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX,
- THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE,
- THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES,
- YOU_ALREADY_SPECIFIED_THIS_STRIDE,
- INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION,
- THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD,
- PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1,
- THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS,
- YOU_CANNOT_MIX_ARRAYS_AND_MATRICES,
- YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION,
- THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY,
- YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT,
- THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS,
- THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS,
- THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL,
- THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES,
- YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED,
- YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED,
- THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE,
- THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
- OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
- IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
- STORAGE_LAYOUT_DOES_NOT_MATCH,
- EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE,
- THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS,
- MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY,
- THIS_TYPE_IS_NOT_SUPPORTED,
- STORAGE_KIND_MUST_MATCH,
- STORAGE_INDEX_MUST_MATCH,
- CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY
+ YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1,
+ YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1,
+ YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1,
+ THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1,
+ THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1,
+ THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1,
+ OUT_OF_RANGE_ACCESS=1,
+ YOU_MADE_A_PROGRAMMING_MISTAKE=1,
+ EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1,
+ EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1,
+ YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1,
+ YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1,
+ UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1,
+ THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1,
+ FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1,
+ NUMERIC_TYPE_MUST_BE_REAL=1,
+ COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1,
+ WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1,
+ THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1,
+ INVALID_MATRIX_PRODUCT=1,
+ INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1,
+ INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1,
+ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1,
+ THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1,
+ THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1,
+ INVALID_MATRIX_TEMPLATE_PARAMETERS=1,
+ INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1,
+ BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1,
+ THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1,
+ THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1,
+ THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1,
+ YOU_ALREADY_SPECIFIED_THIS_STRIDE=1,
+ INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1,
+ THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1,
+ PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1,
+ THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1,
+ YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1,
+ YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1,
+ THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1,
+ YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1,
+ THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1,
+ THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1,
+ THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1,
+ THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1,
+ YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1,
+ YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1,
+ THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1,
+ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1,
+ OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1,
+ IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1,
+ STORAGE_LAYOUT_DOES_NOT_MATCH=1,
+ EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1,
+ THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1,
+ MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1,
+ THIS_TYPE_IS_NOT_SUPPORTED=1,
+ STORAGE_KIND_MUST_MATCH=1,
+ STORAGE_INDEX_MUST_MATCH=1,
+ CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
+ SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1
};
};
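For context, these enumerators are only consumed through the EIGEN_STATIC_ASSERT machinery, where the enumerator name doubles as the error message shown by the compiler. A sketch of a typical call site, mirroring uses found elsewhere in Eigen (the helper name is made up for the example):

    // Reject mixed scalar types at compile time; the enumerator name is what
    // the user sees in the compiler error.
    template<typename Derived1, typename Derived2>
    void check_same_scalar_sketch()
    {
      EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Derived1::Scalar,
                                                    typename Derived2::Scalar>::value),
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
    }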
@@ -131,7 +133,7 @@
#define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG);
#endif // EIGEN_NO_STATIC_ASSERT
-
+#endif // EIGEN_STATIC_ASSERT
// static assertion failing if the type \a TYPE is not a vector type
#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
diff --git a/Eigen/src/Core/util/SymbolicIndex.h b/Eigen/src/Core/util/SymbolicIndex.h
index bb6349eb9..17cf46f05 100644
--- a/Eigen/src/Core/util/SymbolicIndex.h
+++ b/Eigen/src/Core/util/SymbolicIndex.h
@@ -12,7 +12,7 @@
namespace Eigen {
-/** \namespace Eigen::Symbolic
+/** \namespace Eigen::symbolic
* \ingroup Core_Module
*
* This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index.
@@ -20,9 +20,9 @@ namespace Eigen {
*
* \code
* // First step, defines symbols:
- * struct x_tag {}; static const Symbolic::SymbolExpr<x_tag> x;
- * struct y_tag {}; static const Symbolic::SymbolExpr<y_tag> y;
- * struct z_tag {}; static const Symbolic::SymbolExpr<z_tag> z;
+ * struct x_tag {}; static const symbolic::SymbolExpr<x_tag> x;
+ * struct y_tag {}; static const symbolic::SymbolExpr<y_tag> y;
+ * struct z_tag {}; static const symbolic::SymbolExpr<z_tag> z;
*
* // Defines an expression:
* auto expr = (x+3)/y+z;
@@ -35,10 +35,10 @@ namespace Eigen {
* std::cout << expr98.eval(x=6) << "\n";
* \endcode
*
- * It is currently only used internally to define and minipulate the placeholders::last and placeholders::end symbols in Eigen::seq and Eigen::seqN.
+ * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN.
*
*/
-namespace Symbolic {
+namespace symbolic {
template<typename Tag> class Symbol;
template<typename Arg0> class NegateExpr;
@@ -187,17 +187,10 @@ public:
template<typename T>
struct is_symbolic {
- // BaseExpr has no conversion ctor, so we only have to check whether T can be staticaly cast to its base class BaseExpr<T>.
+ // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr<T>.
enum { value = internal::is_convertible<T,BaseExpr<T> >::value };
};
-// Specialization for functions, because is_convertible fails in this case.
-// Useful in c++98/11 mode when testing is_symbolic<decltype(fix<N>)>
-template<typename T>
-struct is_symbolic<T (*)()> {
- enum { value = false };
-};
-
/** Represents the actual value of a symbol identified by its tag
*
* It is the return type of SymbolValue::operator=, and most of the time this is only way it is used.
@@ -293,7 +286,7 @@ protected:
Arg1 m_arg1;
};
-} // end namespace Symbolic
+} // end namespace symbolic
} // end namespace Eigen
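In user code this machinery is mostly reached through the predefined Eigen::last / Eigen::lastp1 symbols of the new indexing API in this branch; a short usage sketch:

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10, 0, 9);
      // last and lastp1 are symbolic expressions, resolved against v.size() at the call.
      std::cout << v(Eigen::seq(Eigen::last - 2, Eigen::last)).transpose() << "\n"; // 7 8 9
      std::cout << v(Eigen::seqN(0, 3)).transpose() << "\n";                        // 0 1 2
    }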
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 4b337f29f..836ff4711 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -34,6 +34,26 @@ inline IndexDest convert_index(const IndexSrc& idx) {
return IndexDest(idx);
}
+// true if T can be considered as an integral index (i.e., an integral type or enum)
+template<typename T> struct is_valid_index_type
+{
+ enum { value =
+#if EIGEN_HAS_TYPE_TRAITS
+ internal::is_integral<T>::value || std::is_enum<T>::value
+#elif EIGEN_COMP_MSVC
+ internal::is_integral<T>::value || __is_enum(T)
+#else
+ // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
+ internal::is_convertible<T,Index>::value
+#endif
+ };
+};
+
+// true if at least one of RowIndices/ColIndices is not a valid index type
+template<typename RowIndices, typename ColIndices>
+struct valid_indexed_view_overload {
+ enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
+};
// promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
// expression * scalar
@@ -385,7 +405,7 @@ template<typename T> struct plain_matrix_type_row_major
typedef Matrix<typename traits<T>::Scalar,
Rows,
Cols,
- (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor,
+ (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor,
MaxRows,
MaxCols
> type;
@@ -440,7 +460,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
{
enum {
ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
- CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory?
+ CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary?
// Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
// This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
// for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
@@ -656,17 +676,32 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> >
template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
{ enum { ret = true }; };
+
+template<typename T> struct is_identity
+{ enum { value = false }; };
+
+template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
+{ enum { value = true }; };
+
+
template<typename S1, typename S2> struct glue_shapes;
template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; };
template<typename T1, typename T2>
-bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+struct possibly_same_dense {
+ enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
+};
+
+template<typename T1, typename T2>
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
{
return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
}
template<typename T1, typename T2>
-bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
{
return false;
}
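A quick compile-time check of what the new trait accepts (a sketch only: the trait is internal, and the C++11 type-traits path is assumed here, since the C++98 fallback based on is_convertible is more permissive):

    #include <Eigen/Core>
    #include <cstddef>

    enum MyEnum { Two = 2 };

    static_assert(Eigen::internal::is_valid_index_type<int>::value, "integral types are valid indices");
    static_assert(Eigen::internal::is_valid_index_type<std::ptrdiff_t>::value, "so is ptrdiff_t");
    static_assert(Eigen::internal::is_valid_index_type<MyEnum>::value, "enums are treated as indices");
    static_assert(!Eigen::internal::is_valid_index_type<float>::value, "floating point types are not");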
diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index dc5fae06a..081e918f1 100644
--- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -214,7 +214,7 @@ template<typename _MatrixType> class ComplexEigenSolver
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h
index 7f38919f7..b8b3490c6 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -212,7 +212,7 @@ template<typename _MatrixType> class ComplexSchur
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h
index f205b185d..997bebe7b 100644
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@@ -277,7 +277,7 @@ template<typename _MatrixType> class EigenSolver
template<typename InputType>
EigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
- /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */
+ /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
ComputationInfo info() const
{
eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index 36a91dffc..87d789b3f 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -311,7 +311,6 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
// Aliases:
Map<VectorType> v(reinterpret_cast<Scalar*>(m_tmp.data()), size);
ComplexVectorType &cv = m_tmp;
- const MatrixType &mZ = m_realQZ.matrixZ();
const MatrixType &mS = m_realQZ.matrixS();
const MatrixType &mT = m_realQZ.matrixT();
@@ -351,7 +350,7 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
}
}
}
- m_eivec.col(i).real().noalias() = mZ.transpose() * v;
+ m_eivec.col(i).real().noalias() = m_realQZ.matrixZ().transpose() * v;
m_eivec.col(i).real().normalize();
m_eivec.col(i).imag().setConstant(0);
}
@@ -400,7 +399,7 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
/ (alpha*mT.coeffRef(j,j) - static_cast<Scalar>(beta*mS.coeffRef(j,j)));
}
}
- m_eivec.col(i+1).noalias() = (mZ.transpose() * cv);
+ m_eivec.col(i+1).noalias() = (m_realQZ.matrixZ().transpose() * cv);
m_eivec.col(i+1).normalize();
m_eivec.col(i) = m_eivec.col(i+1).conjugate();
}
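For reference, the compute() being touched here backs the usual user-level call; a minimal usage sketch:

    #include <Eigen/Eigenvalues>
    #include <iostream>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(4, 4);
      Eigen::GeneralizedEigenSolver<Eigen::MatrixXd> ges(A, B); // solves A v = lambda B v
      std::cout << "eigenvalues:  " << ges.eigenvalues().transpose() << "\n";
      std::cout << "alphas/betas: " << ges.alphas().transpose() << " / "
                << ges.betas().transpose() << "\n";
    }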
diff --git a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
index 5f6bb8289..d0f9091be 100644
--- a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
@@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT
*
* \returns Reference to \c *this
*
- * Accoring to \p options, this function computes eigenvalues and (if requested)
+ * According to \p options, this function computes eigenvalues and (if requested)
* the eigenvectors of one of the following three generalized eigenproblems:
* - \c Ax_lBx: \f$ Ax = \lambda B x \f$
* - \c ABx_lx: \f$ ABx = \lambda x \f$
diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index f647f69b0..d947dac4e 100644
--- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -315,7 +315,7 @@ void HessenbergDecomposition<MatrixType>::_compute(MatrixType& matA, CoeffVector
// A = A H'
matA.rightCols(remainingSize)
- .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0));
+ .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0));
}
}
diff --git a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
index 4fec8af0a..66e5a3dbb 100644
--- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -66,7 +66,6 @@ template<typename Derived>
inline typename MatrixBase<Derived>::EigenvaluesReturnType
MatrixBase<Derived>::eigenvalues() const
{
- typedef typename internal::traits<Derived>::Scalar Scalar;
return internal::eigenvalues_selector<Derived, NumTraits<Scalar>::IsComplex>::run(derived());
}
@@ -85,10 +84,9 @@ MatrixBase<Derived>::eigenvalues() const
* \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
*/
template<typename MatrixType, unsigned int UpLo>
-inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
{
- typedef typename SelfAdjointView<MatrixType, UpLo>::PlainObject PlainObject;
PlainObject thisAsMatrix(*this);
return SelfAdjointEigenSolver<PlainObject>(thisAsMatrix, false).eigenvalues();
}
@@ -149,7 +147,7 @@ MatrixBase<Derived>::operatorNorm() const
* \sa eigenvalues(), MatrixBase::operatorNorm()
*/
template<typename MatrixType, unsigned int UpLo>
-inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
{
return eigenvalues().cwiseAbs().maxCoeff();
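Both members touched above are thin wrappers around SelfAdjointEigenSolver; a usage sketch:

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::MatrixXd m = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd s = m + m.transpose();     // symmetric input
      std::cout << s.selfadjointView<Eigen::Lower>().eigenvalues().transpose() << "\n";
      std::cout << s.selfadjointView<Eigen::Lower>().operatorNorm() << "\n";
    }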
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index b3a910dd9..e2b37f40e 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -161,7 +161,7 @@ namespace Eigen {
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index f5c86041d..aca8a8279 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -190,7 +190,7 @@ template<typename _MatrixType> class RealSchur
RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU);
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
@@ -270,8 +270,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
// Step 1. Reduce to Hessenberg form
m_hess.compute(matrix.derived()/scale);
- // Step 2. Reduce to real Schur form
- computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+ // Step 2. Reduce to real Schur form
+ // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
+ // to be able to pass our working-space buffer for the Householder to Dense evaluation.
+ m_workspaceVector.resize(matrix.cols());
+ if(computeU)
+ m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
+ computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);
m_matT *= scale;
@@ -284,13 +289,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
using std::abs;
m_matT = matrixH;
- if(computeU)
+ m_workspaceVector.resize(m_matT.cols());
+ if(computeU && !internal::is_same_dense(m_matU,matrixQ))
m_matU = matrixQ;
Index maxIters = m_maxIters;
if (maxIters == -1)
maxIters = m_maxIterationsPerRow * matrixH.rows();
- m_workspaceVector.resize(m_matT.cols());
Scalar* workspace = &m_workspaceVector.coeffRef(0);
// The matrix m_matT is divided in three parts.
@@ -303,7 +308,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
Scalar exshift(0); // sum of exceptional shifts
Scalar norm = computeNormOfT();
- if(norm!=0)
+ if(norm!=Scalar(0))
{
while (iu >= 0)
{
@@ -327,7 +332,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
else // No convergence yet
{
// The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG )
- Vector3s firstHouseholderVector(0,0,0), shiftInfo;
+ Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo;
computeShift(iu, iter, exshift, shiftInfo);
iter = iter + 1;
totalIter = totalIter + 1;
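A short usage sketch of the public entry point whose internals are reorganized here:

    #include <Eigen/Eigenvalues>
    #include <iostream>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 5);
      Eigen::RealSchur<Eigen::MatrixXd> schur(A);      // A = U T U^T with T quasi-triangular
      if (schur.info() == Eigen::Success)
      {
        std::cout << "T =\n" << schur.matrixT() << "\n";
        std::cout << "U =\n" << schur.matrixU() << "\n";
      }
    }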
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 9ddd553f2..f95606206 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver;
namespace internal {
template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
+
template<typename MatrixType, typename DiagType, typename SubDiagType>
+EIGEN_DEVICE_FUNC
ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
}
@@ -119,6 +121,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
: m_eivec(),
m_eivalues(),
m_subdiag(),
+ m_info(InvalidInput),
m_isInitialized(false)
{ }
@@ -337,7 +340,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
EIGEN_DEVICE_FUNC
ComputationInfo info() const
@@ -354,7 +357,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
static const int m_maxIterations = 30;
protected:
- static void check_template_parameters()
+ static EIGEN_DEVICE_FUNC
+ void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
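Checking info() after compute() remains the supported way to detect convergence problems; a short host-side usage sketch:

    #include <Eigen/Eigenvalues>
    #include <iostream>

    int main()
    {
      Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 3);
      Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> es(m + m.transpose());
      if (es.info() == Eigen::Success)
        std::cout << es.eigenvalues().transpose() << "\n";
    }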
@@ -403,7 +407,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
const InputType &matrix(a_matrix.derived());
- using std::abs;
+ EIGEN_USING_STD_MATH(abs);
eigen_assert(matrix.cols() == matrix.rows());
eigen_assert((options&~(EigVecMask|GenEigMask))==0
&& (options&EigVecMask)!=EigVecMask
@@ -479,9 +483,10 @@ namespace internal {
* \returns \c Success or \c NoConvergence
*/
template<typename MatrixType, typename DiagType, typename SubDiagType>
+EIGEN_DEVICE_FUNC
ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
{
- using std::abs;
+ EIGEN_USING_STD_MATH(abs);
ComputationInfo info;
typedef typename MatrixType::Scalar Scalar;
@@ -535,7 +540,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
diag.segment(i,n-i).minCoeff(&k);
if (k > 0)
{
- std::swap(diag[i], diag[k+i]);
+ numext::swap(diag[i], diag[k+i]);
if(computeEigenvectors)
eivec.col(i).swap(eivec.col(k+i));
}
@@ -605,7 +610,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
EIGEN_DEVICE_FUNC
static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
{
- using std::abs;
+ EIGEN_USING_STD_MATH(abs);
+ EIGEN_USING_STD_MATH(sqrt);
Index i0;
// Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
mat.diagonal().cwiseAbs().maxCoeff(&i0);
@@ -616,8 +622,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
VectorType c0, c1;
n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
- if(n0>n1) res = c0/std::sqrt(n0);
- else res = c1/std::sqrt(n1);
+ if(n0>n1) res = c0/sqrt(n0);
+ else res = c1/sqrt(n1);
return true;
}
@@ -719,7 +725,7 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
EIGEN_DEVICE_FUNC
static inline void computeRoots(const MatrixType& m, VectorType& roots)
{
- using std::sqrt;
+ EIGEN_USING_STD_MATH(sqrt);
const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
roots(0) = t1 - t0;
@@ -807,7 +813,7 @@ template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
EIGEN_DEVICE_FUNC
static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
{
- using std::abs;
+ EIGEN_USING_STD_MATH(abs);
RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
RealScalar e = subdiag[end-1];
// Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
index 3891cf883..b0c947dc0 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
@@ -37,7 +37,7 @@ namespace Eigen {
/** \internal Specialization for the data types supported by LAPACKe */
-#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \
+#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW ) \
template<> template<typename InputType> inline \
SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, int options) \
@@ -47,7 +47,7 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
&& (options&EigVecMask)!=EigVecMask \
&& "invalid option parameter"); \
bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \
- lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), lda, matrix_order, info; \
+ lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), lda, info; \
m_eivalues.resize(n,1); \
m_subdiag.resize(n-1); \
m_eivec = matrix; \
@@ -63,27 +63,24 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
} \
\
lda = internal::convert_index<lapack_int>(m_eivec.outerStride()); \
- matrix_order=LAPACKE_COLROW; \
char jobz, uplo='L'/*, range='A'*/; \
jobz = computeEigenvectors ? 'V' : 'N'; \
\
- info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \
+ info = LAPACKE_##LAPACKE_NAME( LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \
m_info = (info==0) ? Success : NoConvergence; \
m_isInitialized = true; \
m_eigenvectorsOk = computeEigenvectors; \
return *this; \
}
+#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME ) \
+ EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor ) \
+ EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor )
-EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, ColMajor, LAPACK_COL_MAJOR)
-
-EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev)
+EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev)
} // end namespace Eigen
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index 1d102c17b..c5c1acf46 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -25,6 +25,7 @@ struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
};
template<typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
}
@@ -344,6 +345,7 @@ namespace internal {
* \sa Tridiagonalization::packedMatrix()
*/
template<typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
{
using numext::conj;
@@ -424,6 +426,7 @@ struct tridiagonalization_inplace_selector;
* \sa class Tridiagonalization
*/
template<typename MatrixType, typename DiagonalType, typename SubDiagonalType>
+EIGEN_DEVICE_FUNC
void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
{
eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1);
@@ -439,7 +442,8 @@ struct tridiagonalization_inplace_selector
typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
template<typename DiagonalType, typename SubDiagonalType>
- static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+ static EIGEN_DEVICE_FUNC
+ void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
{
CoeffVectorType hCoeffs(mat.cols()-1);
tridiagonalization_inplace(mat,hCoeffs);
@@ -508,7 +512,8 @@ struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
typedef typename MatrixType::Scalar Scalar;
template<typename DiagonalType, typename SubDiagonalType>
- static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
+ static EIGEN_DEVICE_FUNC
+ void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
{
diag(0,0) = numext::real(mat(0,0));
if(extractQ)
diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h
index 0af3c1b08..83ee1be46 100644
--- a/Eigen/src/Geometry/AngleAxis.h
+++ b/Eigen/src/Geometry/AngleAxis.h
@@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const Quaterni
if (n != Scalar(0))
{
m_angle = Scalar(2)*atan2(n, abs(q.w()));
- if(q.w() < 0)
+ if(q.w() < Scalar(0))
n = -n;
m_axis = q.vec() / n;
}
diff --git a/Eigen/src/Geometry/Hyperplane.h b/Eigen/src/Geometry/Hyperplane.h
index 05929b299..cebe03557 100644
--- a/Eigen/src/Geometry/Hyperplane.h
+++ b/Eigen/src/Geometry/Hyperplane.h
@@ -119,7 +119,7 @@ public:
* If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
* so an arbitrary choice is made.
*/
- // FIXME to be consitent with the rest this could be implemented as a static Through function ??
+ // FIXME to be consistent with the rest this could be implemented as a static Through function ??
EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
{
normal() = parametrized.direction().unitOrthogonal();
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index f6ef1bcf6..faea62f17 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase<Derived, 3>
typedef typename internal::traits<Derived>::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename internal::traits<Derived>::Coefficients Coefficients;
+ typedef typename Coefficients::CoeffReturnType CoeffReturnType;
+ typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
+ Scalar&, CoeffReturnType>::type NonConstCoeffReturnType;
+
+
enum {
Flags = Eigen::internal::traits<Derived>::Flags
};
@@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase<Derived, 3>
/** \returns the \c x coefficient */
- EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); }
+ EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
/** \returns the \c y coefficient */
- EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); }
+ EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
/** \returns the \c z coefficient */
- EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); }
+ EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
/** \returns the \c w coefficient */
- EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); }
+ EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
- /** \returns a reference to the \c x coefficient */
- EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
- /** \returns a reference to the \c y coefficient */
- EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
- /** \returns a reference to the \c z coefficient */
- EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
- /** \returns a reference to the \c w coefficient */
- EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
+ /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
+ EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
+ /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
+ EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
+ /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
+ EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
+ /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
+ EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
/** \returns a read-only vector expression of the imaginary part (x,y,z) */
EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
@@ -271,6 +276,27 @@ public:
EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
{ m_coeffs = other.coeffs().template cast<Scalar>(); }
+#if EIGEN_HAS_RVALUE_REFERENCES
+ // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
+ /** Default move constructor */
+ EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+ : m_coeffs(std::move(other.coeffs()))
+ {}
+
+ /** Default move assignment operator */
+ EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
+ {
+ m_coeffs = std::move(other.coeffs());
+ return *this;
+ }
+
+ // And now because we declared a constructor, we don't get an implicit copy constructor. Say we want one.
+ /** Default copy constructor */
+ EIGEN_DEVICE_FUNC Quaternion(const Quaternion& other)
+ : m_coeffs(other.coeffs())
+ {}
+#endif
+
EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
template<typename Derived1, typename Derived2>
@@ -423,7 +449,7 @@ typedef Map<Quaternion<double>, Aligned> QuaternionMapAlignedd;
// Generic Quaternion * Quaternion product
// This product can be specialized for a given architecture via the Arch template argument.
namespace internal {
-template<int Arch, class Derived1, class Derived2, typename Scalar, int _Options> struct quat_product
+template<int Arch, class Derived1, class Derived2, typename Scalar> struct quat_product
{
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a, const QuaternionBase<Derived2>& b){
return Quaternion<Scalar>
@@ -446,8 +472,7 @@ QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) c
EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
return internal::quat_product<Architecture::Target, Derived, OtherDerived,
- typename internal::traits<Derived>::Scalar,
- EIGEN_PLAIN_ENUM_MIN(internal::traits<Derived>::Alignment, internal::traits<OtherDerived>::Alignment)>::run(*this, other);
+ typename internal::traits<Derived>::Scalar>::run(*this, other);
}
/** \sa operator*(Quaternion) */
@@ -624,7 +649,7 @@ EIGEN_DEVICE_FUNC Quaternion<Scalar,Options> Quaternion<Scalar,Options>::UnitRan
const Scalar u1 = internal::random<Scalar>(0, 1),
u2 = internal::random<Scalar>(0, 2*EIGEN_PI),
u3 = internal::random<Scalar>(0, 2*EIGEN_PI);
- const Scalar a = sqrt(1 - u1),
+ const Scalar a = sqrt(Scalar(1) - u1),
b = sqrt(u1);
return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
}
@@ -672,7 +697,7 @@ EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar>
// Generic conjugate of a Quaternion
namespace internal {
-template<int Arch, class Derived, typename Scalar, int _Options> struct quat_conj
+template<int Arch, class Derived, typename Scalar> struct quat_conj
{
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived>& q){
return Quaternion<Scalar>(q.w(),-q.x(),-q.y(),-q.z());
@@ -691,8 +716,7 @@ EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar>
QuaternionBase<Derived>::conjugate() const
{
return internal::quat_conj<Architecture::Target, Derived,
- typename internal::traits<Derived>::Scalar,
- internal::traits<Derived>::Alignment>::run(*this);
+ typename internal::traits<Derived>::Scalar>::run(*this);
}
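A short sketch of what the new coefficient return types and move operations enable, assuming a C++11 build of this branch: reading coefficients through a Map over const data, and moving a quaternion instead of copying it.

    #include <Eigen/Geometry>
    #include <iostream>
    #include <utility>

    int main()
    {
      Eigen::Quaternionf a = Eigen::Quaternionf::UnitRandom();
      Eigen::Quaternionf b = std::move(a);                    // new move constructor

      const float data[4] = {0.f, 0.f, 0.f, 1.f};             // x, y, z, w storage order
      Eigen::Map<const Eigen::Quaternionf> q(data);
      std::cout << q.w() << "\n";                             // returns by value on a const map
      std::cout << (b * q).coeffs().transpose() << "\n";
    }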
diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h
index f58ca03d9..df650fda6 100755
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h
@@ -29,6 +29,22 @@ namespace Eigen {
*
* \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform
*/
+
+namespace internal
+{
+ // This helper helps nvcc+MSVC to properly parse this file.
+ // See bug 1412.
+ template <typename Scalar, int Dim, int Mode>
+ struct uniformscaling_times_affine_returntype
+ {
+ enum
+ {
+ NewMode = int(Mode) == int(Isometry) ? Affine : Mode
+ };
+ typedef Transform <Scalar, Dim, NewMode> type;
+ };
+}
+
template<typename _Scalar>
class UniformScaling
{
@@ -60,9 +76,11 @@ public:
/** Concatenates a uniform scaling and an affine transformation */
template<int Dim, int Mode, int Options>
- inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const
+ inline typename
+ internal::uniformscaling_times_affine_returntype<Scalar,Dim,Mode>::type
+ operator* (const Transform<Scalar, Dim, Mode, Options>& t) const
{
- Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
+ typename internal::uniformscaling_times_affine_returntype<Scalar,Dim,Mode>::type res = t;
res.prescale(factor());
return res;
}
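The helper only changes how the return type is spelled so that nvcc+MSVC can parse it; the behaviour is unchanged, as in this usage sketch:

    #include <Eigen/Geometry>
    #include <iostream>

    int main()
    {
      Eigen::Isometry3d t = Eigen::Isometry3d::Identity();
      t.translate(Eigen::Vector3d(1, 2, 3));
      // An isometry composed with a uniform scaling is only affine, which is
      // exactly what uniformscaling_times_affine_returntype encodes.
      Eigen::Affine3d a = Eigen::Scaling(2.0) * t;
      std::cout << a.matrix() << "\n";
    }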
@@ -70,7 +88,7 @@ public:
/** Concatenates a uniform scaling and a linear transformation matrix */
// TODO returns an expression
template<typename Derived>
- inline typename internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
+ inline typename Eigen::internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
{ return other * m_factor; }
template<typename Derived,int Dim>
@@ -110,7 +128,7 @@ public:
/** Concatenates a linear transformation matrix and a uniform scaling
* \relates UniformScaling
*/
-// NOTE this operator is defiend in MatrixBase and not as a friend function
+// NOTE this operator is defined in MatrixBase and not as a friend function
// of UniformScaling to fix an internal crash of Intel's ICC
template<typename Derived,typename Scalar>
EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product)
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 2d36dfadf..75991aaed 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -115,7 +115,7 @@ template<int Mode> struct transform_make_affine;
* \end{array} \right) \f$
*
* Note that for a projective transformation the last row can be anything,
- * and then the interpretation of different parts might be sightly different.
+ * and then the interpretation of different parts might be slightly different.
*
* However, unlike a plain matrix, the Transform class provides many features
* simplifying both its assembly and usage. In particular, it can be composed
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index 51d9a82eb..23b19f74f 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -70,18 +70,18 @@ public:
/** Constructs and initialize the translation transformation from a vector of translation coefficients */
EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
- /** \brief Retruns the x-translation by value. **/
+ /** \brief Returns the x-translation by value. **/
EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); }
- /** \brief Retruns the y-translation by value. **/
+ /** \brief Returns the y-translation by value. **/
EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); }
- /** \brief Retruns the z-translation by value. **/
+ /** \brief Returns the z-translation by value. **/
EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); }
- /** \brief Retruns the x-translation as a reference. **/
+ /** \brief Returns the x-translation as a reference. **/
EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); }
- /** \brief Retruns the y-translation as a reference. **/
+ /** \brief Returns the y-translation as a reference. **/
EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); }
- /** \brief Retruns the z-translation as a reference. **/
+ /** \brief Returns the z-translation as a reference. **/
EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); }
EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index 1a86ff837..d4346aa1c 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -16,34 +16,43 @@ namespace Eigen {
namespace internal {
template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned16>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
{
+ enum {
+ AAlignment = traits<Derived>::Alignment,
+ BAlignment = traits<OtherDerived>::Alignment,
+ ResAlignment = traits<Quaternion<float> >::Alignment
+ };
static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
{
Quaternion<float> res;
- const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
- __m128 a = _a.coeffs().template packet<Aligned16>(0);
- __m128 b = _b.coeffs().template packet<Aligned16>(0);
- __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
- __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
- pstore(&res.x(),
- _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
- _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
+ const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+ Packet4f a = _a.coeffs().template packet<AAlignment>(0);
+ Packet4f b = _b.coeffs().template packet<BAlignment>(0);
+ Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+ Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
+ pstoret<float,Packet4f,ResAlignment>(
+ &res.x(),
+ padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)),
+ pmul(vec4f_swizzle1(a,2,0,1,0),
vec4f_swizzle1(b,1,2,0,0))),
- _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
+ pxor(mask,padd(s1,s2))));
return res;
}
};
-template<class Derived, int Alignment>
-struct quat_conj<Architecture::SSE, Derived, float, Alignment>
+template<class Derived>
+struct quat_conj<Architecture::SSE, Derived, float>
{
+ enum {
+ ResAlignment = traits<Quaternion<float> >::Alignment
+ };
static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
{
Quaternion<float> res;
const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
- pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<Alignment>(0)));
+ pstoret<float,Packet4f,ResAlignment>(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<traits<Derived>::Alignment>(0)));
return res;
}
};
@@ -52,6 +61,9 @@ struct quat_conj<Architecture::SSE, Derived, float, Alignment>
template<typename VectorLhs,typename VectorRhs>
struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
{
+ enum {
+ ResAlignment = traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
+ };
static inline typename plain_matrix_type<VectorLhs>::type
run(const VectorLhs& lhs, const VectorRhs& rhs)
{
@@ -60,7 +72,7 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
__m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
__m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
typename plain_matrix_type<VectorLhs>::type res;
- pstore(&res.x(),_mm_sub_ps(mul1,mul2));
+ pstoret<float,Packet4f,ResAlignment>(&res.x(),_mm_sub_ps(mul1,mul2));
return res;
}
};
@@ -68,9 +80,14 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
-template<class Derived, class OtherDerived, int Alignment>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
+template<class Derived, class OtherDerived>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, double>
{
+ enum {
+ BAlignment = traits<OtherDerived>::Alignment,
+ ResAlignment = traits<Quaternion<double> >::Alignment
+ };
+
static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
{
const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
@@ -78,8 +95,8 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
Quaternion<double> res;
const double* a = _a.coeffs().data();
- Packet2d b_xy = _b.coeffs().template packet<Alignment>(0);
- Packet2d b_zw = _b.coeffs().template packet<Alignment>(2);
+ Packet2d b_xy = _b.coeffs().template packet<BAlignment>(0);
+ Packet2d b_zw = _b.coeffs().template packet<BAlignment>(2);
Packet2d a_xx = pset1<Packet2d>(a[0]);
Packet2d a_yy = pset1<Packet2d>(a[1]);
Packet2d a_zz = pset1<Packet2d>(a[2]);
@@ -97,9 +114,9 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
#ifdef EIGEN_VECTORIZE_SSE3
EIGEN_UNUSED_VARIABLE(mask)
- pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
+ pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
#else
- pstore(&res.x(), padd(t1, pxor(mask,preverse(t2))));
+ pstoret<double,Packet2d,ResAlignment>(&res.x(), padd(t1, pxor(mask,preverse(t2))));
#endif
/*
@@ -111,25 +128,28 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
#ifdef EIGEN_VECTORIZE_SSE3
EIGEN_UNUSED_VARIABLE(mask)
- pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
+ pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
#else
- pstore(&res.z(), psub(t1, pxor(mask,preverse(t2))));
+ pstoret<double,Packet2d,ResAlignment>(&res.z(), psub(t1, pxor(mask,preverse(t2))));
#endif
return res;
}
};
-template<class Derived, int Alignment>
-struct quat_conj<Architecture::SSE, Derived, double, Alignment>
+template<class Derived>
+struct quat_conj<Architecture::SSE, Derived, double>
{
+ enum {
+ ResAlignment = traits<Quaternion<double> >::Alignment
+ };
static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
{
Quaternion<double> res;
const __m128d mask0 = _mm_setr_pd(-0.,-0.);
const __m128d mask2 = _mm_setr_pd(-0.,0.);
- pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<Alignment>(0)));
- pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<Alignment>(2)));
+ pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<traits<Derived>::Alignment>(0)));
+ pstoret<double,Packet2d,ResAlignment>(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<traits<Derived>::Alignment>(2)));
return res;
}
};
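The per-operand alignment enums matter mainly when a quaternion is mapped over external, possibly unaligned memory; a usage sketch of the kind of case this generalization is meant to cover:

    #include <Eigen/Geometry>
    #include <iostream>

    int main()
    {
      float buf[5] = {0.f, 0.f, 0.f, 0.f, 1.f};
      // Mapping at an offset: the coefficients are generally not 16-byte aligned,
      // so the product must load them with their actual (unaligned) alignment.
      Eigen::Map<const Eigen::Quaternionf, Eigen::Unaligned> a(buf + 1);
      Eigen::Quaternionf b = Eigen::Quaternionf::Identity();
      std::cout << (a * b).coeffs().transpose() << "\n";
    }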
diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index 01a7ed188..39ce1c2a0 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h
@@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c
triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
* vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
- // FIXME add .noalias() once the triangular product can work inplace
- triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+ // FIXME use the following line with .noalias() once the triangular product can work inplace
+ // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+ for(Index j=nbVecs-1; j>i; --j)
+ {
+ typename TriangularFactorType::Scalar z = triFactor(i,j);
+ triFactor(i,j) = z * triFactor(j,j);
+ if(nbVecs-j-1>0)
+ triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1);
+ }
}
triFactor(i,i) = hCoeffs(i);
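The replacement loop applies the upper-triangular factor in place, reading each original coefficient before it is overwritten; in generic form (a sketch, not Eigen code) it amounts to:

    // In-place x <- x * U for an upper-triangular U, processing columns right to
    // left so that every original x(j) is consumed before it is overwritten.
    template<typename Vec, typename Mat>
    void inplace_row_times_upper_sketch(Vec& x, const Mat& U)
    {
      const Eigen::Index n = x.size();
      for (Eigen::Index j = n - 1; j >= 0; --j)
      {
        const typename Vec::Scalar z = x(j);
        x(j) = z * U(j, j);
        for (Eigen::Index k = j + 1; k < n; ++k)
          x(k) += z * U(j, k);
      }
    }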
diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 80de2c305..5bc037f00 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -39,6 +39,7 @@ template<int n> struct decrement_size
* MatrixBase::applyHouseholderOnTheRight()
*/
template<typename Derived>
+EIGEN_DEVICE_FUNC
void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
{
VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1);
@@ -62,6 +63,7 @@ void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
*/
template<typename Derived>
template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
void MatrixBase<Derived>::makeHouseholder(
EssentialPart& essential,
Scalar& tau,
@@ -103,13 +105,14 @@ void MatrixBase<Derived>::makeHouseholder(
* \param essential the essential part of the vector \c v
* \param tau the scaling factor of the Householder transformation
* \param workspace a pointer to working space with at least
- * this->cols() * essential.size() entries
+ * this->cols() entries
*
* \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(),
* MatrixBase::applyHouseholderOnTheRight()
*/
template<typename Derived>
template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
void MatrixBase<Derived>::applyHouseholderOnTheLeft(
const EssentialPart& essential,
const Scalar& tau,
@@ -140,13 +143,14 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
* \param essential the essential part of the vector \c v
* \param tau the scaling factor of the Householder transformation
* \param workspace a pointer to working space with at least
- * this->cols() * essential.size() entries
+ * this->rows() entries
*
* \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(),
* MatrixBase::applyHouseholderOnTheLeft()
*/
template<typename Derived>
template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
void MatrixBase<Derived>::applyHouseholderOnTheRight(
const EssentialPart& essential,
const Scalar& tau,
@@ -160,10 +164,10 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
{
Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
- tmp.noalias() = right * essential.conjugate();
+ tmp.noalias() = right * essential;
tmp += this->col(0);
this->col(0) -= tau * tmp;
- right.noalias() -= tau * tmp * essential.transpose();
+ right.noalias() -= tau * tmp * essential.adjoint();
}
}
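
For reference, the corrected update applies H = I - tau * w * w^* (with w the full Householder vector) on the right as M <- M - tau * (M w) w^*. A small sketch with w formed explicitly as a dense vector, which the library avoids by splitting w into a leading one and the essential part (illustrative only):

#include <Eigen/Dense>

// Apply the Householder reflection H = I - tau * w * w^H on the right of M,
// i.e. M <- M * H, without ever forming H.
template <typename Scalar>
void applyHouseholderOnRight(Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>& M,
                             const Eigen::Matrix<Scalar, Eigen::Dynamic, 1>& w,
                             const Scalar& tau)
{
  // One matrix-vector product ...
  const Eigen::Matrix<Scalar, Eigen::Dynamic, 1> tmp = M * w;
  // ... followed by a rank-1 update.
  M.noalias() -= tau * tmp * w.adjoint();
}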
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index 3ce0a693d..e62befcb6 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -87,7 +87,7 @@ struct hseq_side_dependent_impl
{
typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
- static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
+ static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
{
Index start = k+1+h.m_shift;
return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1);
@@ -140,6 +140,22 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
Side
> ConjugateReturnType;
+ typedef HouseholderSequence<
+ VectorsType,
+ typename internal::conditional<NumTraits<Scalar>::IsComplex,
+ typename internal::remove_all<typename CoeffsType::ConjugateReturnType>::type,
+ CoeffsType>::type,
+ Side
+ > AdjointReturnType;
+
+ typedef HouseholderSequence<
+ typename internal::conditional<NumTraits<Scalar>::IsComplex,
+ typename internal::remove_all<typename VectorsType::ConjugateReturnType>::type,
+ VectorsType>::type,
+ CoeffsType,
+ Side
+ > TransposeReturnType;
+
/** \brief Constructor.
* \param[in] v %Matrix containing the essential parts of the Householder vectors
* \param[in] h Vector containing the Householder coefficients
@@ -157,17 +173,19 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
*
* \sa setLength(), setShift()
*/
+ EIGEN_DEVICE_FUNC
HouseholderSequence(const VectorsType& v, const CoeffsType& h)
- : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()),
+ : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()),
m_shift(0)
{
}
/** \brief Copy constructor. */
+ EIGEN_DEVICE_FUNC
HouseholderSequence(const HouseholderSequence& other)
: m_vectors(other.m_vectors),
m_coeffs(other.m_coeffs),
- m_trans(other.m_trans),
+ m_reverse(other.m_reverse),
m_length(other.m_length),
m_shift(other.m_shift)
{
@@ -177,12 +195,14 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
* \returns Number of rows
* \details This equals the dimension of the space that the transformation acts on.
*/
+ EIGEN_DEVICE_FUNC
Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
/** \brief Number of columns of transformation viewed as a matrix.
* \returns Number of columns
* \details This equals the dimension of the space that the transformation acts on.
*/
+ EIGEN_DEVICE_FUNC
Index cols() const { return rows(); }
/** \brief Essential part of a Householder vector.
@@ -199,6 +219,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
*
* \sa setShift(), shift()
*/
+ EIGEN_DEVICE_FUNC
const EssentialVectorType essentialVector(Index k) const
{
eigen_assert(k >= 0 && k < m_length);
@@ -206,31 +227,39 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
}
/** \brief %Transpose of the Householder sequence. */
- HouseholderSequence transpose() const
+ TransposeReturnType transpose() const
{
- return HouseholderSequence(*this).setTrans(!m_trans);
+ return TransposeReturnType(m_vectors.conjugate(), m_coeffs)
+ .setReverseFlag(!m_reverse)
+ .setLength(m_length)
+ .setShift(m_shift);
}
/** \brief Complex conjugate of the Householder sequence. */
ConjugateReturnType conjugate() const
{
return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate())
- .setTrans(m_trans)
+ .setReverseFlag(m_reverse)
.setLength(m_length)
.setShift(m_shift);
}
/** \brief Adjoint (conjugate transpose) of the Householder sequence. */
- ConjugateReturnType adjoint() const
+ AdjointReturnType adjoint() const
{
- return conjugate().setTrans(!m_trans);
+ return AdjointReturnType(m_vectors, m_coeffs.conjugate())
+ .setReverseFlag(!m_reverse)
+ .setLength(m_length)
+ .setShift(m_shift);
}
/** \brief Inverse of the Householder sequence (equals the adjoint). */
- ConjugateReturnType inverse() const { return adjoint(); }
+ AdjointReturnType inverse() const { return adjoint(); }
/** \internal */
- template<typename DestType> inline void evalTo(DestType& dst) const
+ template<typename DestType>
+ inline EIGEN_DEVICE_FUNC
+ void evalTo(DestType& dst) const
{
Matrix<Scalar, DestType::RowsAtCompileTime, 1,
AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows());
@@ -239,6 +268,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
/** \internal */
template<typename Dest, typename Workspace>
+ EIGEN_DEVICE_FUNC
void evalTo(Dest& dst, Workspace& workspace) const
{
workspace.resize(rows());
@@ -251,7 +281,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
for(Index k = vecs-1; k >= 0; --k)
{
Index cornerSize = rows() - k - m_shift;
- if(m_trans)
+ if(m_reverse)
dst.bottomRightCorner(cornerSize, cornerSize)
.applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
else
@@ -265,18 +295,26 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
for(Index k = 0; k<cols()-vecs ; ++k)
dst.col(k).tail(rows()-k-1).setZero();
}
+ else if(m_length>BlockSize)
+ {
+ dst.setIdentity(rows(), rows());
+ if(m_reverse)
+ applyThisOnTheLeft(dst,workspace,true);
+ else
+ applyThisOnTheLeft(dst,workspace,true);
+ }
else
{
dst.setIdentity(rows(), rows());
for(Index k = vecs-1; k >= 0; --k)
{
Index cornerSize = rows() - k - m_shift;
- if(m_trans)
+ if(m_reverse)
dst.bottomRightCorner(cornerSize, cornerSize)
- .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+ .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
else
dst.bottomRightCorner(cornerSize, cornerSize)
- .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+ .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
}
}
}
@@ -295,31 +333,34 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
workspace.resize(dst.rows());
for(Index k = 0; k < m_length; ++k)
{
- Index actual_k = m_trans ? m_length-k-1 : k;
+ Index actual_k = m_reverse ? m_length-k-1 : k;
dst.rightCols(rows()-m_shift-actual_k)
.applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
}
}
/** \internal */
- template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+ template<typename Dest> inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const
{
Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace;
- applyThisOnTheLeft(dst, workspace);
+ applyThisOnTheLeft(dst, workspace, inputIsIdentity);
}
/** \internal */
template<typename Dest, typename Workspace>
- inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
+ inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const
{
- const Index BlockSize = 48;
+ if(inputIsIdentity && m_reverse)
+ inputIsIdentity = false;
// if the entries are large enough, then apply the reflectors by block
if(m_length>=BlockSize && dst.cols()>1)
{
- for(Index i = 0; i < m_length; i+=BlockSize)
+ // Make sure we have at least 2 useful blocks, otherwise it is pointless:
+ Index blockSize = m_length<Index(2*BlockSize) ? (m_length+1)/2 : Index(BlockSize);
+ for(Index i = 0; i < m_length; i+=blockSize)
{
- Index end = m_trans ? (std::min)(m_length,i+BlockSize) : m_length-i;
- Index k = m_trans ? i : (std::max)(Index(0),end-BlockSize);
+ Index end = m_reverse ? (std::min)(m_length,i+blockSize) : m_length-i;
+ Index k = m_reverse ? i : (std::max)(Index(0),end-blockSize);
Index bs = end-k;
Index start = k + m_shift;
@@ -329,8 +370,15 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
Side==OnTheRight ? bs : m_vectors.rows()-start,
Side==OnTheRight ? m_vectors.cols()-start : bs);
typename internal::conditional<Side==OnTheRight, Transpose<SubVectorsType>, SubVectorsType&>::type sub_vecs(sub_vecs1);
- Block<Dest,Dynamic,Dynamic> sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols());
- apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans);
+
+ Index dstStart = dst.rows()-rows()+m_shift+k;
+ Index dstRows = rows()-m_shift-k;
+ Block<Dest,Dynamic,Dynamic> sub_dst(dst,
+ dstStart,
+ inputIsIdentity ? dstStart : 0,
+ dstRows,
+ inputIsIdentity ? dstRows : dst.cols());
+ apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse);
}
}
else
@@ -338,8 +386,9 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
workspace.resize(dst.cols());
for(Index k = 0; k < m_length; ++k)
{
- Index actual_k = m_trans ? k : m_length-k-1;
- dst.bottomRows(rows()-m_shift-actual_k)
+ Index actual_k = m_reverse ? k : m_length-k-1;
+ Index dstStart = rows()-m_shift-actual_k;
+ dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols())
.applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
}
}
@@ -357,7 +406,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
{
typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type
res(other.template cast<typename internal::matrix_type_times_scalar_type<Scalar,OtherDerived>::ResultScalar>());
- applyThisOnTheLeft(res);
+ applyThisOnTheLeft(res, internal::is_identity<OtherDerived>::value && res.rows()==res.cols());
return res;
}
@@ -372,6 +421,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
*
* \sa length()
*/
+ EIGEN_DEVICE_FUNC
HouseholderSequence& setLength(Index length)
{
m_length = length;
@@ -389,13 +439,17 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
*
* \sa shift()
*/
+ EIGEN_DEVICE_FUNC
HouseholderSequence& setShift(Index shift)
{
m_shift = shift;
return *this;
}
+ EIGEN_DEVICE_FUNC
Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */
+
+ EIGEN_DEVICE_FUNC
Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */
/* Necessary for .adjoint() and .conjugate() */
@@ -403,27 +457,30 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
protected:
- /** \brief Sets the transpose flag.
- * \param [in] trans New value of the transpose flag.
+ /** \internal
+ * \brief Sets the reverse flag.
+ * \param [in] reverse New value of the reverse flag.
*
- * By default, the transpose flag is not set. If the transpose flag is set, then this object represents
- * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+ * By default, the reverse flag is not set. If the reverse flag is set, then this object represents
+ * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+ * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$.
*
- * \sa trans()
+ * \sa reverseFlag(), transpose(), adjoint()
*/
- HouseholderSequence& setTrans(bool trans)
+ HouseholderSequence& setReverseFlag(bool reverse)
{
- m_trans = trans;
+ m_reverse = reverse;
return *this;
}
- bool trans() const { return m_trans; } /**< \brief Returns the transpose flag. */
+ bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. */
typename VectorsType::Nested m_vectors;
typename CoeffsType::Nested m_coeffs;
- bool m_trans;
+ bool m_reverse;
Index m_length;
Index m_shift;
+ enum { BlockSize = 48 };
};
/** \brief Computes the product of a matrix with a Householder sequence.
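
With the two distinct return types introduced above, transpose() conjugates the stored vectors, adjoint() conjugates the coefficients, and both flip the reverse flag so the reflectors are applied in the opposite order. A minimal usage sketch built on HouseholderQR, whose householderQ() returns such a sequence (illustrative only):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(5, 3);
  Eigen::HouseholderQR<Eigen::MatrixXcd> qr(A);

  // adjoint() applies the reflectors in reverse order with conjugated
  // coefficients, i.e. it acts as Q^H.
  Eigen::VectorXcd b = Eigen::VectorXcd::Random(5);
  Eigen::VectorXcd c = qr.householderQ().adjoint() * b;   // c = Q^H * b

  // Sanity check: Q * (Q^H * b) reproduces b up to rounding.
  std::cout << (qr.householderQ() * c - b).norm() << std::endl;
  return 0;
}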
diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 358444aff..f66c846ef 100644
--- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -152,13 +152,28 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar>
{
// Compute the inverse squared-norm of each column of mat
m_invdiag.resize(mat.cols());
- for(Index j=0; j<mat.outerSize(); ++j)
+ if(MatType::IsRowMajor)
{
- RealScalar sum = mat.innerVector(j).squaredNorm();
- if(sum>0)
- m_invdiag(j) = RealScalar(1)/sum;
- else
- m_invdiag(j) = RealScalar(1);
+ m_invdiag.setZero();
+ for(Index j=0; j<mat.outerSize(); ++j)
+ {
+ for(typename MatType::InnerIterator it(mat,j); it; ++it)
+ m_invdiag(it.index()) += numext::abs2(it.value());
+ }
+ for(Index j=0; j<mat.cols(); ++j)
+ if(numext::real(m_invdiag(j))>RealScalar(0))
+ m_invdiag(j) = RealScalar(1)/numext::real(m_invdiag(j));
+ }
+ else
+ {
+ for(Index j=0; j<mat.outerSize(); ++j)
+ {
+ RealScalar sum = mat.col(j).squaredNorm();
+ if(sum>RealScalar(0))
+ m_invdiag(j) = RealScalar(1)/sum;
+ else
+ m_invdiag(j) = RealScalar(1);
+ }
}
Base::m_isInitialized = true;
return *this;
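
Since column access is expensive on a row-major sparse matrix, the new branch above accumulates |a_ij|^2 per column index in a single pass over the stored entries. The same accumulation as a standalone helper, assuming double scalars (illustrative only):

#include <Eigen/SparseCore>

// Squared column norms of a row-major sparse matrix, one pass over the
// stored coefficients instead of one col(j).squaredNorm() per column.
Eigen::VectorXd squaredColumnNorms(const Eigen::SparseMatrix<double, Eigen::RowMajor>& mat)
{
  Eigen::VectorXd norms = Eigen::VectorXd::Zero(mat.cols());
  for (Eigen::Index i = 0; i < mat.outerSize(); ++i)   // outer direction = rows
    for (Eigen::SparseMatrix<double, Eigen::RowMajor>::InnerIterator it(mat, i); it; ++it)
      norms(it.col()) += it.value() * it.value();      // accumulate |a_ij|^2 into column j
  return norms;
}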
diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index 395daa8e4..f7ce47134 100644
--- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -50,7 +50,8 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
tol_error = 0;
return;
}
- RealScalar threshold = tol*tol*rhsNorm2;
+ const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+ RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero);
RealScalar residualNorm2 = residual.squaredNorm();
if (residualNorm2 < threshold)
{
@@ -58,7 +59,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
tol_error = sqrt(residualNorm2 / rhsNorm2);
return;
}
-
+
VectorType p(n);
p = precond.solve(residual); // initial search direction
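
Clamping the threshold with the smallest positive normalized value keeps the convergence test meaningful when tol*tol*rhsNorm2 underflows for a tiny right-hand side. A self-contained sketch of the clamp, with hypothetical values standing in for tol and rhsNorm2:

#include <algorithm>
#include <limits>

int main()
{
  // Hypothetical values standing in for the solver state.
  const double tol = 1e-10;       // user tolerance
  const double rhsNorm2 = 1e-300; // squared norm of a tiny right-hand side

  // tol*tol*rhsNorm2 is about 1e-320, i.e. denormal; without the clamp the
  // test residualNorm2 < threshold could essentially never succeed.
  const double considerAsZero = (std::numeric_limits<double>::min)();
  const double threshold = (std::max)(tol * tol * rhsNorm2, considerAsZero);
  return threshold >= considerAsZero ? 0 : 1;
}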
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 338e6f10a..43bd8e8f6 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -136,7 +136,7 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageInd
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix.appears to be negative.
*/
ComputationInfo info() const
@@ -230,7 +230,7 @@ void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
SparseMatrix<Scalar,ColMajor, StorageIndex> mat2 = amat.transpose();
// FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
- // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered...
+ // on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred...
SparseMatrix<Scalar,ColMajor, StorageIndex> AtA = mat2 + mat1;
AMDOrdering<StorageIndex> ordering;
ordering(AtA,m_P);
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 7c2326eb7..bfeee71cd 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -275,7 +275,7 @@ public:
const Preconditioner& preconditioner() const { return m_preconditioner; }
/** \returns the max number of iterations.
- * It is either the value setted by setMaxIterations or, by default,
+ * It is either the value set by setMaxIterations or, by default,
* twice the number of columns of the matrix.
*/
Index maxIterations() const
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 0ace45177..79e1e4819 100644
--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -108,7 +108,7 @@ struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, interna
}
};
-} // end namepsace internal
+} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index d25af8e90..4ccd49a04 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -37,17 +37,20 @@ template<typename Scalar> class JacobiRotation
typedef typename NumTraits<Scalar>::Real RealScalar;
/** Default constructor without any initialization. */
+ EIGEN_DEVICE_FUNC
JacobiRotation() {}
/** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
+ EIGEN_DEVICE_FUNC
JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
- Scalar& c() { return m_c; }
- Scalar c() const { return m_c; }
- Scalar& s() { return m_s; }
- Scalar s() const { return m_s; }
+ EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
+ EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
+ EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
+ EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
/** Concatenates two planar rotation */
+ EIGEN_DEVICE_FUNC
JacobiRotation operator*(const JacobiRotation& other)
{
using numext::conj;
@@ -56,20 +59,27 @@ template<typename Scalar> class JacobiRotation
}
/** Returns the transposed transformation */
+ EIGEN_DEVICE_FUNC
JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
/** Returns the adjoint transformation */
+ EIGEN_DEVICE_FUNC
JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
template<typename Derived>
+ EIGEN_DEVICE_FUNC
bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
+ EIGEN_DEVICE_FUNC
bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
- void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
+ EIGEN_DEVICE_FUNC
+ void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0);
protected:
- void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
- void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
+ EIGEN_DEVICE_FUNC
+ void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type);
+ EIGEN_DEVICE_FUNC
+ void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type);
Scalar m_c, m_s;
};
@@ -80,11 +90,12 @@ template<typename Scalar> class JacobiRotation
* \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
*/
template<typename Scalar>
+EIGEN_DEVICE_FUNC
bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z)
{
using std::sqrt;
using std::abs;
- typedef typename NumTraits<Scalar>::Real RealScalar;
+
RealScalar deno = RealScalar(2)*abs(y);
if(deno < (std::numeric_limits<RealScalar>::min)())
{
@@ -124,6 +135,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
*/
template<typename Scalar>
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q)
{
return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
@@ -133,7 +145,7 @@ inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Ind
* \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields:
* \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$.
*
- * The value of \a z is returned if \a z is not null (the default is null).
+ * The value of \a r is returned if \a r is not null (the default is null).
* Also note that G is built such that the cosine is always real.
*
* Example: \include Jacobi_makeGivens.cpp
@@ -146,14 +158,16 @@ inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Ind
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
*/
template<typename Scalar>
-void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* z)
+EIGEN_DEVICE_FUNC
+void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r)
{
- makeGivens(p, q, z, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
+ makeGivens(p, q, r, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
}
// specialization for complexes
template<typename Scalar>
+EIGEN_DEVICE_FUNC
void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type)
{
using std::sqrt;
@@ -213,6 +227,7 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
// specialization for reals
template<typename Scalar>
+EIGEN_DEVICE_FUNC
void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type)
{
using std::sqrt;
@@ -264,6 +279,7 @@ namespace internal {
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
*/
template<typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC
void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
}
@@ -275,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>&
*/
template<typename Derived>
template<typename OtherScalar>
+EIGEN_DEVICE_FUNC
inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j)
{
RowXpr x(this->row(p));
@@ -290,6 +307,7 @@ inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRo
*/
template<typename Derived>
template<typename OtherScalar>
+EIGEN_DEVICE_FUNC
inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j)
{
ColXpr x(this->col(p));
@@ -298,132 +316,164 @@ inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiR
}
namespace internal {
-template<typename VectorX, typename VectorY, typename OtherScalar>
-void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
-{
- typedef typename VectorX::Scalar Scalar;
- enum { PacketSize = packet_traits<Scalar>::size };
- typedef typename packet_traits<Scalar>::type Packet;
- eigen_assert(xpr_x.size() == xpr_y.size());
- Index size = xpr_x.size();
- Index incrx = xpr_x.derived().innerStride();
- Index incry = xpr_y.derived().innerStride();
-
- Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
- Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
-
- OtherScalar c = j.c();
- OtherScalar s = j.s();
- if (c==OtherScalar(1) && s==OtherScalar(0))
- return;
- /*** dynamic-size vectorized paths ***/
+template<typename Scalar, typename OtherScalar,
+ int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
+struct apply_rotation_in_the_plane_selector
+{
+ static EIGEN_DEVICE_FUNC
+ inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
+ {
+ for(Index i=0; i<size; ++i)
+ {
+ Scalar xi = *x;
+ Scalar yi = *y;
+ *x = c * xi + numext::conj(s) * yi;
+ *y = -s * xi + numext::conj(c) * yi;
+ x += incrx;
+ y += incry;
+ }
+ }
+};
- if(VectorX::SizeAtCompileTime == Dynamic &&
- (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
- ((incrx==1 && incry==1) || PacketSize == 1))
+template<typename Scalar, typename OtherScalar,
+ int SizeAtCompileTime, int MinAlignment>
+struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,true /* vectorizable */>
+{
+ static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
{
- // both vectors are sequentially stored in memory => vectorization
- enum { Peeling = 2 };
+ enum {
+ PacketSize = packet_traits<Scalar>::size,
+ OtherPacketSize = packet_traits<OtherScalar>::size
+ };
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<OtherScalar>::type OtherPacket;
+
+ /*** dynamic-size vectorized paths ***/
+ if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1))
+ {
+ // both vectors are sequentially stored in memory => vectorization
+ enum { Peeling = 2 };
- Index alignedStart = internal::first_default_aligned(y, size);
- Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
+ Index alignedStart = internal::first_default_aligned(y, size);
+ Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
- const Packet pc = pset1<Packet>(c);
- const Packet ps = pset1<Packet>(s);
- conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
+ const OtherPacket pc = pset1<OtherPacket>(c);
+ const OtherPacket ps = pset1<OtherPacket>(s);
+ conj_helper<OtherPacket,Packet,NumTraits<OtherScalar>::IsComplex,false> pcj;
+ conj_helper<OtherPacket,Packet,false,false> pm;
- for(Index i=0; i<alignedStart; ++i)
- {
- Scalar xi = x[i];
- Scalar yi = y[i];
- x[i] = c * xi + numext::conj(s) * yi;
- y[i] = -s * xi + numext::conj(c) * yi;
- }
+ for(Index i=0; i<alignedStart; ++i)
+ {
+ Scalar xi = x[i];
+ Scalar yi = y[i];
+ x[i] = c * xi + numext::conj(s) * yi;
+ y[i] = -s * xi + numext::conj(c) * yi;
+ }
- Scalar* EIGEN_RESTRICT px = x + alignedStart;
- Scalar* EIGEN_RESTRICT py = y + alignedStart;
+ Scalar* EIGEN_RESTRICT px = x + alignedStart;
+ Scalar* EIGEN_RESTRICT py = y + alignedStart;
- if(internal::first_default_aligned(x, size)==alignedStart)
- {
- for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
+ if(internal::first_default_aligned(x, size)==alignedStart)
{
- Packet xi = pload<Packet>(px);
- Packet yi = pload<Packet>(py);
- pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
- pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
- px += PacketSize;
- py += PacketSize;
+ for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
+ {
+ Packet xi = pload<Packet>(px);
+ Packet yi = pload<Packet>(py);
+ pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+ pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+ px += PacketSize;
+ py += PacketSize;
+ }
}
- }
- else
- {
- Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
- for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
+ else
{
- Packet xi = ploadu<Packet>(px);
- Packet xi1 = ploadu<Packet>(px+PacketSize);
- Packet yi = pload <Packet>(py);
- Packet yi1 = pload <Packet>(py+PacketSize);
- pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
- pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1)));
- pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
- pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1)));
- px += Peeling*PacketSize;
- py += Peeling*PacketSize;
+ Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
+ for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
+ {
+ Packet xi = ploadu<Packet>(px);
+ Packet xi1 = ploadu<Packet>(px+PacketSize);
+ Packet yi = pload <Packet>(py);
+ Packet yi1 = pload <Packet>(py+PacketSize);
+ pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+ pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1)));
+ pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+ pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1)));
+ px += Peeling*PacketSize;
+ py += Peeling*PacketSize;
+ }
+ if(alignedEnd!=peelingEnd)
+ {
+ Packet xi = ploadu<Packet>(x+peelingEnd);
+ Packet yi = pload <Packet>(y+peelingEnd);
+ pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+ pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+ }
}
- if(alignedEnd!=peelingEnd)
+
+ for(Index i=alignedEnd; i<size; ++i)
{
- Packet xi = ploadu<Packet>(x+peelingEnd);
- Packet yi = pload <Packet>(y+peelingEnd);
- pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
- pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
+ Scalar xi = x[i];
+ Scalar yi = y[i];
+ x[i] = c * xi + numext::conj(s) * yi;
+ y[i] = -s * xi + numext::conj(c) * yi;
}
}
- for(Index i=alignedEnd; i<size; ++i)
+ /*** fixed-size vectorized path ***/
+ else if(SizeAtCompileTime != Dynamic && MinAlignment>0) // FIXME should be compared to the required alignment
{
- Scalar xi = x[i];
- Scalar yi = y[i];
- x[i] = c * xi + numext::conj(s) * yi;
- y[i] = -s * xi + numext::conj(c) * yi;
+ const OtherPacket pc = pset1<OtherPacket>(c);
+ const OtherPacket ps = pset1<OtherPacket>(s);
+ conj_helper<OtherPacket,Packet,NumTraits<OtherPacket>::IsComplex,false> pcj;
+ conj_helper<OtherPacket,Packet,false,false> pm;
+ Scalar* EIGEN_RESTRICT px = x;
+ Scalar* EIGEN_RESTRICT py = y;
+ for(Index i=0; i<size; i+=PacketSize)
+ {
+ Packet xi = pload<Packet>(px);
+ Packet yi = pload<Packet>(py);
+ pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi)));
+ pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi)));
+ px += PacketSize;
+ py += PacketSize;
+ }
}
- }
- /*** fixed-size vectorized path ***/
- else if(VectorX::SizeAtCompileTime != Dynamic &&
- (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
- (EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment)>0)) // FIXME should be compared to the required alignment
- {
- const Packet pc = pset1<Packet>(c);
- const Packet ps = pset1<Packet>(s);
- conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
- Scalar* EIGEN_RESTRICT px = x;
- Scalar* EIGEN_RESTRICT py = y;
- for(Index i=0; i<size; i+=PacketSize)
+ /*** non-vectorized path ***/
+ else
{
- Packet xi = pload<Packet>(px);
- Packet yi = pload<Packet>(py);
- pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
- pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
- px += PacketSize;
- py += PacketSize;
+ apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime,MinAlignment,false>::run(x,incrx,y,incry,size,c,s);
}
}
+};
- /*** non-vectorized path ***/
- else
- {
- for(Index i=0; i<size; ++i)
- {
- Scalar xi = *x;
- Scalar yi = *y;
- *x = c * xi + numext::conj(s) * yi;
- *y = -s * xi + numext::conj(c) * yi;
- x += incrx;
- y += incry;
- }
- }
+template<typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC
+void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
+{
+ typedef typename VectorX::Scalar Scalar;
+ const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit)
+ && (int(packet_traits<Scalar>::size) == int(packet_traits<OtherScalar>::size));
+
+ eigen_assert(xpr_x.size() == xpr_y.size());
+ Index size = xpr_x.size();
+ Index incrx = xpr_x.derived().innerStride();
+ Index incry = xpr_y.derived().innerStride();
+
+ Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
+ Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
+
+ OtherScalar c = j.c();
+ OtherScalar s = j.s();
+ if (c==OtherScalar(1) && s==OtherScalar(0))
+ return;
+
+ apply_rotation_in_the_plane_selector<
+ Scalar,OtherScalar,
+ VectorX::SizeAtCompileTime,
+ EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment),
+ Vectorizable>::run(x,incrx,y,incry,size,c,s);
}
} // end namespace internal
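
Whatever path the selector above takes, each coefficient pair ends up transformed as x_i <- c*x_i + conj(s)*y_i and y_i <- -s*x_i + conj(c)*y_i. A minimal usage sketch of the public JacobiRotation API, mirroring the documented makeGivens example (illustrative only):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Build a Givens rotation G such that G^* * [p; q] = [r; 0].
  Eigen::Vector2d v(3.0, 4.0);
  Eigen::JacobiRotation<double> G;
  double r;
  G.makeGivens(v.x(), v.y(), &r);

  // Rotate rows 0 and 1 of v in the plane; the second entry becomes zero.
  v.applyOnTheLeft(0, 1, G.adjoint());
  std::cout << v.transpose() << "   (expected: " << r << " 0)" << std::endl;
  return 0;
}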
diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h
new file mode 100644
index 000000000..d2633a935
--- /dev/null
+++ b/Eigen/src/KLUSupport/KLUSupport.h
@@ -0,0 +1,358 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Kyle Macfarlan <kyle.macfarlan@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_H
+#define EIGEN_KLUSUPPORT_H
+
+namespace Eigen {
+
+/* TODO extract L, extract U, compute det, etc... */
+
+/** \ingroup KLUSupport_Module
+ * \brief A sparse LU factorization and solver based on KLU
+ *
+ * This class can be used to solve sparse linear problems of the form A.X = B via an LU factorization
+ * using the KLU library. The sparse matrix A must be square and have full rank.
+ * The vectors or matrices X and B can be either dense or sparse.
+ *
+ * \warning The input matrix A should be in a \b compressed and \b column-major form.
+ * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+ * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU
+ */
+
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) {
+ return klu_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
+}
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double>B[], klu_common *Common, std::complex<double>) {
+ return klu_z_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) {
+ return klu_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double>B[], klu_common *Common, std::complex<double>) {
+ return klu_z_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), 0, Common);
+}
+
+inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) {
+ return klu_factor(Ap, Ai, Ax, Symbolic, Common);
+}
+
+inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex<double> Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex<double>) {
+ return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common);
+}
+
+
+template<typename _MatrixType>
+class KLU : public SparseSolverBase<KLU<_MatrixType> >
+{
+ protected:
+ typedef SparseSolverBase<KLU<_MatrixType> > Base;
+ using Base::m_isInitialized;
+ public:
+ using Base::_solve_impl;
+ typedef _MatrixType MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef typename MatrixType::StorageIndex StorageIndex;
+ typedef Matrix<Scalar,Dynamic,1> Vector;
+ typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+ typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+ typedef SparseMatrix<Scalar> LUMatrixType;
+ typedef SparseMatrix<Scalar,ColMajor,int> KLUMatrixType;
+ typedef Ref<const KLUMatrixType, StandardCompressedFormat> KLUMatrixRef;
+ enum {
+ ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+ MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+ };
+
+ public:
+
+ KLU()
+ : m_dummy(0,0), mp_matrix(m_dummy)
+ {
+ init();
+ }
+
+ template<typename InputMatrixType>
+ explicit KLU(const InputMatrixType& matrix)
+ : mp_matrix(matrix)
+ {
+ init();
+ compute(matrix);
+ }
+
+ ~KLU()
+ {
+ if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common);
+ if(m_numeric) klu_free_numeric(&m_numeric,&m_common);
+ }
+
+ inline Index rows() const { return mp_matrix.rows(); }
+ inline Index cols() const { return mp_matrix.cols(); }
+
+ /** \brief Reports whether previous computation was successful.
+ *
+ * \returns \c Success if computation was successful,
+ * \c NumericalIssue if the matrix appears to be singular.
+ */
+ ComputationInfo info() const
+ {
+ eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+ return m_info;
+ }
+#if 0 // not implemented yet
+ inline const LUMatrixType& matrixL() const
+ {
+ if (m_extractedDataAreDirty) extractData();
+ return m_l;
+ }
+
+ inline const LUMatrixType& matrixU() const
+ {
+ if (m_extractedDataAreDirty) extractData();
+ return m_u;
+ }
+
+ inline const IntColVectorType& permutationP() const
+ {
+ if (m_extractedDataAreDirty) extractData();
+ return m_p;
+ }
+
+ inline const IntRowVectorType& permutationQ() const
+ {
+ if (m_extractedDataAreDirty) extractData();
+ return m_q;
+ }
+#endif
+ /** Computes the sparse LU decomposition of \a matrix
+ * Note that the matrix should be column-major, and in compressed format for best performance.
+ * \sa SparseMatrix::makeCompressed().
+ */
+ template<typename InputMatrixType>
+ void compute(const InputMatrixType& matrix)
+ {
+ if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+ if(m_numeric) klu_free_numeric(&m_numeric, &m_common);
+ grab(matrix.derived());
+ analyzePattern_impl();
+ factorize_impl();
+ }
+
+ /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
+ *
+ * This function is particularly useful when solving for several problems having the same structure.
+ *
+ * \sa factorize(), compute()
+ */
+ template<typename InputMatrixType>
+ void analyzePattern(const InputMatrixType& matrix)
+ {
+ if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+ if(m_numeric) klu_free_numeric(&m_numeric, &m_common);
+
+ grab(matrix.derived());
+
+ analyzePattern_impl();
+ }
+
+
+ /** Provides access to the control settings array used by KLU.
+ *
+ * See KLU documentation for details.
+ */
+ inline const klu_common& kluCommon() const
+ {
+ return m_common;
+ }
+
+ /** Provides access to the control settings array used by KLU.
+ *
+ * If this array contains NaN's, the default values are used.
+ *
+ * See KLU documentation for details.
+ */
+ inline klu_common& kluCommon()
+ {
+ return m_common;
+ }
+
+ /** Performs a numeric decomposition of \a matrix
+ *
+ * The given matrix must have the same sparsity pattern as the matrix on which the pattern analysis has been performed.
+ *
+ * \sa analyzePattern(), compute()
+ */
+ template<typename InputMatrixType>
+ void factorize(const InputMatrixType& matrix)
+ {
+ eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()");
+ if(m_numeric)
+ klu_free_numeric(&m_numeric,&m_common);
+
+ grab(matrix.derived());
+
+ factorize_impl();
+ }
+
+ /** \internal */
+ template<typename BDerived,typename XDerived>
+ bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
+
+#if 0 // not implemented yet
+ Scalar determinant() const;
+
+ void extractData() const;
+#endif
+
+ protected:
+
+ void init()
+ {
+ m_info = InvalidInput;
+ m_isInitialized = false;
+ m_numeric = 0;
+ m_symbolic = 0;
+ m_extractedDataAreDirty = true;
+
+ klu_defaults(&m_common);
+ }
+
+ void analyzePattern_impl()
+ {
+ m_info = InvalidInput;
+ m_analysisIsOk = false;
+ m_factorizationIsOk = false;
+ m_symbolic = klu_analyze(internal::convert_index<int>(mp_matrix.rows()),
+ const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()),
+ &m_common);
+ if (m_symbolic) {
+ m_isInitialized = true;
+ m_info = Success;
+ m_analysisIsOk = true;
+ m_extractedDataAreDirty = true;
+ }
+ }
+
+ void factorize_impl()
+ {
+
+ m_numeric = klu_factor(const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()), const_cast<Scalar*>(mp_matrix.valuePtr()),
+ m_symbolic, &m_common, Scalar());
+
+
+ m_info = m_numeric ? Success : NumericalIssue;
+ m_factorizationIsOk = m_numeric ? 1 : 0;
+ m_extractedDataAreDirty = true;
+ }
+
+ template<typename MatrixDerived>
+ void grab(const EigenBase<MatrixDerived> &A)
+ {
+ mp_matrix.~KLUMatrixRef();
+ ::new (&mp_matrix) KLUMatrixRef(A.derived());
+ }
+
+ void grab(const KLUMatrixRef &A)
+ {
+ if(&(A.derived()) != &mp_matrix)
+ {
+ mp_matrix.~KLUMatrixRef();
+ ::new (&mp_matrix) KLUMatrixRef(A);
+ }
+ }
+
+ // cached data to reduce reallocation, etc.
+#if 0 // not implemented yet
+ mutable LUMatrixType m_l;
+ mutable LUMatrixType m_u;
+ mutable IntColVectorType m_p;
+ mutable IntRowVectorType m_q;
+#endif
+
+ KLUMatrixType m_dummy;
+ KLUMatrixRef mp_matrix;
+
+ klu_numeric* m_numeric;
+ klu_symbolic* m_symbolic;
+ klu_common m_common;
+ mutable ComputationInfo m_info;
+ int m_factorizationIsOk;
+ int m_analysisIsOk;
+ mutable bool m_extractedDataAreDirty;
+
+ private:
+ KLU(const KLU& ) { }
+};
+
+#if 0 // not implemented yet
+template<typename MatrixType>
+void KLU<MatrixType>::extractData() const
+{
+ if (m_extractedDataAreDirty)
+ {
+ eigen_assert(false && "KLU: extractData Not Yet Implemented");
+
+ // get size of the data
+ int lnz, unz, rows, cols, nz_udiag;
+ umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
+
+ // allocate data
+ m_l.resize(rows,(std::min)(rows,cols));
+ m_l.resizeNonZeros(lnz);
+
+ m_u.resize((std::min)(rows,cols),cols);
+ m_u.resizeNonZeros(unz);
+
+ m_p.resize(rows);
+ m_q.resize(cols);
+
+ // extract
+ umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
+ m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
+ m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
+
+ m_extractedDataAreDirty = false;
+ }
+}
+
+template<typename MatrixType>
+typename KLU<MatrixType>::Scalar KLU<MatrixType>::determinant() const
+{
+ eigen_assert(false && "KLU: extractData Not Yet Implemented");
+ return Scalar();
+}
+#endif
+
+template<typename MatrixType>
+template<typename BDerived,typename XDerived>
+bool KLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
+{
+ Index rhsCols = b.cols();
+ EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+ eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
+
+ x = b;
+ int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast<klu_common*>(&m_common), Scalar());
+
+ m_info = info!=0 ? Success : NumericalIssue;
+ return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_KLUSUPPORT_H
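
Assuming SuiteSparse's KLU library and headers are available and linked, the new class follows the usual Eigen sparse-solver interface (compute()/solve(), or analyzePattern()/factorize() for repeated patterns). A minimal usage sketch, not part of the patch:

#include <Eigen/Sparse>
#include <Eigen/KLUSupport>   // requires linking against KLU (SuiteSparse)
#include <iostream>

int main()
{
  typedef Eigen::SparseMatrix<double> SpMat;   // column-major, as KLU expects

  // A small square, full-rank sparse matrix.
  SpMat A(3, 3);
  A.insert(0, 0) = 4.0;  A.insert(0, 1) = 1.0;
  A.insert(1, 1) = 3.0;  A.insert(1, 2) = 1.0;
  A.insert(2, 2) = 2.0;
  A.makeCompressed();      // compressed storage avoids an internal copy

  Eigen::VectorXd b(3);
  b << 1.0, 2.0, 3.0;

  Eigen::KLU<SpMat> solver;
  solver.compute(A);       // analyzePattern() + factorize()
  if (solver.info() != Eigen::Success) { std::cerr << "factorization failed\n"; return 1; }

  Eigen::VectorXd x = solver.solve(b);
  std::cout << "residual norm: " << (A * x - b).norm() << std::endl;
  return 0;
}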
diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h
index d6a3c1e5a..6af63a6e7 100644
--- a/Eigen/src/LU/Determinant.h
+++ b/Eigen/src/LU/Determinant.h
@@ -15,6 +15,7 @@ namespace Eigen {
namespace internal {
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline const typename Derived::Scalar bruteforce_det3_helper
(const MatrixBase<Derived>& matrix, int a, int b, int c)
{
@@ -23,6 +24,7 @@ inline const typename Derived::Scalar bruteforce_det3_helper
}
template<typename Derived>
+EIGEN_DEVICE_FUNC
const typename Derived::Scalar bruteforce_det4_helper
(const MatrixBase<Derived>& matrix, int j, int k, int m, int n)
{
@@ -44,7 +46,8 @@ template<typename Derived,
template<typename Derived> struct determinant_impl<Derived, 1>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return m.coeff(0,0);
}
@@ -52,7 +55,8 @@ template<typename Derived> struct determinant_impl<Derived, 1>
template<typename Derived> struct determinant_impl<Derived, 2>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
}
@@ -60,7 +64,8 @@ template<typename Derived> struct determinant_impl<Derived, 2>
template<typename Derived> struct determinant_impl<Derived, 3>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return bruteforce_det3_helper(m,0,1,2)
- bruteforce_det3_helper(m,1,0,2)
@@ -70,7 +75,8 @@ template<typename Derived> struct determinant_impl<Derived, 3>
template<typename Derived> struct determinant_impl<Derived, 4>
{
- static typename traits<Derived>::Scalar run(const Derived& m)
+ static EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
// trick by Martin Costabel to compute 4x4 det with only 30 muls
return bruteforce_det4_helper(m,0,1,2,3)
@@ -89,6 +95,7 @@ template<typename Derived> struct determinant_impl<Derived, 4>
* \returns the determinant of this matrix
*/
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
{
eigen_assert(rows() == cols());
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 03b6af706..344ec8926 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -48,12 +48,12 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
* The data of the LU decomposition can be directly accessed through the methods matrixLU(),
* permutationP(), permutationQ().
*
- * As an exemple, here is how the original matrix can be retrieved:
+ * As an example, here is how the original matrix can be retrieved:
* \include class_FullPivLU.cpp
* Output: \verbinclude class_FullPivLU.out
*
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
- *
+ *
* \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
*/
template<typename _MatrixType> class FullPivLU
@@ -320,7 +320,7 @@ template<typename _MatrixType> class FullPivLU
return m_usePrescribedThreshold ? m_prescribedThreshold
// this formula comes from experimenting (see "LU precision tuning" thread on the list)
// and turns out to be identical to Higham's formula used already in LDLt.
- : NumTraits<Scalar>::epsilon() * m_lu.diagonalSize();
+ : NumTraits<Scalar>::epsilon() * RealScalar(m_lu.diagonalSize());
}
/** \returns the rank of the matrix of which *this is the LU decomposition.
@@ -411,11 +411,9 @@ template<typename _MatrixType> class FullPivLU
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
template<bool Conjugate, typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
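
The default pivot threshold is NumTraits<Scalar>::epsilon() times the diagonal size, now explicitly converted to RealScalar so the product is well formed for custom scalar types. A short sketch of how this threshold drives rank determination (illustrative only):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A(3, 3);
  A << 1, 2, 3,
       2, 4, 6,   // second row is twice the first, so A has rank 2
       1, 0, 1;

  Eigen::FullPivLU<Eigen::MatrixXd> lu(A);

  // With the default threshold, pivots smaller than
  // threshold * |largest pivot| are treated as zero.
  std::cout << "rank (default threshold): " << lu.rank() << std::endl;

  // The threshold can also be prescribed explicitly.
  lu.setThreshold(1e-8);
  std::cout << "rank (prescribed threshold): " << lu.rank() << std::endl;
  return 0;
}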
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index 018f99b58..1bab00c01 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -290,6 +290,7 @@ template<typename DstXprType, typename XprType>
struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
{
typedef Inverse<XprType> SrcXprType;
+ EIGEN_DEVICE_FUNC
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
{
Index dstRows = src.rows();
@@ -332,6 +333,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
* \sa computeInverseAndDetWithCheck()
*/
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
{
EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
@@ -404,7 +406,7 @@ inline void MatrixBase<Derived>::computeInverseWithCheck(
const RealScalar& absDeterminantThreshold
) const
{
- RealScalar determinant;
+ Scalar determinant;
// i'd love to put some static assertions there, but SFINAE means that they have no effect...
eigen_assert(rows() == cols());
computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold);
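
Declaring the local determinant as Scalar rather than RealScalar matters for complex matrices, whose determinant is complex in general. A short usage sketch of the checked-inverse API on a small fixed-size matrix (illustrative only):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Matrix3d A;
  A << 2, 0, 0,
       0, 3, 0,
       0, 0, 4;

  Eigen::Matrix3d Ainv;
  double det;
  bool invertible;

  // Computes the inverse and determinant in one pass and reports whether
  // |det| exceeds the (defaulted) absolute threshold.
  A.computeInverseAndDetWithCheck(Ainv, det, invertible);
  std::cout << "det = " << det << ", invertible = " << invertible << std::endl;
  if (invertible)
    std::cout << "||A*Ainv - I|| = " << (A * Ainv - Eigen::Matrix3d::Identity()).norm() << std::endl;
  return 0;
}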
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index d43961887..bfcd2c95b 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -420,8 +420,8 @@ struct partial_lu_impl
* \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
*
* \note This very low level interface using pointers, etc. is to:
- * 1 - reduce the number of instanciations to the strict minimum
- * 2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > >
+ * 1 - reduce the number of instantiations to the strict minimum
+ * 2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > >
*/
static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
{
diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 933cd564b..67fcad3f7 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -1004,7 +1004,7 @@ static IndexType find_ordering /* return the number of garbage collections */
COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ;
/* get pivot column from head of minimum degree list */
- while (head [min_score] == COLAMD_EMPTY && min_score < n_col)
+ while (min_score < n_col && head [min_score] == COLAMD_EMPTY)
{
min_score++ ;
}
@@ -1493,7 +1493,7 @@ static inline void order_children
c = Col [c].shared1.parent ;
/* continue until we hit an ordered column. There are */
- /* guarranteed not to be anymore unordered columns */
+ /* guaranteed not to be anymore unordered columns */
/* above an ordered column */
} while (Col [c].shared2.order == COLAMD_EMPTY) ;
@@ -1638,7 +1638,7 @@ static void detect_super_cols
COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ;
COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ;
/* row indices will same order for both supercols, */
- /* no gather scatter nessasary */
+ /* no gather scatter necessary */
if (*cp1++ != *cp2++)
{
break ;
@@ -1688,7 +1688,7 @@ static void detect_super_cols
/*
Defragments and compacts columns and rows in the workspace A. Used when
- all avaliable memory has been used while performing row merging. Returns
+ all available memory has been used while performing row merging. Returns
the index of the first free position in A, after garbage collection. The
time taken by this routine is linear is the size of the array A, which is
itself linear in the number of nonzeros in the input matrix.
diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h
index 7ea9b14d7..34dbef487 100644
--- a/Eigen/src/OrderingMethods/Ordering.h
+++ b/Eigen/src/OrderingMethods/Ordering.h
@@ -31,7 +31,7 @@ void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat)
for (int i = 0; i < C.rows(); i++)
{
for (typename MatrixType::InnerIterator it(C, i); it; ++it)
- it.valueRef() = 0.0;
+ it.valueRef() = typename MatrixType::Scalar(0);
}
symmat = C + A;
}
diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h
index d2ebfd7bb..37426877a 100644
--- a/Eigen/src/PaStiXSupport/PaStiXSupport.h
+++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h
@@ -64,28 +64,28 @@ namespace internal
typedef typename _MatrixType::StorageIndex StorageIndex;
};
- void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
+ inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
{
if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
if (nbrhs == 0) {x = NULL; nbrhs=1;}
s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm);
}
- void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm)
+ inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm)
{
if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
if (nbrhs == 0) {x = NULL; nbrhs=1;}
d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm);
}
- void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<float> *vals, int *perm, int * invp, std::complex<float> *x, int nbrhs, int *iparm, double *dparm)
+ inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<float> *vals, int *perm, int * invp, std::complex<float> *x, int nbrhs, int *iparm, double *dparm)
{
if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
if (nbrhs == 0) {x = NULL; nbrhs=1;}
c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<PASTIX_COMPLEX*>(vals), perm, invp, reinterpret_cast<PASTIX_COMPLEX*>(x), nbrhs, iparm, dparm);
}
- void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
+ inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
{
if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
if (nbrhs == 0) {x = NULL; nbrhs=1;}
@@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase<Derived>
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the PaStiX reports a problem
* \c InvalidInput if the input matrix is invalid
*
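
Marking these header-defined overloads inline lets several translation units include the header without multiple-definition errors at link time. A generic sketch of the pattern, unrelated to PaStiX itself (hypothetical header and function names):

// some_header.h -- hypothetical header included from several .cpp files.
#ifndef SOME_HEADER_H
#define SOME_HEADER_H

// Without 'inline', each translation unit including this header would emit
// its own external definition of helper() and linking would fail with a
// duplicate-symbol error; 'inline' allows identical definitions in every TU.
inline int helper(int x) { return 2 * x; }

#endif // SOME_HEADER_H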
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index 091c3970e..fb2ba04b4 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -140,7 +140,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 0e47c8332..1faa3442e 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -402,7 +402,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
*/
RealScalar maxPivot() const { return m_maxpivot; }
- /** \brief Reports whether the QR factorization was succesful.
+ /** \brief Reports whether the QR factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
@@ -416,7 +416,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
@@ -506,8 +505,8 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k);
}
- RealScalar threshold_helper = numext::abs2<Scalar>(m_colNormsUpdated.maxCoeff() * NumTraits<Scalar>::epsilon()) / RealScalar(rows);
- RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<Scalar>::epsilon());
+ RealScalar threshold_helper = numext::abs2<RealScalar>(m_colNormsUpdated.maxCoeff() * NumTraits<RealScalar>::epsilon()) / RealScalar(rows);
+ RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<RealScalar>::epsilon());
m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
m_maxpivot = RealScalar(0);
@@ -553,12 +552,12 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
// http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
// and used in LAPACK routines xGEQPF and xGEQP3.
// See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html
- if (m_colNormsUpdated.coeffRef(j) != 0) {
+ if (m_colNormsUpdated.coeffRef(j) != RealScalar(0)) {
RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j);
temp = (RealScalar(1) + temp) * (RealScalar(1) - temp);
- temp = temp < 0 ? 0 : temp;
- RealScalar temp2 = temp * numext::abs2<Scalar>(m_colNormsUpdated.coeffRef(j) /
- m_colNormsDirect.coeffRef(j));
+ temp = temp < RealScalar(0) ? RealScalar(0) : temp;
+ RealScalar temp2 = temp * numext::abs2<RealScalar>(m_colNormsUpdated.coeffRef(j) /
+ m_colNormsDirect.coeffRef(j));
if (temp2 <= norm_downdate_threshold) {
// The updated norm has become too inaccurate so re-compute the column
// norm directly.
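
The casts above move the downdating test entirely into RealScalar so it stays valid for complex matrices. As a reading aid, here is a standalone paraphrase of that test in plain double precision (the function name and parameters are ours, purely illustrative, not Eigen API):

#include <cmath>
#include <limits>

// True when the incrementally updated column norm has drifted too far from the
// directly computed one and must be recomputed, mirroring the xGEQP3 criterion
// cited above.
bool norm_downdate_is_unreliable(double updatedNorm, double directNorm, double pivotRowEntry)
{
  const double eps = std::numeric_limits<double>::epsilon();
  double t = std::abs(pivotRowEntry) / updatedNorm;  // |R(k,j)| / current estimate
  double temp = (1.0 + t) * (1.0 - t);               // cancellation-prone factor
  temp = temp < 0.0 ? 0.0 : temp;
  double ratio = updatedNorm / directNorm;
  return temp * ratio * ratio <= std::sqrt(eps);     // too inaccurate: recompute the norm
}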
@@ -596,11 +595,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
typename RhsType::PlainObject c(rhs);
- // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
- c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs)
- .setLength(nonzero_pivots)
- .transpose()
- );
+ c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() );
m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
.template triangularView<Upper>()
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 34c637b70..03017a375 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -353,7 +353,7 @@ class CompleteOrthogonalDecomposition {
inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); }
/** \brief Reports whether the complete orthogonal decomposition was
- * succesful.
+ * successful.
*
* \note This function always returns \c Success. It is provided for
* compatibility
@@ -367,7 +367,7 @@ class CompleteOrthogonalDecomposition {
#ifndef EIGEN_PARSED_BY_DOXYGEN
template <typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const;
+ void _solve_impl(const RhsType& rhs, DstType& dst) const;
#endif
protected:
@@ -452,7 +452,7 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
// Apply Z(k) to the first k rows of X_k
m_cpqr.m_qr.topRightCorner(k, cols - rank + 1)
.applyHouseholderOnTheRight(
- m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k),
+ m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k),
&m_temp(0));
}
if (k != rank - 1) {
@@ -500,11 +500,8 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
}
// Compute c = Q^* * rhs
- // Note that the matrix Q = H_0^* H_1^*... so its inverse is
- // Q^* = (H_0 H_1 ...)^T
typename RhsType::PlainObject c(rhs);
- c.applyOnTheLeft(
- householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose());
+ c.applyOnTheLeft(matrixQ().setLength(rank).adjoint());
// Solve T z = c(1:rank, :)
dst.topRows(rank) = matrixT()
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index e489bddc2..c31e47cc4 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -392,22 +392,21 @@ template<typename _MatrixType> class FullPivHouseholderQR
* diagonal coefficient of U.
*/
RealScalar maxPivot() const { return m_maxpivot; }
-
+
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
protected:
-
+
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
-
+
void computeInPlace();
-
+
MatrixType m_qr;
HCoeffsType m_hCoeffs;
IntDiagSizeVectorType m_rows_transpositions;
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 3513d995c..33cb9c8ff 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -204,28 +204,27 @@ template<typename _MatrixType> class HouseholderQR
inline Index rows() const { return m_qr.rows(); }
inline Index cols() const { return m_qr.cols(); }
-
+
/** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
*
* For advanced uses only.
*/
const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
-
+
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
protected:
-
+
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
void computeInPlace();
-
+
MatrixType m_qr;
HCoeffsType m_hCoeffs;
RowVectorType m_temp;
@@ -292,7 +291,7 @@ template<typename MatrixQR, typename HCoeffs,
bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
struct householder_qr_inplace_blocked
{
- // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
+ // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
typename MatrixQR::Scalar* tempData = 0)
{
@@ -354,11 +353,7 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
typename RhsType::PlainObject c(rhs);
- // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
- c.applyOnTheLeft(householderSequence(
- m_qr.leftCols(rank),
- m_hCoeffs.head(rank)).transpose()
- );
+ c.applyOnTheLeft(householderQ().setLength(rank).adjoint() );
m_qr.topLeftCorner(rank, rank)
.template triangularView<Upper>()
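
The _solve_impl changes in ColPivHouseholderQR, CompleteOrthogonalDecomposition and HouseholderQR above all replace the hand-built transposed Householder sequence with householderQ().setLength(...).adjoint(), the correct inverse of Q for complex scalars as well. A minimal usage sketch that exercises this path on a complex least-squares problem (sizes and variable names are illustrative):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(6, 4);
  Eigen::VectorXcd b = Eigen::VectorXcd::Random(6);

  // solve() now applies Q^* (the adjoint, not merely the transpose) to the right-hand side.
  Eigen::ColPivHouseholderQR<Eigen::MatrixXcd> qr(A);
  Eigen::VectorXcd x = qr.solve(b);

  std::cout << "residual norm: " << (A * x - b).norm() << "\n";
}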
diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index 953d57c9d..1a5c5254e 100644
--- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -220,7 +220,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the sparse QR can not be computed
*/
ComputationInfo info() const
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 25fca6f4d..4daa9dd21 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -11,7 +11,7 @@
// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
// Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
-// Copyright (C) 2014-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,6 +22,11 @@
// #define EIGEN_BDCSVD_DEBUG_VERBOSE
// #define EIGEN_BDCSVD_SANITY_CHECKS
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+#undef eigen_internal_assert
+#define eigen_internal_assert(X) assert(X);
+#endif
+
namespace Eigen {
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
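
The sanity-check block added above is only compiled in when EIGEN_BDCSVD_SANITY_CHECKS is defined before any Eigen header is included. A hedged sketch of a small test driver enabling it (not part of the patch; the matrix size is arbitrary):

// Must precede the Eigen includes so that eigen_internal_assert is remapped as above.
#define EIGEN_BDCSVD_SANITY_CHECKS
#include <Eigen/SVD>

int main()
{
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(20, 15);
  Eigen::BDCSVD<Eigen::MatrixXd> svd(m, Eigen::ComputeThinU | Eigen::ComputeThinV);
  return svd.info() == Eigen::Success ? 0 : 1;
}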
@@ -57,7 +62,7 @@ struct traits<BDCSVD<_MatrixType> >
  * recommended and can be several orders of magnitude faster.
  *
  * \warning this algorithm is unlikely to provide accurate results when compiled with unsafe math optimizations.
- * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless
+ * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless
* you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
* significantly degrade the accuracy.
*
@@ -77,6 +82,7 @@ public:
typedef _MatrixType MatrixType;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+ typedef typename NumTraits<RealScalar>::Literal Literal;
enum {
RowsAtCompileTime = MatrixType::RowsAtCompileTime,
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -211,7 +217,7 @@ public:
// Method to allocate and initialize matrix and attributes
template<typename MatrixType>
-void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
+void BDCSVD<MatrixType>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions)
{
m_isTranspose = (cols > rows);
@@ -259,7 +265,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
//**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
RealScalar scale = matrix.cwiseAbs().maxCoeff();
- if(scale==RealScalar(0)) scale = RealScalar(1);
+ if(scale==Literal(0)) scale = Literal(1);
MatrixX copy;
if (m_isTranspose) copy = matrix.adjoint()/scale;
else copy = matrix/scale;
@@ -351,13 +357,13 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
Index k1=0, k2=0;
for(Index j=0; j<n; ++j)
{
- if( (A.col(j).head(n1).array()!=0).any() )
+ if( (A.col(j).head(n1).array()!=Literal(0)).any() )
{
A1.col(k1) = A.col(j).head(n1);
B1.row(k1) = B.row(j);
++k1;
}
- if( (A.col(j).tail(n2).array()!=0).any() )
+ if( (A.col(j).tail(n2).array()!=Literal(0)).any() )
{
A2.col(k2) = A.col(j).tail(n2);
B2.row(k2) = B.row(j);
@@ -387,7 +393,7 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix
//          to become the first column (*coeff) and to shift all the other columns to the right. There are more details in the reference paper.
template<typename MatrixType>
-void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift)
+void BDCSVD<MatrixType>::divide (Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
{
// requires rows = cols + 1;
using std::pow;
@@ -449,11 +455,11 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
l = m_naiveU.row(1).segment(firstCol, k);
f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
}
- if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1;
+ if (m_compV) m_naiveV(firstRowW+k, firstColW) = Literal(1);
if (r0<considerZero)
{
- c0 = 1;
- s0 = 0;
+ c0 = Literal(1);
+ s0 = Literal(0);
}
else
{
@@ -567,14 +573,14 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
// handling of round-off errors, be consistent in ordering
// For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
template <typename MatrixType>
-void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
+void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
{
const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
using std::abs;
ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
ArrayRef diag = m_workspace.head(n);
- diag(0) = 0;
+ diag(0) = Literal(0);
// Allocate space for singular values and vectors
singVals.resize(n);
@@ -590,7 +596,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
// but others are interleaved and we must ignore them at this stage.
// To this end, let's compute a permutation skipping them:
Index actual_n = n;
- while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
+ while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); }
Index m = 0; // size of the deflated problem
for(Index k=0;k<actual_n;++k)
if(abs(col0(k))>considerZero)
@@ -617,13 +623,11 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
std::cout << " shift: " << shifts.transpose() << "\n";
{
- Index actual_n = n;
- while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n;
std::cout << "\n\n mus: " << mus.head(actual_n).transpose() << "\n\n";
std::cout << " check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
+ assert((((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n) >= 0).all());
std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
- std::cout << " check3 (>0) : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n";
- std::cout << " check4 (>0) : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n";
+ assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all());
}
#endif
@@ -651,13 +655,13 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
#endif
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
- assert(U.allFinite());
- assert(V.allFinite());
- assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n);
- assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n);
assert(m_naiveU.allFinite());
assert(m_naiveV.allFinite());
assert(m_computed.allFinite());
+ assert(U.allFinite());
+ assert(V.allFinite());
+// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
+// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
#endif
// Because of deflation, the singular values might not be completely sorted.
@@ -672,6 +676,15 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
if(m_compV) V.col(i).swap(V.col(i+1));
}
}
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ {
+ bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all();
+ if(!singular_values_sorted)
+ std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n";
+ assert(singular_values_sorted);
+ }
+#endif
  // Reverse order so that singular values are in increasing order
  // Because of deflation, the zero singular values are already at the end
@@ -691,11 +704,13 @@ template <typename MatrixType>
typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift)
{
Index m = perm.size();
- RealScalar res = 1;
+ RealScalar res = Literal(1);
for(Index i=0; i<m; ++i)
{
Index j = perm(i);
- res += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu));
+ // The following expression could be rewritten to involve only a single division,
+ // but this would make the expression more sensitive to overflow.
+ res += (col0(j) / (diagShifted(j) - mu)) * (col0(j) / (diag(j) + shift + mu));
}
return res;
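
The new comment above explains why the term is evaluated as two quotients instead of a single division by the product of the denominators. A standalone illustration of the difference, with magnitudes chosen purely for demonstration:

#include <cstdio>

int main()
{
  // Multiplying the two denominators first overflows to +inf and turns the whole
  // term into NaN, while the factored form stays finite.
  double z = 1.0e200, d1 = 1.0e200, d2 = 1.0e200;
  double naive    = (z * z) / (d1 * d2);   // inf / inf -> NaN
  double factored = (z / d1) * (z / d2);   // 1.0 * 1.0 -> 1.0
  std::printf("naive = %g, factored = %g\n", naive, factored);
}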
@@ -707,19 +722,22 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
{
using std::abs;
using std::swap;
+ using std::sqrt;
Index n = col0.size();
Index actual_n = n;
- while(actual_n>1 && col0(actual_n-1)==0) --actual_n;
+ // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above
+ // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value.
+ while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n;
for (Index k = 0; k < n; ++k)
{
- if (col0(k) == 0 || actual_n==1)
+ if (col0(k) == Literal(0) || actual_n==1)
{
// if col0(k) == 0, then entry is deflated, so singular value is on diagonal
// if actual_n==1, then the deflated problem is already diagonalized
singVals(k) = k==0 ? col0(0) : diag(k);
- mus(k) = 0;
+ mus(k) = Literal(0);
shifts(k) = k==0 ? col0(0) : diag(k);
continue;
}
@@ -731,31 +749,36 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
right = (diag(actual_n-1) + col0.matrix().norm());
else
{
- // Skip deflated singular values
+ // Skip deflated singular values,
+ // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside.
+ // This should be equivalent to using perm[]
Index l = k+1;
- while(col0(l)==0) { ++l; eigen_internal_assert(l<actual_n); }
+ while(col0(l)==Literal(0)) { ++l; eigen_internal_assert(l<actual_n); }
right = diag(l);
}
// first decide whether it's closer to the left end or the right end
- RealScalar mid = left + (right-left) / 2;
- RealScalar fMid = secularEq(mid, col0, diag, perm, diag, 0);
+ RealScalar mid = left + (right-left) / Literal(2);
+ RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0));
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
- std::cout << right-left << "\n";
- std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right) << "\n";
- std::cout << " = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.2*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.3*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.4*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.49*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.5*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.51*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.6*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.7*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.8*(left+right), col0, diag, perm, diag, 0)
- << " " << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n";
+ std::cout << "right-left = " << right-left << "\n";
+// std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left)
+// << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right) << "\n";
+ std::cout << " = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.1) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.2) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.3) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.4) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.49) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.5) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.51) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.6) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.7) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.8) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.9) *(right-left), col0, diag, perm, diag, 0)
+ << " " << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n";
#endif
- RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right;
+ RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right;
// measure everything relative to shift
Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n);
@@ -785,26 +808,29 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
// rational interpolation: fit a function of the form a / mu + b through the two previous
// iterates and use its zero to compute the next iterate
- bool useBisection = fPrev*fCur>0;
- while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
+ bool useBisection = fPrev*fCur>Literal(0);
+ while (fCur!=Literal(0) && abs(muCur - muPrev) > Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
{
++m_numIters;
// Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples.
- RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev);
+ RealScalar a = (fCur - fPrev) / (Literal(1)/muCur - Literal(1)/muPrev);
RealScalar b = fCur - a / muCur;
// And find mu such that f(mu)==0:
RealScalar muZero = -a/b;
RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ assert((std::isfinite)(fZero));
+#endif
muPrev = muCur;
fPrev = fCur;
muCur = muZero;
fCur = fZero;
-
- if (shift == left && (muCur < 0 || muCur > right - left)) useBisection = true;
- if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true;
+ if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
+ if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
if (abs(fCur)>abs(fPrev)) useBisection = true;
}
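
The loop above implements the rational-interpolation update described in its leading comment: fit f(mu) ~ a/mu + b through the last two iterates and jump to the zero of that fit. A self-contained numerical sketch of a single step (all values illustrative):

#include <cstdio>

int main()
{
  double muPrev = 0.5, fPrev = 3.0;  // previous iterate and residual
  double muCur  = 1.0, fCur  = 1.0;  // current iterate and residual

  double a = (fCur - fPrev) / (1.0 / muCur - 1.0 / muPrev);  // slope with respect to 1/mu
  double b = fCur - a / muCur;
  double muZero = -a / b;            // zero of a/mu + b, here 2.0

  std::printf("next iterate: %g\n", muZero);
}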
@@ -817,37 +843,59 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
RealScalar leftShifted, rightShifted;
if (shift == left)
{
- leftShifted = (std::numeric_limits<RealScalar>::min)();
+ // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)),
+ // the factor 2 is to be more conservative
+ leftShifted = numext::maxi<RealScalar>( (std::numeric_limits<RealScalar>::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits<RealScalar>::max)()) );
+
+ // check that we did it right:
+ eigen_internal_assert( (numext::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) );
// I don't understand why the case k==0 would be special there:
- // if (k == 0) rightShifted = right - left; else
- rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe
+ // if (k == 0) rightShifted = right - left; else
+ rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe
}
else
{
- leftShifted = -(right - left) * RealScalar(0.6);
- rightShifted = -(std::numeric_limits<RealScalar>::min)();
+ leftShifted = -(right - left) * RealScalar(0.51);
+ if(k+1<n)
+ rightShifted = -numext::maxi<RealScalar>( (std::numeric_limits<RealScalar>::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits<RealScalar>::max)()) );
+ else
+ rightShifted = -(std::numeric_limits<RealScalar>::min)();
}
-
+
RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
-#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
+#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS
RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
#endif
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ if(!(std::isfinite)(fLeft))
+ std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n";
+ assert((std::isfinite)(fLeft));
+
+ if(!(std::isfinite)(fRight))
+ std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n";
+// assert((std::isfinite)(fRight));
+#endif
+
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(!(fLeft * fRight<0))
{
- std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose() << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n";
- std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n";
+ std::cout << "f(leftShifted) using leftShifted=" << leftShifted << " ; diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; "
+ << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n";
+ std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; "
+ << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift << " , f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift) << " == " << secularEq(right, col0, diag, perm, diag, 0) << "\n";
}
#endif
- eigen_internal_assert(fLeft * fRight < 0);
+ eigen_internal_assert(fLeft * fRight < Literal(0));
- while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
+ while (rightShifted - leftShifted > Literal(2) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
{
- RealScalar midShifted = (leftShifted + rightShifted) / 2;
+ RealScalar midShifted = (leftShifted + rightShifted) / Literal(2);
fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
- if (fLeft * fMid < 0)
+ eigen_internal_assert((numext::isfinite)(fMid));
+
+ if (fLeft * fMid < Literal(0))
{
rightShifted = midShifted;
}
@@ -858,13 +906,22 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
}
}
- muCur = (leftShifted + rightShifted) / 2;
+ muCur = (leftShifted + rightShifted) / Literal(2);
}
singVals[k] = shift + muCur;
shifts[k] = shift;
mus[k] = muCur;
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+ if(k+1<n)
+ std::cout << "found " << singVals[k] << " == " << shift << " + " << muCur << " from " << diag(k) << " .. " << diag(k+1) << "\n";
+#endif
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ assert(k==0 || singVals[k]>=singVals[k-1]);
+ assert(singVals[k]>=diag(k));
+#endif
+
// perturb singular value slightly if it equals diagonal entry to avoid division by zero later
    // (deflation is supposed to prevent this from happening)
    // - this does not seem to be necessary anymore -
@@ -888,25 +945,53 @@ void BDCSVD<MatrixType>::perturbCol0
zhat.setZero();
return;
}
- Index last = perm(m-1);
+ Index lastIdx = perm(m-1);
  // The offset makes it possible to skip deflated entries while computing zhat
for (Index k = 0; k < n; ++k)
{
- if (col0(k) == 0) // deflated
- zhat(k) = 0;
+ if (col0(k) == Literal(0)) // deflated
+ zhat(k) = Literal(0);
else
{
// see equation (3.6)
RealScalar dk = diag(k);
- RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk));
+ RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ if(prod<0) {
+ std::cout << "k = " << k << " ; z(k)=" << col0(k) << ", diag(k)=" << dk << "\n";
+ std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n";
+ std::cout << " = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n";
+ }
+ assert(prod>=0);
+#endif
for(Index l = 0; l<m; ++l)
{
Index i = perm(l);
if(i!=k)
{
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ if(i>=k && (l==0 || l-1>=m))
+ {
+ std::cout << "Error in perturbCol0\n";
+ std::cout << " " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " " << "\n";
+ std::cout << " " <<diag(i) << "\n";
+ Index j = (i<k /*|| l==0*/) ? i : perm(l-1);
+ std::cout << " " << "j=" << j << "\n";
+ }
+#endif
Index j = i<k ? i : perm(l-1);
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ if(!(dk!=Literal(0) || diag(i)!=Literal(0)))
+ {
+ std::cout << "k=" << k << ", i=" << i << ", l=" << l << ", perm.size()=" << perm.size() << "\n";
+ }
+ assert(dk!=Literal(0) || diag(i)!=Literal(0));
+#endif
prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk)));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ assert(prod>=0);
+#endif
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk))
@@ -915,10 +1000,13 @@ void BDCSVD<MatrixType>::perturbCol0
}
}
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
- std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n";
+ std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n";
#endif
RealScalar tmp = sqrt(prod);
- zhat(k) = col0(k) > 0 ? tmp : -tmp;
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+ assert((std::isfinite)(tmp));
+#endif
+ zhat(k) = col0(k) > Literal(0) ? tmp : -tmp;
}
}
}
@@ -934,7 +1022,7 @@ void BDCSVD<MatrixType>::computeSingVecs
for (Index k = 0; k < n; ++k)
{
- if (zhat(k) == 0)
+ if (zhat(k) == Literal(0))
{
U.col(k) = VectorType::Unit(n+1, k);
if (m_compV) V.col(k) = VectorType::Unit(n, k);
@@ -947,7 +1035,7 @@ void BDCSVD<MatrixType>::computeSingVecs
Index i = perm(l);
U(i,k) = zhat(i)/(((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k]));
}
- U(n,k) = 0;
+ U(n,k) = Literal(0);
U.col(k).normalize();
if (m_compV)
@@ -958,7 +1046,7 @@ void BDCSVD<MatrixType>::computeSingVecs
Index i = perm(l);
V(i,k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k]));
}
- V(0,k) = -1;
+ V(0,k) = Literal(-1);
V.col(k).normalize();
}
}
@@ -971,7 +1059,7 @@ void BDCSVD<MatrixType>::computeSingVecs
// i >= 1, di almost null and zi non null.
// We use a rotation to zero out zi applied to the left of M
template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size)
+void BDCSVD<MatrixType>::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size)
{
using std::abs;
using std::sqrt;
@@ -979,15 +1067,15 @@ void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index
Index start = firstCol + shift;
RealScalar c = m_computed(start, start);
RealScalar s = m_computed(start+i, start);
- RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s));
- if (r == 0)
+ RealScalar r = numext::hypot(c,s);
+ if (r == Literal(0))
{
- m_computed(start+i, start+i) = 0;
+ m_computed(start+i, start+i) = Literal(0);
return;
}
m_computed(start,start) = r;
- m_computed(start+i, start) = 0;
- m_computed(start+i, start+i) = 0;
+ m_computed(start+i, start) = Literal(0);
+ m_computed(start+i, start+i) = Literal(0);
JacobiRotation<RealScalar> J(c/r,-s/r);
if (m_compU) m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J);
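
Replacing sqrt(abs2(c)+abs2(s)) by numext::hypot(c,s) above yields the same magnitude without overflowing for large inputs; the standard-library analogue shows the failure mode being avoided (values illustrative):

#include <cmath>
#include <cstdio>

int main()
{
  double c = 1.0e200, s = 1.0e200;
  std::printf("naive : %g\n", std::sqrt(c * c + s * s));  // c*c overflows -> inf
  std::printf("hypot : %g\n", std::hypot(c, s));          // ~1.4142e200
}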
@@ -1000,7 +1088,7 @@ void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index
// We apply two rotations to have zj = 0;
// TODO deflation44 is still broken and not properly tested
template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size)
+void BDCSVD<MatrixType>::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size)
{
using std::abs;
using std::sqrt;
@@ -1020,16 +1108,16 @@ void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index fi
<< m_computed(firstColm + i+1, firstColm+i+1) << " "
<< m_computed(firstColm + i+2, firstColm+i+2) << "\n";
#endif
- if (r==0)
+ if (r==Literal(0))
{
m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
return;
}
c/=r;
s/=r;
- m_computed(firstColm + i, firstColm) = r;
+ m_computed(firstColm + i, firstColm) = r;
m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
- m_computed(firstColm + j, firstColm) = 0;
+ m_computed(firstColm + j, firstColm) = Literal(0);
JacobiRotation<RealScalar> J(c,-s);
if (m_compU) m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J);
@@ -1040,7 +1128,7 @@ void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index fi
// acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift)
+void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
{
using std::sqrt;
using std::abs;
@@ -1053,7 +1141,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
- RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
+ RealScalar epsilon_coarse = Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
assert(m_naiveU.allFinite());
@@ -1081,7 +1169,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << " (diag(" << i << ")=" << diag(i) << ")\n";
#endif
- col0(i) = 0;
+ col0(i) = Literal(0);
}
//condition 4.3
@@ -1101,6 +1189,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
#endif
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
std::cout << "to be sorted: " << diag.transpose() << "\n\n";
+ std::cout << " : " << col0.transpose() << "\n\n";
#endif
{
// Check for total deflation
@@ -1191,7 +1280,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
{
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
- std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*diag(i) << "\n";
+ std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*/*diag(i)*/maxDiag << "\n";
#endif
eigen_internal_assert(abs(diag(i) - diag(i-1))<epsilon_coarse && " diagonal entries are not properly sorted");
deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length);
@@ -1210,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
#endif
}//end deflation
-#ifndef __CUDACC__
+#if !defined(EIGEN_GPUCC)
/** \svd_module
*
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 43488b1e0..1c7c80376 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -610,7 +610,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
};
template<typename MatrixType, int QRPreconditioner>
-void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions)
+void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions)
{
eigen_assert(rows >= 0 && cols >= 0);
diff --git a/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
index 50272154f..ff0516f61 100644
--- a/Eigen/src/SVD/JacobiSVD_LAPACKE.h
+++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@@ -61,9 +61,10 @@ JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPiv
u = (LAPACKE_TYPE*)m_matrixU.data(); \
} else { ldu=1; u=&dummy; }\
MatrixType localV; \
- ldvt = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
+ lapack_int vt_rows = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \
if (computeV()) { \
- localV.resize(ldvt, m_cols); \
+ localV.resize(vt_rows, m_cols); \
+ ldvt = internal::convert_index<lapack_int>(localV.outerStride()); \
vt = (LAPACKE_TYPE*)localV.data(); \
} else { ldvt=1; vt=&dummy; }\
Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index cc90a3b75..429414797 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -212,7 +212,6 @@ public:
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index 0b1460894..997defc47 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat,
.makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]);
// apply householder transform to remaining part of mat on the left
mat.bottomRightCorner(remainingRows-1, remainingCols)
- .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData);
+ .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData);
}
}
@@ -159,6 +159,8 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
traits<MatrixType>::Flags & RowMajorBit> > Y)
{
typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef typename NumTraits<RealScalar>::Literal Literal;
enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
typedef InnerStride<int(StorageOrder) == int(ColMajor) ? 1 : Dynamic> ColInnerStride;
typedef InnerStride<int(StorageOrder) == int(ColMajor) ? Dynamic : 1> RowInnerStride;
@@ -200,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
{
SubColumnType y_k( Y.col(k).tail(remainingCols) );
- // let's use the begining of column k of Y as a temporary vector
+ // let's use the beginning of column k of Y as a temporary vector
SubColumnType tmp( Y.col(k).head(k) );
y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck
tmp.noalias() = V_k1.adjoint() * v_k;
@@ -229,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
{
SubColumnType x_k ( X.col(k).tail(remainingRows-1) );
- // let's use the begining of column k of X as a temporary vectors
+      // let's use the beginning of column k of X as temporary vectors
       // note that tmp0 and tmp1 overlap
SubColumnType tmp0 ( X.col(k).head(k) ),
tmp1 ( X.col(k).head(k+1) );
@@ -263,7 +265,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
SubMatType A10( A.block(bs,0, brows-bs,bs) );
SubMatType A01( A.block(0,bs, bs,bcols-bs) );
Scalar tmp = A01(bs-1,0);
- A01(bs-1,0) = 1;
+ A01(bs-1,0) = Literal(1);
A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint();
A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01;
A01(bs-1,0) = tmp;
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h
index 2907f6529..b9ca94bc3 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -101,7 +101,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
      *          \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 31e06995b..0aa92f8bc 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -5,7 +5,7 @@
/*
-NOTE: thes functions vave been adapted from the LDL library:
+NOTE: these functions have been adapted from the LDL library:
LDL Copyright (c) 2005 by Timothy A. Davis. All Rights Reserved.
@@ -122,7 +122,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
for(StorageIndex k = 0; k < size; ++k)
{
// compute nonzero pattern of kth row of L, in topological order
- y[k] = 0.0; // Y(0:k) is now all zero
+ y[k] = Scalar(0); // Y(0:k) is now all zero
StorageIndex top = size; // stack for pattern is empty
tags[k] = k; // mark node k as visited
m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L
@@ -146,12 +146,12 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
/* compute numerical values kth row of L (a sparse triangular solve) */
RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset; // get D(k,k), apply the shift function, and clear Y(k)
- y[k] = 0.0;
+ y[k] = Scalar(0);
for(; top < size; ++top)
{
Index i = pattern[top]; /* pattern[top:n-1] is pattern of L(:,k) */
Scalar yi = y[i]; /* get and clear Y(i) */
- y[i] = 0.0;
+ y[i] = Scalar(0);
/* the nonzero entry L(k,i) */
Scalar l_ki;
diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h
index 8a5cc91f2..e0295f2af 100644
--- a/Eigen/src/SparseCore/AmbiVector.h
+++ b/Eigen/src/SparseCore/AmbiVector.h
@@ -94,7 +94,7 @@ class AmbiVector
Index allocSize = m_allocatedElements * sizeof(ListEl);
allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
Scalar* newBuffer = new Scalar[allocSize];
- memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
+ std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
delete[] m_buffer;
m_buffer = newBuffer;
}
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 492eb0a29..9db119b67 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -17,7 +17,9 @@ namespace internal {
template<typename Lhs, typename Rhs, typename ResultType>
static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false)
{
- typedef typename remove_all<Lhs>::type::Scalar Scalar;
+ typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+ typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+ typedef typename remove_all<ResultType>::type::Scalar ResScalar;
// make sure to call innerSize/outerSize since we fake the storage order.
Index rows = lhs.innerSize();
@@ -25,7 +27,7 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
eigen_assert(lhs.outerSize() == rhs.innerSize());
ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0);
- ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0);
+ ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0);
ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0);
std::memset(mask,0,sizeof(bool)*rows);
@@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
Index nnz = 0;
for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
{
- Scalar y = rhsIt.value();
+ RhsScalar y = rhsIt.value();
Index k = rhsIt.index();
for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
{
Index i = lhsIt.index();
- Scalar x = lhsIt.value();
+ LhsScalar x = lhsIt.value();
if(!mask[i])
{
mask[i] = true;
@@ -166,11 +168,12 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
- RowMajorMatrix rhsRow = rhs;
- RowMajorMatrix resRow(lhs.rows(), rhs.cols());
- internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
- res = resRow;
+ typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRhs;
+ typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+ RowMajorRhs rhsRow = rhs;
+ RowMajorRes resRow(lhs.rows(), rhs.cols());
+ internal::conservative_sparse_sparse_product_impl<RowMajorRhs,Lhs,RowMajorRes>(rhsRow, lhs, resRow);
+ res = resRow;
}
};
@@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
- RowMajorMatrix lhsRow = lhs;
- RowMajorMatrix resRow(lhs.rows(), rhs.cols());
- internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
+ typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorLhs;
+ typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorRes;
+ RowMajorLhs lhsRow = lhs;
+ RowMajorRes resRow(lhs.rows(), rhs.cols());
+ internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorLhs,RowMajorRes>(rhs, lhsRow, resRow);
res = resRow;
}
};
@@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
- ColMajorMatrix lhsCol = lhs;
- ColMajorMatrix resCol(lhs.rows(), rhs.cols());
- internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol);
+ typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+ typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+ ColMajorLhs lhsCol = lhs;
+ ColMajorRes resCol(lhs.rows(), rhs.cols());
+ internal::conservative_sparse_sparse_product_impl<ColMajorLhs,Rhs,ColMajorRes>(lhsCol, rhs, resCol);
res = resCol;
}
};
@@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
- ColMajorMatrix rhsCol = rhs;
- ColMajorMatrix resCol(lhs.rows(), rhs.cols());
- internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
+ typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
+ typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRes;
+ ColMajorRhs rhsCol = rhs;
+ ColMajorRes resCol(lhs.rows(), rhs.cols());
+ internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorRhs,ColMajorRes>(lhs, rhsCol, resCol);
res = resCol;
}
};
@@ -263,7 +269,8 @@ namespace internal {
template<typename Lhs, typename Rhs, typename ResultType>
static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef typename remove_all<Lhs>::type::Scalar Scalar;
+ typedef typename remove_all<Lhs>::type::Scalar LhsScalar;
+ typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
Index cols = rhs.outerSize();
eigen_assert(lhs.outerSize() == rhs.innerSize());
@@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs,
{
for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
{
- Scalar y = rhsIt.value();
+ RhsScalar y = rhsIt.value();
Index k = rhsIt.index();
for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
{
Index i = lhsIt.index();
- Scalar x = lhsIt.value();
+ LhsScalar x = lhsIt.value();
res.coeffRef(i,j) += x * y;
}
}
@@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMa
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
- ColMajorMatrix lhsCol(lhs);
- internal::sparse_sparse_to_dense_product_impl<ColMajorMatrix,Rhs,ResultType>(lhsCol, rhs, res);
+ typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorLhs;
+ ColMajorLhs lhsCol(lhs);
+ internal::sparse_sparse_to_dense_product_impl<ColMajorLhs,Rhs,ResultType>(lhsCol, rhs, res);
}
};
@@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMa
{
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
- ColMajorMatrix rhsCol(rhs);
- internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorMatrix,ResultType>(lhs, rhsCol, res);
+ typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorRhs;
+ ColMajorRhs rhsCol(rhs);
+ internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorRhs,ResultType>(lhs, rhsCol, res);
}
};
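
These kernels run when a product of two sparse operands is assigned directly to a dense matrix; the refactoring above only tracks the left, right and result scalar types separately so that operands with different scalar types can be handled. A minimal sketch of the dense-destination path (sizes illustrative):

#include <Eigen/Sparse>
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::SparseMatrix<double> A(100, 100), B(100, 100);
  A.setIdentity();
  B.setIdentity();

  // Assigning the sparse*sparse product to a dense matrix selects the
  // sparse_sparse_to_dense_product path patched above.
  Eigen::MatrixXd C = A * B;
  std::cout << C.diagonal().sum() << "\n";  // 100 for two identity factors
}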
diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h
index 18352a847..113463258 100644
--- a/Eigen/src/SparseCore/SparseAssign.h
+++ b/Eigen/src/SparseCore/SparseAssign.h
@@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
// eval without temporary
dst.resize(src.rows(), src.cols());
dst.setZero();
- dst.reserve((std::max)(src.rows(),src.cols())*2);
+ dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
for (Index j=0; j<outerEvaluationSize; ++j)
{
dst.startVec(j);
@@ -107,7 +107,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
DstXprType temp(src.rows(), src.cols());
- temp.reserve((std::max)(src.rows(),src.cols())*2);
+ temp.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
for (Index j=0; j<outerEvaluationSize; ++j)
{
temp.startVec(j);
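
Both reserve() calls are now capped by the total number of coefficients the destination can hold, which matters for very small or very skinny matrices. A quick numeric illustration of the bound (values illustrative):

#include <algorithm>
#include <cstdio>

int main()
{
  // For a 1x3 destination the old heuristic reserved 2*max(1,3) = 6 slots,
  // more than the 3 coefficients the matrix can possibly contain.
  long rows = 1, cols = 3;
  long oldReserve = (std::max)(rows, cols) * 2;
  long newReserve = (std::min)(rows * cols, (std::max)(rows, cols) * 2);
  std::printf("old = %ld, new = %ld\n", oldReserve, newReserve);
}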
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 511e92b2f..5ed7c437b 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -326,46 +326,6 @@ private:
//----------
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
- * is col-major (resp. row-major).
- */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer)
-{ return InnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
- * is col-major (resp. row-major). Read-only.
- */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer) const
-{ return ConstInnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
- * is col-major (resp. row-major).
- */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorsReturnType
-SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
-{
- return Block<Derived,Dynamic,Dynamic,true>(derived(),
- IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
- IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
- * is col-major (resp. row-major). Read-only.
- */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorsReturnType
-SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
-{
- return Block<const Derived,Dynamic,Dynamic,true>(derived(),
- IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
- IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
/** Generic implementation of sparse Block expression.
  * Read-only.
*/
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 0547db596..f005a18a1 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -88,10 +88,11 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, A
typedef typename internal::remove_all<SparseLhsType>::type Lhs;
typedef typename internal::remove_all<DenseRhsType>::type Rhs;
typedef typename internal::remove_all<DenseResType>::type Res;
- typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+ typedef evaluator<Lhs> LhsEval;
+ typedef typename LhsEval::InnerIterator LhsInnerIterator;
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
{
- evaluator<Lhs> lhsEval(lhs);
+ LhsEval lhsEval(lhs);
for(Index c=0; c<rhs.cols(); ++c)
{
for(Index j=0; j<lhs.outerSize(); ++j)
@@ -111,17 +112,38 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
typedef typename internal::remove_all<SparseLhsType>::type Lhs;
typedef typename internal::remove_all<DenseRhsType>::type Rhs;
typedef typename internal::remove_all<DenseResType>::type Res;
- typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+ typedef evaluator<Lhs> LhsEval;
+ typedef typename LhsEval::InnerIterator LhsInnerIterator;
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
{
- evaluator<Lhs> lhsEval(lhs);
- for(Index j=0; j<lhs.outerSize(); ++j)
+ Index n = lhs.rows();
+ LhsEval lhsEval(lhs);
+
+#ifdef EIGEN_HAS_OPENMP
+ Eigen::initParallel();
+ Index threads = Eigen::nbThreads();
+ // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+    // It roughly represents the minimal amount of work required for the parallelization to pay off.
+ if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
{
- typename Res::RowXpr res_j(res.row(j));
- for(LhsInnerIterator it(lhsEval,j); it ;++it)
- res_j += (alpha*it.value()) * rhs.row(it.index());
+ #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
+ for(Index i=0; i<n; ++i)
+ processRow(lhsEval,rhs,res,alpha,i);
+ }
+ else
+#endif
+ {
+ for(Index i=0; i<n; ++i)
+ processRow(lhsEval, rhs, res, alpha, i);
}
}
+
+ static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha, Index i)
+ {
+ typename Res::RowXpr res_i(res.row(i));
+ for(LhsInnerIterator it(lhsEval,i); it ;++it)
+ res_i += (alpha*it.value()) * rhs.row(it.index());
+ }
};
template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
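
The rewritten row-major kernel above splits its per-row loop across OpenMP threads once the estimated work exceeds the experimentally chosen 20000 threshold. A minimal sketch of a program that can take that path when built with OpenMP support (sizes and thread count are illustrative):

#include <Eigen/Sparse>
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::initParallel();    // required once when Eigen is used from several threads
  Eigen::setNbThreads(4);   // otherwise the OpenMP default applies

  Eigen::SparseMatrix<double, Eigen::RowMajor> A(1000, 1000);
  A.setIdentity();          // placeholder content; any sufficiently dense pattern works
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(1000, 64);

  Eigen::MatrixXd C = A * B;  // row-major sparse * dense, the kernel patched above
  std::cout << C.norm() << "\n";
}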
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 323c2323b..8bfa5f6b8 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -21,7 +21,7 @@ namespace Eigen {
  * This class implements a more versatile variant of the common \em compressed row/column storage format.
  * Each column's (resp. row's) non zeros are stored as pairs of values with their associated row (resp. column) indices.
* All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra
- * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
+ * space in between the nonzeros of two successive columns (resp. rows) such that insertion of a new non-zero
* can be done with limited memory reallocation and copies.
*
* A call to the function makeCompressed() turns the matrix into the standard \em compressed format
@@ -503,7 +503,7 @@ class SparseMatrix
}
}
- /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */
+ /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
{
prune(default_prunning_func(reference,epsilon));
@@ -604,9 +604,9 @@ class SparseMatrix
m_outerIndex = newOuterIndex;
if (outerChange > 0)
{
- StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
+ StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)
- m_outerIndex[i] = last;
+ m_outerIndex[i] = lastIdx;
}
m_outerSize += outerChange;
}
@@ -986,7 +986,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
*
* \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define
* an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
- * be explicitely stored into a std::vector for instance.
+ * be explicitly stored into a std::vector for instance.
*/
template<typename Scalar, int _Options, typename _StorageIndex>
template<typename InputIterators>
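Usage sketch, not part of the patch, for the two documented entry points touched above, prune() and setFromTriplets() (values are arbitrary; per the warning, the triplets are stored explicitly in a std::vector):

    #include <Eigen/Sparse>
    #include <vector>

    int main()
    {
      typedef Eigen::Triplet<double> T;
      std::vector<T> triplets;
      triplets.push_back(T(0,0, 1.0));
      triplets.push_back(T(1,2, 1e-12));
      triplets.push_back(T(2,1, 3.0));

      Eigen::SparseMatrix<double> A(3,3);
      A.setFromTriplets(triplets.begin(), triplets.end());

      A.prune(1.0, 1e-9);   // drops entries much smaller than reference*epsilon, e.g. the 1e-12 above
      return 0;
    }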
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index c6b548f11..229449f02 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -87,6 +87,11 @@ template<typename Derived> class SparseMatrixBase
* we are dealing with a column-vector (if there is only one column) or with
* a row-vector (if there is only one row). */
+ NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+ /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+ * and 2 for matrices.
+ */
+
Flags = internal::traits<Derived>::Flags,
/**< This stores expression \ref flags flags which may or may not be inherited by new expressions
* constructed from this one. See the \ref flags "list of flags".
@@ -350,18 +355,6 @@ template<typename Derived> class SparseMatrixBase
const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); }
- // inner-vector
- typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> InnerVectorReturnType;
- typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
- InnerVectorReturnType innerVector(Index outer);
- const ConstInnerVectorReturnType innerVector(Index outer) const;
-
- // set of inner-vectors
- typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
- typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
- InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize);
- const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const;
-
DenseMatrixType toDense() const
{
return DenseMatrixType(derived());
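The new NumDimensions enum mirrors Tensor::NumDimensions for sparse expressions. A compile-time check, assuming C++11 and that the enum is inherited by the concrete sparse types as in the hunk above:

    #include <Eigen/Sparse>

    static_assert(int(Eigen::SparseMatrix<double>::NumDimensions) == 2,
                  "a sparse matrix is 2-dimensional");
    static_assert(int(Eigen::SparseVector<double>::NumDimensions) == 1,
                  "a sparse vector is 1-dimensional");

    int main() { return 0; }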
diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h
index 4cbf68781..c495a7398 100644
--- a/Eigen/src/SparseCore/SparseProduct.h
+++ b/Eigen/src/SparseCore/SparseProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
* The automatic pruning of the small values can be achieved by calling the pruned() function
* in which case a totally different product algorithm is employed:
* \code
- * C = (A*B).pruned(); // supress numerical zeros (exact)
+ * C = (A*B).pruned(); // suppress numerical zeros (exact)
* C = (A*B).pruned(ref);
* C = (A*B).pruned(ref,epsilon);
* \endcode
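A short sketch of the pruned-product API documented above (matrices and tolerances are arbitrary):

    #include <Eigen/Sparse>

    int main()
    {
      Eigen::SparseMatrix<double> A(100,100), B(100,100), C;
      A.setIdentity();
      B.setIdentity();
      C = (A * B).pruned();             // suppress exact numerical zeros
      C = (A * B).pruned(1.0, 1e-9);    // prune relative to a reference value and tolerance
      return 0;
    }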
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 9e39be738..65611b3d4 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -47,6 +47,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
enum {
Mode = _Mode,
+ TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0),
RowsAtCompileTime = internal::traits<SparseSelfAdjointView>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<SparseSelfAdjointView>::ColsAtCompileTime
};
@@ -310,7 +311,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons
while (i && i.index()<j) ++i;
if(i && i.index()==j)
{
- res(j,k) += alpha * i.value() * rhs(j,k);
+ res.coeffRef(j,k) += alpha * i.value() * rhs.coeff(j,k);
++i;
}
}
@@ -323,14 +324,14 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons
{
LhsScalar lhs_ij = i.value();
if(!LhsIsRowMajor) lhs_ij = numext::conj(lhs_ij);
- res_j += lhs_ij * rhs(i.index(),k);
+ res_j += lhs_ij * rhs.coeff(i.index(),k);
res(i.index(),k) += numext::conj(lhs_ij) * rhs_j;
}
- res(j,k) += alpha * res_j;
+ res.coeffRef(j,k) += alpha * res_j;
// handle diagonal coeff
if (ProcessFirstHalf && i && (i.index()==j))
- res(j,k) += alpha * i.value() * rhs(j,k);
+ res.coeffRef(j,k) += alpha * i.value() * rhs.coeff(j,k);
}
}
}
@@ -368,7 +369,7 @@ struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, Pr
// transpose everything
Transpose<Dest> dstT(dst);
- internal::sparse_selfadjoint_time_dense_product<RhsView::Mode>(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
+ internal::sparse_selfadjoint_time_dense_product<RhsView::TransposeMode>(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
}
};
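The TransposeMode flag added above lets the dense * selfadjoint-sparse case be mapped onto the transposed kernel with the correct triangle. A sketch, not part of the patch, of products that reach sparse_selfadjoint_time_dense_product (sizes are arbitrary; only one triangle of A needs to be stored):

    #include <Eigen/Sparse>

    int main()
    {
      Eigen::SparseMatrix<double> A(50,50);
      A.setIdentity();
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(50,4);
      Eigen::MatrixXd C1 = A.selfadjointView<Eigen::Lower>() * B;              // sparse-selfadjoint * dense
      Eigen::MatrixXd C2 = B.transpose() * A.selfadjointView<Eigen::Lower>();  // dense * sparse-selfadjoint, transposed path
      return 0;
    }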
diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index 21c419002..88820a48f 100644
--- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -21,7 +21,8 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
{
// return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res);
- typedef typename remove_all<Lhs>::type::Scalar Scalar;
+ typedef typename remove_all<Rhs>::type::Scalar RhsScalar;
+ typedef typename remove_all<ResultType>::type::Scalar ResScalar;
typedef typename remove_all<Lhs>::type::StorageIndex StorageIndex;
// make sure to call innerSize/outerSize since we fake the storage order.
@@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
eigen_assert(lhs.outerSize() == rhs.innerSize());
// allocate a temporary buffer
- AmbiVector<Scalar,StorageIndex> tempVector(rows);
+ AmbiVector<ResScalar,StorageIndex> tempVector(rows);
// mimics a resizeByInnerOuter:
if(ResultType::IsRowMajor)
@@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
{
// FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
tempVector.restart();
- Scalar x = rhsIt.value();
+ RhsScalar x = rhsIt.value();
for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt)
{
tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
}
}
res.startVec(j);
- for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
+ for (typename AmbiVector<ResScalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
res.insertBackByOuterInner(j,it.index()) = it.value();
}
res.finalize();
@@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector;
template<typename Lhs, typename Rhs, typename ResultType>
struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
{
- typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
@@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+ typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+ typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
ColMajorMatrixLhs colLhs(lhs);
ColMajorMatrixRhs colRhs(rhs);
internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance);
@@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,R
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
+ typedef SparseMatrix<typename Lhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
RowMajorMatrixLhs rowLhs(lhs);
sparse_sparse_product_with_pruning_selector<RowMajorMatrixLhs,Rhs,ResultType,RowMajor,RowMajor>(rowLhs,rhs,res,tolerance);
}
@@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,C
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
+ typedef SparseMatrix<typename Rhs::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
RowMajorMatrixRhs rowRhs(rhs);
sparse_sparse_product_with_pruning_selector<Lhs,RowMajorMatrixRhs,ResultType,RowMajor,RowMajor,RowMajor>(lhs,rowRhs,res,tolerance);
}
@@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,R
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+ typedef SparseMatrix<typename Rhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
ColMajorMatrixRhs colRhs(rhs);
internal::sparse_sparse_product_with_pruning_impl<Lhs,ColMajorMatrixRhs,ResultType>(lhs, colRhs, res, tolerance);
}
@@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,C
typedef typename ResultType::RealScalar RealScalar;
static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
{
- typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+ typedef SparseMatrix<typename Lhs::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
ColMajorMatrixLhs colLhs(lhs);
internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,Rhs,ResultType>(colLhs, rhs, res, tolerance);
}
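The remaining specializations dispatch on the storage orders of the operands and of the result, converting to a common order before calling the pruning kernel. A sketch with mixed storage orders (scalar types kept identical here; the RhsScalar/ResScalar split above additionally allows operand and result scalars to differ):

    #include <Eigen/Sparse>

    int main()
    {
      Eigen::SparseMatrix<double,Eigen::RowMajor> A(100,100);
      Eigen::SparseMatrix<double,Eigen::ColMajor> B(100,100);
      A.setIdentity();
      B.setIdentity();
      Eigen::SparseMatrix<double> C = (A * B).pruned(1.0, 1e-12);  // goes through a RowMajor/ColMajor selector above
      return 0;
    }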
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index 19b0fbc9d..05779be68 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -281,7 +281,7 @@ class SparseVector
}
/** Swaps the values of \c *this and \a other.
- * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only.
+ * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only.
* \sa SparseMatrixBase::swap()
*/
inline void swap(SparseVector& other)
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index f883ab383..91b6369ac 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -193,7 +193,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
* \c InvalidInput if the input matrix is invalid
*
@@ -499,11 +499,8 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
eigen_assert((matrix.rows() == matrix.cols()) && "Only for square matrices");
- typedef typename IndexVector::Scalar StorageIndex;
-
m_isInitialized = true;
-
// Apply the column permutation computed in analyzepattern()
// m_mat = matrix * m_perm_c.inverse();
m_mat = matrix;
@@ -706,8 +703,8 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator
typedef typename MappedSupernodalType::Scalar Scalar;
explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
{ }
- Index rows() { return m_mapL.rows(); }
- Index cols() { return m_mapL.cols(); }
+ Index rows() const { return m_mapL.rows(); }
+ Index cols() const { return m_mapL.cols(); }
template<typename Dest>
void solveInPlace( MatrixBase<Dest> &X) const
{
@@ -723,8 +720,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
: m_mapL(mapL),m_mapU(mapU)
{ }
- Index rows() { return m_mapL.rows(); }
- Index cols() { return m_mapL.cols(); }
+ Index rows() const { return m_mapL.rows(); }
+ Index cols() const { return m_mapL.cols(); }
template<typename Dest> void solveInPlace(MatrixBase<Dest> &X) const
{
@@ -747,8 +744,9 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
}
else
{
+ // FIXME: the following lines should use Block expressions and not Map!
Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
- Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+ Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
U = A.template triangularView<Upper>().solve(U);
}
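Typical SparseLU usage exercising info() and the solve path touched above (matrix values are arbitrary; COLAMDOrdering is the default ordering):

    #include <Eigen/Sparse>
    #include <Eigen/SparseLU>

    int main()
    {
      Eigen::SparseMatrix<double> A(3,3);
      A.insert(0,0) = 4; A.insert(1,1) = 2; A.insert(2,2) = 1; A.insert(0,2) = 1;
      A.makeCompressed();

      Eigen::SparseLU<Eigen::SparseMatrix<double>, Eigen::COLAMDOrdering<int> > lu(A);
      if(lu.info() != Eigen::Success) return 1;   // Success / NumericalIssue / InvalidInput

      Eigen::VectorXd x = lu.solve(Eigen::VectorXd::Ones(3));
      return 0;
    }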
diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h
index 4dc42e87b..349bfd585 100644
--- a/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w)
/**
- * Expand the existing storage to accomodate more fill-ins
+ * Expand the existing storage to accommodate more fill-ins
* \param vec Valid pointer to the vector to allocate or expand
* \param[in,out] length At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector
* \param[in] nbElts Current number of elements in the factors
diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index 721e1883b..8583b1b69 100644
--- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -75,12 +75,12 @@ class MappedSuperNodalMatrix
/**
* Number of rows
*/
- Index rows() { return m_row; }
+ Index rows() const { return m_row; }
/**
* Number of columns
*/
- Index cols() { return m_col; }
+ Index cols() const { return m_col; }
/**
* Return the array of nonzero values packed by column
diff --git a/Eigen/src/SparseLU/SparseLU_column_dfs.h b/Eigen/src/SparseLU/SparseLU_column_dfs.h
index c98b30e32..5a2c941b4 100644
--- a/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -151,7 +151,7 @@ Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index j
StorageIndex ito = glu.xlsub(fsupc+1);
glu.xlsub(jcolm1) = ito;
StorageIndex istop = ito + jptr - jm1ptr;
- xprune(jcolm1) = istop; // intialize xprune(jcol-1)
+ xprune(jcolm1) = istop; // initialize xprune(jcol-1)
glu.xlsub(jcol) = istop;
for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
@@ -166,7 +166,7 @@ Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index j
// Tidy up the pointers before exit
glu.xsup(nsuper+1) = jcolp1;
glu.supno(jcolp1) = nsuper;
- xprune(jcol) = StorageIndex(nextl); // Intialize upper bound for pruning
+ xprune(jcol) = StorageIndex(nextl); // Initialize upper bound for pruning
glu.xlsub(jcolp1) = StorageIndex(nextl);
return 0;
diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index 95ba7413f..e37c2fe0d 100644
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\
pstore(C0+i+(I)*PacketSize, c0);
- // agressive vectorization and peeling
+ // aggressive vectorization and peeling
for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
{
EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index 822cf32c3..f052001c8 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -38,7 +38,7 @@ namespace internal {
* \brief Performs numeric block updates (sup-panel) in topological order.
*
* Before entering this routine, the original nonzeros in the panel
- * were already copied i nto the spa[m,w]
+ * were already copied into the spa[m,w]
*
* \param m number of rows in the matrix
* \param w Panel size
diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index 2d4498b03..1a28389e8 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -52,7 +52,7 @@ namespace internal {
* rank-revealing permutations. Use colsPermutation() to get it.
*
* Q is the orthogonal matrix represented as products of Householder reflectors.
- * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+ * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint.
* You can then apply it to a vector.
*
* R is the sparse triangular or trapezoidal matrix. The latter occurs when A is rank-deficient.
@@ -65,6 +65,7 @@ namespace internal {
* \implsparsesolverconcept
*
* \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
+ * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix.
*
*/
template<typename _MatrixType, typename _OrderingType>
@@ -196,9 +197,9 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
Index rank = this->rank();
- // Compute Q^T * b;
+ // Compute Q^* * b;
typename Dest::PlainObject y, b;
- y = this->matrixQ().transpose() * B;
+ y = this->matrixQ().adjoint() * B;
b = y;
// Solve with the triangular matrix R
@@ -604,7 +605,7 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
// Get the references
SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) :
m_qr(qr),m_other(other),m_transpose(transpose) {}
- inline Index rows() const { return m_transpose ? m_qr.rows() : m_qr.cols(); }
+ inline Index rows() const { return m_qr.matrixQ().rows(); }
inline Index cols() const { return m_other.cols(); }
// Assign to a vector
@@ -632,16 +633,20 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
}
else
{
- eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
+ eigen_assert(m_qr.matrixQ().cols() == m_other.rows() && "Non conforming object sizes");
+
+ res.conservativeResize(rows(), cols());
+
// Compute res = Q * other column by column
for(Index j = 0; j < res.cols(); j++)
{
- for (Index k = diagSize-1; k >=0; k--)
+ Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1;
+ for (Index k = start_k; k >=0; k--)
{
Scalar tau = Scalar(0);
tau = m_qr.m_Q.col(k).dot(res.col(j));
if(tau==Scalar(0)) continue;
- tau = tau * m_qr.m_hcoeffs(k);
+ tau = tau * numext::conj(m_qr.m_hcoeffs(k));
res.col(j) -= tau * m_qr.m_Q.col(k);
}
}
@@ -650,7 +655,7 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
const SparseQRType& m_qr;
const Derived& m_other;
- bool m_transpose;
+ bool m_transpose; // TODO this actually means adjoint
};
template<typename SparseQRType>
@@ -668,13 +673,14 @@ struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<Sp
{
return SparseQR_QProduct<SparseQRType,Derived>(m_qr,other.derived(),false);
}
+ // To use for operations with the adjoint of Q
SparseQRMatrixQTransposeReturnType<SparseQRType> adjoint() const
{
return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
}
inline Index rows() const { return m_qr.rows(); }
- inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); }
- // To use for operations with the transpose of Q
+ inline Index cols() const { return m_qr.rows(); }
+ // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment
SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const
{
return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
@@ -682,6 +688,7 @@ struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<Sp
const SparseQRType& m_qr;
};
+// TODO this actually represents the adjoint of Q
template<typename SparseQRType>
struct SparseQRMatrixQTransposeReturnType
{
@@ -712,7 +719,7 @@ struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal:
typedef typename DstXprType::StorageIndex StorageIndex;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
{
- typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows());
+ typename DstXprType::PlainObject idMat(src.rows(), src.cols());
idMat.setIdentity();
// Sort the sparse householder reflectors if needed
const_cast<SparseQRType *>(&src.m_qr)->_sort_matrix_Q();
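The fixes above conjugate the Householder coefficients for complex scalars and make matrixQ() behave as a full square operator. A least-squares sketch with a complex matrix, not part of the patch (entries are arbitrary):

    #include <Eigen/Sparse>
    #include <Eigen/SparseQR>

    int main()
    {
      Eigen::SparseMatrix<std::complex<double> > A(4,3);
      A.insert(0,0) = 1.0; A.insert(1,1) = 2.0; A.insert(2,2) = 3.0; A.insert(3,0) = 1.0;
      A.makeCompressed();   // SparseQR requires compressed storage

      Eigen::SparseQR<Eigen::SparseMatrix<std::complex<double> >, Eigen::COLAMDOrdering<int> > qr(A);
      Eigen::VectorXcd b = Eigen::VectorXcd::Ones(4);
      Eigen::VectorXcd y = qr.matrixQ().adjoint() * b;   // for complex scalars, transpose() also applies the adjoint
      Eigen::VectorXcd x = qr.solve(b);                  // least-squares solution
      return 0;
    }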
diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h
index cf1fedf92..045da7b4d 100644
--- a/Eigen/src/StlSupport/StdDeque.h
+++ b/Eigen/src/StlSupport/StdDeque.h
@@ -36,7 +36,7 @@ namespace std \
deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
deque(const deque& c) : deque_base(c) {} \
explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
- deque(iterator start, iterator end) : deque_base(start, end) {} \
+ deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \
deque& operator=(const deque& x) { \
deque_base::operator=(x); \
return *this; \
@@ -62,7 +62,7 @@ namespace std {
: deque_base(first, last, a) {} \
deque(const deque& c) : deque_base(c) {} \
explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
- deque(iterator start, iterator end) : deque_base(start, end) {} \
+ deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \
deque& operator=(const deque& x) { \
deque_base::operator=(x); \
return *this; \
diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h
index e1eba4985..8ba3fada0 100644
--- a/Eigen/src/StlSupport/StdList.h
+++ b/Eigen/src/StlSupport/StdList.h
@@ -35,7 +35,7 @@ namespace std \
list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
list(const list& c) : list_base(c) {} \
explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
- list(iterator start, iterator end) : list_base(start, end) {} \
+ list(iterator start_, iterator end_) : list_base(start_, end_) {} \
list& operator=(const list& x) { \
list_base::operator=(x); \
return *this; \
@@ -62,7 +62,7 @@ namespace std
: list_base(first, last, a) {} \
list(const list& c) : list_base(c) {} \
explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
- list(iterator start, iterator end) : list_base(start, end) {} \
+ list(iterator start_, iterator end_) : list_base(start_, end_) {} \
list& operator=(const list& x) { \
list_base::operator=(x); \
return *this; \
diff --git a/Eigen/src/StlSupport/StdVector.h b/Eigen/src/StlSupport/StdVector.h
index ec22821d2..9fcf19bce 100644
--- a/Eigen/src/StlSupport/StdVector.h
+++ b/Eigen/src/StlSupport/StdVector.h
@@ -36,7 +36,7 @@ namespace std \
vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \
vector(const vector& c) : vector_base(c) {} \
explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
- vector(iterator start, iterator end) : vector_base(start, end) {} \
+ vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \
vector& operator=(const vector& x) { \
vector_base::operator=(x); \
return *this; \
@@ -62,7 +62,7 @@ namespace std {
: vector_base(first, last, a) {} \
vector(const vector& c) : vector_base(c) {} \
explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
- vector(iterator start, iterator end) : vector_base(start, end) {} \
+ vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \
vector& operator=(const vector& x) { \
vector_base::operator=(x); \
return *this; \
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 50a69f306..354e33de5 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -297,8 +297,8 @@ SluMatrix asSluMatrix(MatrixType& mat)
template<typename Scalar, int Flags, typename Index>
MappedSparseMatrix<Scalar,Flags,Index> map_superlu(SluMatrix& sluMat)
{
- eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR
- || (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC);
+ eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR)
+ || ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC));
Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow;
@@ -352,7 +352,7 @@ class SuperLUBase : public SparseSolverBase<Derived>
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 9568cc1d5..e3a333f80 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -10,6 +10,16 @@
#ifndef EIGEN_UMFPACKSUPPORT_H
#define EIGEN_UMFPACKSUPPORT_H
+// for compatibility with very old versions of umfpack,
+// not sure this is really needed, but it is harmless.
+#ifndef SuiteSparse_long
+#ifdef UF_long
+#define SuiteSparse_long UF_long
+#else
#error neither SuiteSparse_long nor UF_long is defined
+#endif
+#endif
+
namespace Eigen {
/* TODO extract L, extract U, compute det, etc... */
@@ -17,42 +27,85 @@ namespace Eigen {
// generic double/complex<double> wrapper functions:
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], double)
+ // Defaults
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int)
{ umfpack_di_defaults(control); }
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>)
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, int)
{ umfpack_zi_defaults(control); }
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double)
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
+{ umfpack_dl_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_defaults(control); }
+
+// Report info
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int)
{ umfpack_di_report_info(control, info);}
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>)
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, int)
{ umfpack_zi_report_info(control, info);}
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double)
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long)
+{ umfpack_dl_report_info(control, info);}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_info(control, info);}
+
+// Report status
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int)
{ umfpack_di_report_status(control, status);}
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>)
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, int)
{ umfpack_zi_report_status(control, status);}
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], double)
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long)
+{ umfpack_dl_report_status(control, status);}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_status(control, status);}
+
+// report control
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int)
{ umfpack_di_report_control(control);}
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>)
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, int)
{ umfpack_zi_report_control(control);}
-inline void umfpack_free_numeric(void **Numeric, double)
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
+{ umfpack_dl_report_control(control);}
+
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_control(control);}
+
+// Free numeric
+inline void umfpack_free_numeric(void **Numeric, double, int)
{ umfpack_di_free_numeric(Numeric); *Numeric = 0; }
-inline void umfpack_free_numeric(void **Numeric, std::complex<double>)
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, int)
{ umfpack_zi_free_numeric(Numeric); *Numeric = 0; }
-inline void umfpack_free_symbolic(void **Symbolic, double)
+inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long)
+{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; }
+
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; }
+
+// Free symbolic
+inline void umfpack_free_symbolic(void **Symbolic, double, int)
{ umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; }
-inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>)
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, int)
{ umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; }
+inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long)
+{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; }
+
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; }
+
+// Symbolic
inline int umfpack_symbolic(int n_row,int n_col,
const int Ap[], const int Ai[], const double Ax[], void **Symbolic,
const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
@@ -66,7 +119,21 @@ inline int umfpack_symbolic(int n_row,int n_col,
{
return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
}
+inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
+ const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic,
+ const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+ return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info);
+}
+inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
+ const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[], void **Symbolic,
+ const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+ return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
+}
+
+// Numeric
inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[],
void *Symbolic, void **Numeric,
const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
@@ -80,7 +147,21 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex<d
{
return umfpack_zi_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
}
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+ void *Symbolic, void **Numeric,
+ const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+ return umfpack_dl_numeric(Ap,Ai,Ax,Symbolic,Numeric,Control,Info);
+}
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
+ void *Symbolic, void **Numeric,
+ const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+ return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
+}
+
+// solve
inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[],
double X[], const double B[], void *Numeric,
const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
@@ -95,6 +176,21 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co
return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
}
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+ double X[], const double B[], void *Numeric,
+ const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+ return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info);
+}
+
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
+ std::complex<double> X[], const std::complex<double> B[], void *Numeric,
+ const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+ return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
+}
+
+// Get Lunz
inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double)
{
return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
@@ -105,6 +201,19 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_
return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
}
+inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
+ SuiteSparse_long *nz_udiag, void *Numeric, double)
+{
+ return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
+ SuiteSparse_long *nz_udiag, void *Numeric, std::complex<double>)
+{
+ return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+// Get Numeric
inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[],
int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric)
{
@@ -120,18 +229,45 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex<double> Lx[], in
return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
}
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[],
+ SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
+{
+ return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric);
+}
-inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex<double> Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex<double> Ux[],
+ SuiteSparse_long P[], SuiteSparse_long Q[], std::complex<double> Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
+{
+ double& lx0_real = numext::real_ref(Lx[0]);
+ double& ux0_real = numext::real_ref(Ux[0]);
+ double& dx0_real = numext::real_ref(Dx[0]);
+ return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
+ Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
+}
+
+// Get Determinant
+inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
{
return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info);
}
-inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
{
double& mx_real = numext::real_ref(*Mx);
return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
}
+inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
+{
+ return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info);
+}
+
+inline SuiteSparse_long umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
+{
+ double& mx_real = numext::real_ref(*Mx);
+ return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
+}
+
/** \ingroup UmfPackSupport_Module
* \brief A sparse LU factorization and solver based on UmfPack
@@ -164,7 +300,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
typedef SparseMatrix<Scalar> LUMatrixType;
- typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
+ typedef SparseMatrix<Scalar,ColMajor,StorageIndex> UmfpackMatrixType;
typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
enum {
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -192,8 +328,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
~UmfPackLU()
{
- if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
- if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
+ if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex());
+ if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex());
}
inline Index rows() const { return mp_matrix.rows(); }
@@ -201,7 +337,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
@@ -241,8 +377,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
template<typename InputMatrixType>
void compute(const InputMatrixType& matrix)
{
- if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
- if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
+ if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
+ if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
grab(matrix.derived());
analyzePattern_impl();
factorize_impl();
@@ -257,8 +393,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
template<typename InputMatrixType>
void analyzePattern(const InputMatrixType& matrix)
{
- if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
- if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
+ if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
+ if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
grab(matrix.derived());
@@ -309,7 +445,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
{
eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
if(m_numeric)
- umfpack_free_numeric(&m_numeric,Scalar());
+ umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
grab(matrix.derived());
@@ -322,7 +458,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
*/
void printUmfpackControl()
{
- umfpack_report_control(m_control.data(), Scalar());
+ umfpack_report_control(m_control.data(), Scalar(),StorageIndex());
}
/** Prints statistics collected by UmfPack.
@@ -332,7 +468,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
void printUmfpackInfo()
{
eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
- umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar());
+ umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex());
}
/** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization).
@@ -341,7 +477,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
*/
void printUmfpackStatus() {
eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
- umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar());
+ umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex());
}
/** \internal */
@@ -362,13 +498,13 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
m_symbolic = 0;
m_extractedDataAreDirty = true;
- umfpack_defaults(m_control.data(), Scalar());
+ umfpack_defaults(m_control.data(), Scalar(),StorageIndex());
}
void analyzePattern_impl()
{
- m_fact_errorCode = umfpack_symbolic(internal::convert_index<int>(mp_matrix.rows()),
- internal::convert_index<int>(mp_matrix.cols()),
+ m_fact_errorCode = umfpack_symbolic(internal::convert_index<StorageIndex>(mp_matrix.rows()),
+ internal::convert_index<StorageIndex>(mp_matrix.cols()),
mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
&m_symbolic, m_control.data(), m_umfpackInfo.data());
@@ -408,7 +544,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
// cached data to reduce reallocation, etc.
mutable LUMatrixType m_l;
- int m_fact_errorCode;
+ StorageIndex m_fact_errorCode;
UmfpackControl m_control;
mutable UmfpackInfo m_umfpackInfo;
@@ -438,7 +574,7 @@ void UmfPackLU<MatrixType>::extractData() const
if (m_extractedDataAreDirty)
{
// get size of the data
- int lnz, unz, rows, cols, nz_udiag;
+ StorageIndex lnz, unz, rows, cols, nz_udiag;
umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
// allocate data
@@ -464,7 +600,7 @@ template<typename MatrixType>
typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() const
{
Scalar det;
- umfpack_get_determinant(&det, 0, m_numeric, 0);
+ umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex());
return det;
}
@@ -477,7 +613,6 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
- int errorCode;
Scalar* x_ptr = 0;
Matrix<Scalar,Dynamic,1> x_tmp;
if(x.innerStride()!=1)
@@ -489,9 +624,10 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
{
if(x.innerStride()==1)
x_ptr = &x.col(j).coeffRef(0);
- errorCode = umfpack_solve(UMFPACK_A,
- mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
- x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data());
+ StorageIndex errorCode = umfpack_solve(UMFPACK_A,
+ mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+ x_ptr, &b.const_cast_derived().col(j).coeffRef(0),
+ m_numeric, m_control.data(), m_umfpackInfo.data());
if(x.innerStride()!=1)
x.col(j) = x_tmp;
if (errorCode!=0)
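The overloads added above route to the umfpack_dl_*/umfpack_zl_* entry points, so UmfPackLU can now be used with SuiteSparse_long (64-bit) indices as well as int. A hedged sketch, assuming UMFPACK is installed and linked and that SuiteSparse_long corresponds to long on the target platform:

    #include <Eigen/Sparse>
    #include <Eigen/UmfPackSupport>   // requires linking against UMFPACK

    int main()
    {
      typedef Eigen::SparseMatrix<double, Eigen::ColMajor, long> SpMat;  // StorageIndex = long
      SpMat A(3,3);
      A.insert(0,0) = 4; A.insert(1,1) = 2; A.insert(2,2) = 1;
      A.makeCompressed();

      Eigen::UmfPackLU<SpMat> lu(A);
      if(lu.info() != Eigen::Success) return 1;
      Eigen::VectorXd x = lu.solve(Eigen::VectorXd::Ones(3));
      double d = lu.determinant();
      return 0;
    }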
diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h
index 8c7e79b03..3d8e24f5a 100755
--- a/Eigen/src/misc/lapacke.h
+++ b/Eigen/src/misc/lapacke.h
@@ -43,10 +43,6 @@
#include "lapacke_config.h"
#endif
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
#include <stdlib.h>
#ifndef lapack_int
@@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im );
#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
#ifndef LAPACKE_malloc
#define LAPACKE_malloc( size ) malloc( size )
#endif
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 43615bd56..e928db467 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -21,6 +21,7 @@ typedef CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived> AcosReturn
typedef CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived> AsinReturnType;
typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
@@ -335,6 +336,15 @@ cosh() const
return CoshReturnType(derived());
}
+/** \returns an expression of the coefficient-wise logistic of *this.
+ */
+EIGEN_DEVICE_FUNC
+inline const LogisticReturnType
+logistic() const
+{
+ return LogisticReturnType(derived());
+}
+
/** \returns an expression of the coefficient-wise inverse of *this.
*
* Example: \include Cwise_inverse.cpp
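A short usage sketch of the new coefficient-wise logistic() declared above; it evaluates 1/(1+exp(-x)) element-wise (values are arbitrary):

    #include <Eigen/Core>
    #include <iostream>

    int main()
    {
      Eigen::ArrayXd x = Eigen::ArrayXd::LinSpaced(5, -2.0, 2.0);
      std::cout << x.logistic() << "\n";
      return 0;
    }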
diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h
index 5caf14469..67fdebc6f 100644
--- a/Eigen/src/plugins/BlockMethods.h
+++ b/Eigen/src/plugins/BlockMethods.h
@@ -40,6 +40,14 @@ typedef const VectorBlock<const Derived> ConstSegmentReturnType;
template<int Size> struct FixedSegmentReturnType { typedef VectorBlock<Derived, Size> Type; };
template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBlock<const Derived, Size> Type; };
+/// \internal inner-vector
+typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> InnerVectorReturnType;
+typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
+
+/// \internal set of inner-vectors
+typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
+typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
+
#endif // not EIGEN_PARSED_BY_DOXYGEN
/// \returns an expression of a block in \c *this with either dynamic or fixed sizes.
@@ -1036,7 +1044,7 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
/// \a NRows is \a Dynamic, and the same for the number of columns.
///
/// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
///
/// \note The usage of this overload is discouraged starting from %Eigen 3.4; better use the generic
/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence:
@@ -1053,6 +1061,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
/// \sa block(Index,Index,NRowsType,NColsType), class Block
///
template<int NRows, int NCols>
+EIGEN_DEVICE_FUNC
inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
Index blockRows, Index blockCols)
{
@@ -1354,3 +1363,39 @@ inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n);
}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+InnerVectorReturnType innerVector(Index outer)
+{ return InnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+const ConstInnerVectorReturnType innerVector(Index outer) const
+{ return ConstInnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+InnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize)
+{
+ return Block<Derived,Dynamic,Dynamic,true>(derived(),
+ IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+ IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+const ConstInnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize) const
+{
+ return Block<const Derived,Dynamic,Dynamic,true>(derived(),
+ IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+ IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
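innerVector()/innerVectors() previously lived in SparseMatrixBase (see the removal further up) and are now provided through this plugin. Usage sketch on a column-major sparse matrix, where the inner vectors are the columns:

    #include <Eigen/Sparse>
    #include <iostream>

    int main()
    {
      Eigen::SparseMatrix<double> A(4,4);
      A.insert(0,1) = 1.0;
      A.insert(2,1) = 2.0;
      A.insert(3,3) = 3.0;
      A.makeCompressed();

      std::cout << A.innerVector(1) << "\n";      // column 1
      std::cout << A.innerVectors(1, 2) << "\n";  // columns 1 and 2 as one block
      return 0;
    }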
diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h
index 22c1666c5..5bfb19ac6 100644
--- a/Eigen/src/plugins/IndexedViewMethods.h
+++ b/Eigen/src/plugins/IndexedViewMethods.h
@@ -53,13 +53,6 @@ ivcSize(const Indices& indices) const {
return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,SizeAtCompileTime>(derived().size()),Specialized);
}
-template<typename RowIndices, typename ColIndices>
-struct valid_indexed_view_overload {
- // Here we use is_convertible to Index instead of is_integral in order to treat enums as Index.
- // In c++11 we could use is_integral<T> && is_enum<T> if is_convertible appears to be too permissive.
- enum { value = !(internal::is_convertible<RowIndices,Index>::value && internal::is_convertible<ColIndices,Index>::value) };
-};
-
public:
#endif
@@ -74,7 +67,7 @@ struct EIGEN_INDEXED_VIEW_METHOD_TYPE {
// This is the generic version
template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<valid_indexed_view_overload<RowIndices,ColIndices>::value
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
&& internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsIndexedView,
typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type >::type
operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
@@ -86,7 +79,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND
// The following overload returns a Block<> object
template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<valid_indexed_view_overload<RowIndices,ColIndices>::value
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
&& internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsBlock,
typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType>::type
operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
@@ -104,7 +97,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND
// The following overload returns a Scalar
template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<valid_indexed_view_overload<RowIndices,ColIndices>::value
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
&& internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsScalar,
CoeffReturnType >::type
operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
@@ -114,7 +107,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND
#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
-// The folowing three overloads are needed to handle raw Index[N] arrays.
+// The following three overloads are needed to handle raw Index[N] arrays.
template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndices>
IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
@@ -146,7 +139,7 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col
template<typename Indices>
typename internal::enable_if<
- IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_integral<Indices>::value)),
+ IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type> >::type
operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
{
@@ -157,7 +150,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
template<typename Indices>
typename internal::enable_if<
- (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_integral<Indices>::value)),
+ (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex> >::type
operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
{
@@ -168,7 +161,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
template<typename Indices>
typename internal::enable_if<
- (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_integral<Indices>::value) && (!Symbolic::is_symbolic<Indices>::value),
+ (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_valid_index_type<Indices>::value) && (!symbolic::is_symbolic<Indices>::value),
VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value> >::type
operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
{
@@ -179,7 +172,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
}
template<typename IndexType>
-typename internal::enable_if<Symbolic::is_symbolic<IndexType>::value, CoeffReturnType >::type
+typename internal::enable_if<symbolic::is_symbolic<IndexType>::value, CoeffReturnType >::type
operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST
{
return Base::operator()(internal::eval_expr_given_size(id,size()));
@@ -250,6 +243,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
*
* For 1D vectors and arrays, you should rather use the operator()(const Indices&) overload, which behaves the same way but takes a single parameter.
*
+ * See also this <a href="https://stackoverflow.com/questions/46110917/eigen-replicate-items-along-one-dimension-without-useless-allocations">question</a> and its answer for an example of how to duplicate coefficients.
+ *
* \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index)
*/
template<typename RowIndices, typename ColIndices>
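A sketch of the indexed-view operator() documented above, assuming the Eigen 3.4 development API (Eigen::seq, Eigen::last) available in this branch; indices and sizes are arbitrary:

    #include <Eigen/Dense>
    #include <iostream>
    #include <vector>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(6,6);
      std::vector<int> rows;
      rows.push_back(4); rows.push_back(2); rows.push_back(5);
      std::cout << A(rows, Eigen::seq(1, 3)) << "\n";   // rows {4,2,5}, columns 1..3

      Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5, 0.0, 4.0);
      std::cout << v(Eigen::last) << "\n";              // symbolic index: last coefficient
      return 0;
    }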
diff --git a/Eigen/src/plugins/ReshapedMethods.h b/Eigen/src/plugins/ReshapedMethods.h
index 2bb0b8623..9aeb7f3ee 100644
--- a/Eigen/src/plugins/ReshapedMethods.h
+++ b/Eigen/src/plugins/ReshapedMethods.h
@@ -25,7 +25,7 @@
/// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and
/// that the other size is passed at compile-time using Eigen::fix<N> as above.
///
-/// \sa operator()(placeholders::all), class Reshaped, fix, fix<N>(int)
+/// \sa operator()(all), class Reshaped, fix, fix<N>(int)
///
template<int Order = ColMajor, typename NRowsType, typename NColsType>
EIGEN_DEVICE_FUNC
@@ -50,7 +50,7 @@ reshaped(NRowsType nRows, NColsType nCols) const;
/// \sa reshaped()
EIGEN_DEVICE_FUNC
inline Reshaped<Derived,SizeAtCompileTime,1>
-operator()(placeholders::all);
+operator()(all);
#else