-rw-r--r--  CMakeLists.txt | 28
-rw-r--r--  CTestConfig.cmake | 8
-rw-r--r--  CTestCustom.cmake.in | 1
-rw-r--r--  Eigen/Core | 301
-rw-r--r-- [-rwxr-xr-x]  Eigen/PardisoSupport | 0
-rw-r--r--  Eigen/src/Cholesky/LDLT.h | 9
-rw-r--r--  Eigen/src/Cholesky/LLT.h | 2
-rw-r--r--  Eigen/src/Core/ArithmeticSequence.h | 34
-rw-r--r--  Eigen/src/Core/ArrayWrapper.h | 2
-rw-r--r--  Eigen/src/Core/AssignEvaluator.h | 4
-rwxr-xr-x  Eigen/src/Core/Assign_MKL.h | 6
-rw-r--r--  Eigen/src/Core/CoreEvaluators.h | 15
-rw-r--r--  Eigen/src/Core/DenseBase.h | 7
-rw-r--r--  Eigen/src/Core/DenseStorage.h | 60
-rw-r--r--  Eigen/src/Core/Diagonal.h | 5
-rw-r--r--  Eigen/src/Core/Dot.h | 17
-rw-r--r--  Eigen/src/Core/GeneralProduct.h | 6
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 14
-rw-r--r--  Eigen/src/Core/MathFunctions.h | 160
-rw-r--r--  Eigen/src/Core/MathFunctionsImpl.h | 24
-rw-r--r--  Eigen/src/Core/MatrixBase.h | 8
-rw-r--r--  Eigen/src/Core/NoAlias.h | 4
-rw-r--r--  Eigen/src/Core/NumTraits.h | 6
-rw-r--r--  Eigen/src/Core/PermutationMatrix.h | 6
-rw-r--r--  Eigen/src/Core/PlainObjectBase.h | 31
-rw-r--r--  Eigen/src/Core/Product.h | 12
-rw-r--r--  Eigen/src/Core/ProductEvaluators.h | 108
-rw-r--r--  Eigen/src/Core/Redux.h | 278
-rw-r--r--  Eigen/src/Core/Ref.h | 2
-rw-r--r--  Eigen/src/Core/SelfAdjointView.h | 6
-rw-r--r--  Eigen/src/Core/SelfCwiseBinaryOp.h | 4
-rw-r--r--  Eigen/src/Core/Solve.h | 2
-rw-r--r--  Eigen/src/Core/SolverBase.h | 3
-rw-r--r--  Eigen/src/Core/StableNorm.h | 115
-rw-r--r--  Eigen/src/Core/Transpose.h | 1
-rw-r--r--  Eigen/src/Core/Transpositions.h | 4
-rw-r--r--  Eigen/src/Core/TriangularMatrix.h | 6
-rw-r--r--  Eigen/src/Core/VectorBlock.h | 2
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h | 13
-rw-r--r--  Eigen/src/Core/arch/AVX512/MathFunctions.h | 76
-rw-r--r--  Eigen/src/Core/arch/AVX512/PacketMath.h | 61
-rwxr-xr-x  Eigen/src/Core/arch/AltiVec/PacketMath.h | 52
-rw-r--r--  Eigen/src/Core/arch/GPU/Half.h (renamed from Eigen/src/Core/arch/CUDA/Half.h) | 232
-rw-r--r--  Eigen/src/Core/arch/GPU/MathFunctions.h (renamed from Eigen/src/Core/arch/CUDA/MathFunctions.h) | 8
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMath.h (renamed from Eigen/src/Core/arch/CUDA/PacketMath.h) | 16
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMathHalf.h (renamed from Eigen/src/Core/arch/CUDA/PacketMathHalf.h) | 340
-rw-r--r--  Eigen/src/Core/arch/GPU/TypeCasting.h (renamed from Eigen/src/Core/arch/CUDA/TypeCasting.h) | 18
-rw-r--r--  Eigen/src/Core/arch/HIP/hcc/math_constants.h | 23
-rw-r--r--  Eigen/src/Core/arch/MSA/Complex.h | 759
-rw-r--r--  Eigen/src/Core/arch/MSA/MathFunctions.h | 387
-rw-r--r--  Eigen/src/Core/arch/MSA/PacketMath.h | 1317
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h | 8
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h | 31
-rw-r--r--  Eigen/src/Core/arch/NEON/TypeCasting.h | 48
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h | 4
-rw-r--r--  Eigen/src/Core/arch/SSE/MathFunctions.h | 2
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h | 24
-rw-r--r--  Eigen/src/Core/arch/SSE/TypeCasting.h | 28
-rw-r--r--  Eigen/src/Core/functors/AssignmentFunctors.h | 2
-rw-r--r--  Eigen/src/Core/functors/BinaryFunctors.h | 29
-rw-r--r--  Eigen/src/Core/functors/StlFunctors.h | 4
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 16
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h | 12
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h | 4
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector.h | 10
-rw-r--r--  Eigen/src/Core/products/Parallelizer.h | 22
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixVector.h | 14
-rw-r--r--  Eigen/src/Core/products/SelfadjointRank2Update.h | 3
-rw-r--r--  Eigen/src/Core/products/TriangularSolverVector.h | 23
-rwxr-xr-x  Eigen/src/Core/util/BlasUtil.h | 8
-rw-r--r--  Eigen/src/Core/util/ConfigureVectorization.h | 432
-rw-r--r--  Eigen/src/Core/util/Constants.h | 3
-rwxr-xr-x  Eigen/src/Core/util/DisableStupidWarnings.h | 8
-rw-r--r--  Eigen/src/Core/util/IndexedViewHelper.h | 2
-rw-r--r--  Eigen/src/Core/util/IntegralConstant.h | 2
-rw-r--r--  Eigen/src/Core/util/Macros.h | 482
-rw-r--r--  Eigen/src/Core/util/Memory.h | 133
-rwxr-xr-x  Eigen/src/Core/util/Meta.h | 106
-rw-r--r--  Eigen/src/Core/util/ReenableStupidWarnings.h | 2
-rw-r--r--  Eigen/src/Core/util/StaticAssert.h | 6
-rw-r--r--  Eigen/src/Core/util/SymbolicIndex.h | 11
-rw-r--r--  Eigen/src/Core/util/XprHelper.h | 35
-rw-r--r--  Eigen/src/Eigenvalues/ComplexEigenSolver.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/ComplexSchur.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/EigenSolver.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 5
-rw-r--r--  Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/HessenbergDecomposition.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/RealQZ.h | 2
-rw-r--r--  Eigen/src/Eigenvalues/RealSchur.h | 19
-rw-r--r--  Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 25
-rw-r--r--  Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h | 23
-rw-r--r--  Eigen/src/Eigenvalues/Tridiagonalization.h | 9
-rw-r--r--  Eigen/src/Geometry/Quaternion.h | 23
-rwxr-xr-x  Eigen/src/Geometry/Scaling.h | 2
-rw-r--r--  Eigen/src/Geometry/arch/Geometry_SSE.h | 16
-rw-r--r--  Eigen/src/Householder/BlockHouseholder.h | 11
-rw-r--r--  Eigen/src/Householder/Householder.h | 12
-rw-r--r--  Eigen/src/Householder/HouseholderSequence.h | 129
-rw-r--r--  Eigen/src/IterativeLinearSolvers/IncompleteLUT.h | 4
-rw-r--r--  Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h | 2
-rw-r--r--  Eigen/src/IterativeLinearSolvers/SolveWithGuess.h | 2
-rw-r--r--  Eigen/src/Jacobi/Jacobi.h | 13
-rw-r--r--  Eigen/src/KLUSupport/KLUSupport.h | 2
-rw-r--r--  Eigen/src/LU/Determinant.h | 15
-rw-r--r--  Eigen/src/LU/FullPivLU.h | 2
-rw-r--r--  Eigen/src/LU/InverseImpl.h | 4
-rw-r--r--  Eigen/src/LU/PartialPivLU.h | 4
-rw-r--r--  Eigen/src/OrderingMethods/Eigen_Colamd.h | 6
-rw-r--r--  Eigen/src/PaStiXSupport/PaStiXSupport.h | 2
-rw-r--r--  Eigen/src/PardisoSupport/PardisoSupport.h | 2
-rw-r--r--  Eigen/src/QR/ColPivHouseholderQR.h | 8
-rw-r--r--  Eigen/src/QR/CompleteOrthogonalDecomposition.h | 9
-rw-r--r--  Eigen/src/QR/HouseholderQR.h | 8
-rw-r--r--  Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 2
-rw-r--r--  Eigen/src/SVD/BDCSVD.h | 4
-rw-r--r--  Eigen/src/SVD/UpperBidiagonalization.h | 6
-rw-r--r--  Eigen/src/SparseCholesky/SimplicialCholesky.h | 2
-rw-r--r--  Eigen/src/SparseCholesky/SimplicialCholesky_impl.h | 2
-rw-r--r--  Eigen/src/SparseCore/SparseBlock.h | 40
-rw-r--r--  Eigen/src/SparseCore/SparseDenseProduct.h | 38
-rw-r--r--  Eigen/src/SparseCore/SparseMatrix.h | 6
-rw-r--r--  Eigen/src/SparseCore/SparseMatrixBase.h | 17
-rw-r--r--  Eigen/src/SparseCore/SparseProduct.h | 2
-rw-r--r--  Eigen/src/SparseCore/SparseSelfAdjointView.h | 8
-rw-r--r--  Eigen/src/SparseCore/SparseVector.h | 2
-rw-r--r--  Eigen/src/SparseLU/SparseLU.h | 16
-rw-r--r--  Eigen/src/SparseLU/SparseLU_Memory.h | 2
-rw-r--r--  Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h | 4
-rw-r--r--  Eigen/src/SparseLU/SparseLU_column_dfs.h | 4
-rw-r--r--  Eigen/src/SparseLU/SparseLU_gemm_kernel.h | 2
-rw-r--r--  Eigen/src/SparseLU/SparseLU_panel_bmod.h | 2
-rw-r--r--  Eigen/src/SparseQR/SparseQR.h | 31
-rw-r--r--  Eigen/src/SuperLUSupport/SuperLUSupport.h | 2
-rw-r--r--  Eigen/src/UmfPackSupport/UmfPackSupport.h | 209
-rw-r--r--  Eigen/src/plugins/BlockMethods.h | 47
-rw-r--r--  Eigen/src/plugins/IndexedViewMethods.h | 13
-rw-r--r--  README.md | 4
-rw-r--r--  bench/analyze-blocking-sizes.cpp | 2
-rw-r--r--  bench/bench_gemm.cpp | 2
-rw-r--r--  bench/btl/README | 2
-rw-r--r--  bench/btl/generic_bench/bench.hh | 2
-rw-r--r--  bench/btl/generic_bench/utils/size_log.hh | 2
-rw-r--r--  bench/btl/generic_bench/utils/xy_file.hh | 2
-rw-r--r--  bench/btl/libs/ublas/ublas_interface.hh | 2
-rw-r--r--  bench/eig33.cpp | 2
-rw-r--r--  bench/spbench/spbenchsolver.cpp | 2
-rw-r--r--  blas/f2c/ctbmv.c | 2
-rw-r--r--  blas/f2c/dtbmv.c | 2
-rw-r--r--  blas/f2c/stbmv.c | 2
-rw-r--r--  blas/f2c/ztbmv.c | 2
-rw-r--r--  blas/level1_impl.h | 2
-rw-r--r--  blas/testing/cblat1.f | 2
-rw-r--r--  blas/testing/dblat1.f | 2
-rw-r--r--  blas/testing/sblat1.f | 2
-rw-r--r--  blas/testing/zblat1.f | 2
-rw-r--r--  cmake/EigenConfigureTesting.cmake | 12
-rw-r--r--  cmake/EigenTesting.cmake | 70
-rw-r--r--  cmake/FindComputeCpp.cmake | 2
-rw-r--r--  cmake/FindEigen3.cmake | 3
-rw-r--r--  cmake/language_support.cmake | 2
-rw-r--r--  debug/msvc/eigen_autoexp_part.dat | 2
-rw-r--r--  doc/Doxyfile.in | 2
-rw-r--r--  doc/FunctionsTakingEigenTypes.dox | 6
-rw-r--r--  doc/LeastSquares.dox | 2
-rw-r--r--  doc/Pitfalls.dox | 6
-rw-r--r--  doc/PreprocessorDirectives.dox | 2
-rw-r--r--  doc/QuickReference.dox | 2
-rw-r--r--  doc/QuickStartGuide.dox | 2
-rw-r--r--  doc/SparseQuickReference.dox | 2
-rw-r--r--  doc/TemplateKeyword.dox | 2
-rw-r--r--  doc/TopicLazyEvaluation.dox | 2
-rw-r--r--  doc/TopicLinearAlgebraDecompositions.dox | 16
-rw-r--r--  doc/TopicMultithreading.dox | 2
-rw-r--r--  doc/TutorialLinearAlgebra.dox | 28
-rw-r--r--  doc/TutorialMapClass.dox | 4
-rw-r--r--  doc/TutorialSparse.dox | 4
-rw-r--r--  doc/UnalignedArrayAssert.dox | 4
-rw-r--r--  doc/UsingNVCC.dox | 2
-rw-r--r--  doc/eigen_navtree_hacks.js | 4
-rw-r--r--  doc/eigendoxy.css | 7
-rw-r--r--  doc/eigendoxy_footer.html.in | 4
-rw-r--r--  doc/eigendoxy_header.html.in | 18
-rw-r--r--  doc/examples/Cwise_lgamma.cpp | 2
-rw-r--r--  doc/examples/TutorialLinAlgSVDSolve.cpp | 2
-rw-r--r--  doc/examples/Tutorial_simple_example_dynamic_size.cpp | 2
-rw-r--r--  doc/examples/matrixfree_cg.cpp | 1
-rw-r--r--  doc/snippets/MatrixBase_cwiseEqual.cpp | 2
-rw-r--r--  doc/snippets/MatrixBase_cwiseNotEqual.cpp | 2
-rw-r--r--  doc/special_examples/Tutorial_sparse_example.cpp | 8
-rw-r--r--  lapack/CMakeLists.txt | 6
-rw-r--r--  scripts/eigen_gen_split_test_help.cmake | 11
-rw-r--r--  test/AnnoyingScalar.h | 154
-rw-r--r--  test/CMakeLists.txt | 74
-rw-r--r--  test/adjoint.cpp | 59
-rw-r--r--  test/array.cpp | 9
-rw-r--r--  test/array_for_matrix.cpp | 8
-rw-r--r--  test/array_of_string.cpp | 2
-rw-r--r--  test/array_replicate.cpp | 3
-rw-r--r--  test/array_reverse.cpp | 19
-rw-r--r--  test/bandmatrix.cpp | 2
-rw-r--r--  test/basicstuff.cpp | 17
-rw-r--r--  test/bdcsvd.cpp | 4
-rw-r--r--  test/bicgstab.cpp | 2
-rw-r--r--  test/block.cpp | 14
-rw-r--r--  test/boostmultiprec.cpp | 2
-rw-r--r--  test/cholesky.cpp | 5
-rw-r--r--  test/cholmod_support.cpp | 2
-rw-r--r--  test/commainitializer.cpp | 2
-rw-r--r--  test/conjugate_gradient.cpp | 2
-rw-r--r--  test/conservative_resize.cpp | 34
-rw-r--r--  test/constructor.cpp | 2
-rw-r--r--  test/corners.cpp | 3
-rw-r--r--  test/ctorleak.cpp | 2
-rw-r--r--  test/cuda_common.h | 101
-rw-r--r--  test/denseLM.cpp | 2
-rw-r--r--  test/dense_storage.cpp | 2
-rw-r--r--  test/determinant.cpp | 3
-rw-r--r--  test/diagonal.cpp | 12
-rw-r--r--  test/diagonalmatrices.cpp | 41
-rw-r--r--  test/dontalign.cpp | 3
-rw-r--r--  test/dynalloc.cpp | 2
-rw-r--r--  test/eigen2support.cpp | 3
-rw-r--r--  test/eigensolver_complex.cpp | 5
-rw-r--r--  test/eigensolver_generalized_real.cpp | 10
-rw-r--r--  test/eigensolver_generic.cpp | 57
-rw-r--r--  test/eigensolver_selfadjoint.cpp | 3
-rw-r--r--  test/evaluators.cpp | 2
-rw-r--r--  test/exceptions.cpp | 90
-rw-r--r--  test/fastmath.cpp | 2
-rw-r--r--  test/first_aligned.cpp | 2
-rw-r--r--  test/geo_alignedbox.cpp | 4
-rw-r--r--  test/geo_eulerangles.cpp | 2
-rw-r--r--  test/geo_homogeneous.cpp | 2
-rw-r--r--  test/geo_hyperplane.cpp | 3
-rw-r--r--  test/geo_orthomethods.cpp | 2
-rw-r--r--  test/geo_parametrizedline.cpp | 3
-rw-r--r--  test/geo_quaternion.cpp | 37
-rwxr-xr-x  test/geo_transformations.cpp | 2
-rw-r--r--  test/gpu_basic.cu (renamed from test/cuda_basic.cu) | 86
-rw-r--r--  test/gpu_common.h | 157
-rw-r--r--  test/half_float.cpp | 34
-rw-r--r--  test/hessenberg.cpp | 2
-rw-r--r--  test/householder.cpp | 32
-rw-r--r--  test/incomplete_cholesky.cpp | 20
-rw-r--r--  test/indexed_view.cpp | 38
-rw-r--r--  test/inplace_decomposition.cpp | 2
-rw-r--r--  test/integer_types.cpp | 24
-rw-r--r--  test/inverse.cpp | 82
-rw-r--r--  test/is_same_dense.cpp | 10
-rw-r--r--  test/jacobi.cpp | 3
-rw-r--r--  test/jacobisvd.cpp | 3
-rw-r--r--  test/klu_support.cpp | 2
-rw-r--r--  test/linearstructure.cpp | 32
-rw-r--r--  test/lscg.cpp | 2
-rw-r--r--  test/lu.cpp | 4
-rw-r--r--  test/main.h | 148
-rw-r--r--  test/mapped_matrix.cpp | 12
-rw-r--r--  test/mapstaticmethods.cpp | 4
-rw-r--r--  test/mapstride.cpp | 4
-rw-r--r--  test/meta.cpp | 48
-rw-r--r--  test/metis_support.cpp | 2
-rw-r--r--  test/miscmatrices.cpp | 3
-rw-r--r--  test/mixingtypes.cpp | 60
-rw-r--r--  test/nesting_ops.cpp | 2
-rw-r--r--  test/nomalloc.cpp | 5
-rw-r--r--  test/nullary.cpp | 76
-rw-r--r--  test/num_dimensions.cpp | 90
-rw-r--r--  test/numext.cpp | 2
-rw-r--r--  test/packetmath.cpp | 96
-rw-r--r--  test/pardiso_support.cpp | 2
-rw-r--r--  test/pastix_support.cpp | 2
-rw-r--r--  test/permutationmatrices.cpp | 15
-rw-r--r--  test/prec_inverse_4x4.cpp | 2
-rw-r--r--  test/product.h | 11
-rw-r--r--  test/product_extra.cpp | 3
-rw-r--r--  test/product_large.cpp | 32
-rw-r--r--  test/product_mmtr.cpp | 2
-rw-r--r--  test/product_notemporary.cpp | 13
-rw-r--r--  test/product_selfadjoint.cpp | 3
-rw-r--r--  test/product_small.cpp | 60
-rw-r--r--  test/product_symm.cpp | 3
-rw-r--r--  test/product_syrk.cpp | 3
-rw-r--r--  test/product_trmm.cpp | 2
-rw-r--r--  test/product_trmv.cpp | 3
-rw-r--r--  test/product_trsolve.cpp | 2
-rw-r--r--  test/qr.cpp | 6
-rw-r--r--  test/qr_colpivoting.cpp | 6
-rw-r--r--  test/qr_fullpivoting.cpp | 18
-rw-r--r--  test/qtvector.cpp | 4
-rw-r--r--  test/rand.cpp | 2
-rw-r--r--  test/real_qz.cpp | 3
-rw-r--r--  test/redux.cpp | 4
-rw-r--r--  test/ref.cpp | 18
-rw-r--r--  test/resize.cpp | 2
-rw-r--r--  test/rvalue_types.cpp | 2
-rw-r--r--  test/schur_complex.cpp | 2
-rw-r--r--  test/schur_real.cpp | 4
-rw-r--r--  test/selfadjoint.cpp | 7
-rw-r--r--  test/simplicial_cholesky.cpp | 6
-rw-r--r--  test/sizeof.cpp | 4
-rw-r--r--  test/sizeoverflow.cpp | 2
-rw-r--r--  test/smallvectors.cpp | 2
-rw-r--r--  test/sparseLM.cpp | 2
-rw-r--r--  test/sparse_basic.cpp | 31
-rw-r--r--  test/sparse_block.cpp | 11
-rw-r--r--  test/sparse_permutations.cpp | 2
-rw-r--r--  test/sparse_product.cpp | 8
-rw-r--r--  test/sparse_ref.cpp | 2
-rw-r--r--  test/sparse_solver.h | 2
-rw-r--r--  test/sparse_solvers.cpp | 2
-rw-r--r--  test/sparse_vector.cpp | 2
-rw-r--r--  test/sparselu.cpp | 2
-rw-r--r--  test/sparseqr.cpp | 24
-rw-r--r--  test/special_numbers.cpp | 2
-rw-r--r--  test/split_test_helper.h | 5994
-rw-r--r--  test/spqr_support.cpp | 2
-rw-r--r--  test/stable_norm.cpp | 41
-rw-r--r--  test/stddeque.cpp | 4
-rw-r--r--  test/stddeque_overload.cpp | 6
-rw-r--r--  test/stdlist.cpp | 4
-rw-r--r--  test/stdlist_overload.cpp | 6
-rw-r--r--  test/stdvector.cpp | 6
-rw-r--r--  test/stdvector_overload.cpp | 6
-rw-r--r--  test/superlu_support.cpp | 2
-rw-r--r--  test/svd_common.h | 5
-rw-r--r--  test/svd_fill.h | 1
-rw-r--r--  test/swap.cpp | 6
-rw-r--r--  test/symbolic_index.cpp | 18
-rw-r--r--  test/triangular.cpp | 7
-rw-r--r--  test/umeyama.cpp | 2
-rw-r--r--  test/umfpack_support.cpp | 14
-rw-r--r--  test/unalignedassert.cpp | 2
-rw-r--r--  test/unalignedcount.cpp | 2
-rw-r--r--  test/upperbidiagonalization.cpp | 6
-rw-r--r--  test/vectorization_logic.cpp | 2
-rw-r--r--  test/vectorwiseop.cpp | 4
-rw-r--r--  test/visitor.cpp | 4
-rw-r--r--  test/zerosized.cpp | 2
-rw-r--r--  unsupported/Eigen/CXX11/Tensor | 23
-rw-r--r--  unsupported/Eigen/CXX11/ThreadPool | 13
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/README.md | 20
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 17
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 44
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 44
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 943
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 209
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 227
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 24
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 1395
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h | 1412
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 13
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 497
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 142
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 340
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 21
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 357
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 16
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 22
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 100
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 382
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 23
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h | 88
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h | 40
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 16
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 20
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 774
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 825
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 16
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 162
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h | 4
-rw-r--r--  unsupported/Eigen/NonLinearOptimization | 36
-rw-r--r--  unsupported/Eigen/OpenGLSupport | 2
-rw-r--r--  unsupported/Eigen/SpecialFunctions | 8
-rw-r--r--  unsupported/Eigen/src/BVH/KdBVH.h | 2
-rw-r--r--  unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h | 2
-rw-r--r--  unsupported/Eigen/src/EulerAngles/EulerSystem.h | 2
-rw-r--r--  unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 4
-rw-r--r--  unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h | 2
-rw-r--r--  unsupported/Eigen/src/IterativeSolvers/DGMRES.h | 67
-rw-r--r--  unsupported/Eigen/src/IterativeSolvers/MINRES.h | 15
-rw-r--r--  unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h | 2
-rw-r--r--  unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h | 6
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h | 3
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h | 22
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h | 4
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixPower.h | 7
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h | 16
-rw-r--r--  unsupported/Eigen/src/NonLinearOptimization/qrsolv.h | 2
-rw-r--r--  unsupported/Eigen/src/NonLinearOptimization/r1updt.h | 2
-rw-r--r--  unsupported/Eigen/src/Polynomials/Companion.h | 10
-rw-r--r--  unsupported/Eigen/src/Skyline/SkylineInplaceLU.h | 2
-rw-r--r--  unsupported/Eigen/src/Skyline/SkylineMatrix.h | 16
-rw-r--r--  unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h | 8
-rw-r--r--  unsupported/Eigen/src/SparseExtra/MarketIO.h | 3
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h | 88
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h | 108
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h | 16
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 977
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h | 27
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h (renamed from unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h) | 69
-rw-r--r--  unsupported/Eigen/src/Splines/SplineFitting.h | 2
-rw-r--r--  unsupported/README.txt | 2
-rw-r--r--  unsupported/bench/bench_svd.cpp | 2
-rw-r--r--  unsupported/doc/examples/FFT.cpp | 6
-rw-r--r--  unsupported/test/BVH.cpp | 2
-rw-r--r--  unsupported/test/CMakeLists.txt | 113
-rw-r--r--  unsupported/test/EulerAngles.cpp | 2
-rw-r--r--  unsupported/test/FFTW.cpp | 2
-rw-r--r--  unsupported/test/NonLinearOptimization.cpp | 2
-rw-r--r--  unsupported/test/NumericalDiff.cpp | 2
-rw-r--r--  unsupported/test/alignedvector3.cpp | 2
-rw-r--r--  unsupported/test/autodiff.cpp | 24
-rw-r--r--  unsupported/test/autodiff_scalar.cpp | 5
-rw-r--r--  unsupported/test/cxx11_eventcount.cpp | 2
-rw-r--r--  unsupported/test/cxx11_meta.cpp | 2
-rw-r--r--  unsupported/test/cxx11_non_blocking_thread_pool.cpp | 8
-rw-r--r--  unsupported/test/cxx11_runqueue.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_argmax.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_gpu.cu (renamed from unsupported/test/cxx11_tensor_argmax_cuda.cu) | 90
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_assign.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_block_access.cpp | 995
-rw-r--r--  unsupported/test/cxx11_tensor_broadcast_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_broadcasting.cpp | 121
-rw-r--r--  unsupported/test/cxx11_tensor_builtins_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_cast_float16_gpu.cu (renamed from unsupported/test/cxx11_tensor_cast_float16_cuda.cu) | 10
-rw-r--r--  unsupported/test/cxx11_tensor_casts.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_chipping.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_chipping_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_comparisons.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu (renamed from unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu) | 16
-rw-r--r--  unsupported/test/cxx11_tensor_complex_gpu.cu (renamed from unsupported/test/cxx11_tensor_complex_cuda.cu) | 12
-rw-r--r--  unsupported/test/cxx11_tensor_concatenation.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_concatenation_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_const.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_contract_gpu.cu (renamed from unsupported/test/cxx11_tensor_contract_cuda.cu) | 92
-rw-r--r--  unsupported/test/cxx11_tensor_contract_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_contraction.cpp | 53
-rw-r--r--  unsupported/test/cxx11_tensor_convolution.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_convolution_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_custom_index.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_custom_op.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_custom_op_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_device.cu | 62
-rw-r--r--  unsupported/test/cxx11_tensor_device_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_dimension.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_empty.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp | 87
-rw-r--r--  unsupported/test/cxx11_tensor_expr.cpp | 23
-rw-r--r--  unsupported/test/cxx11_tensor_fft.cpp | 13
-rw-r--r--  unsupported/test/cxx11_tensor_fixed_size.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_forced_eval.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_forced_eval_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_generator.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_generator_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_gpu.cu (renamed from unsupported/test/cxx11_tensor_cuda.cu) | 881
-rw-r--r--  unsupported/test/cxx11_tensor_ifft.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_image_patch.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_image_patch_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_index_list.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_inflation.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_inflation_sycl.cpp | 8
-rw-r--r--  unsupported/test/cxx11_tensor_intdiv.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_io.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_layout_swap.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_layout_swap_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_lvalue.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_map.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_math.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_mixed_indices.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_morphing.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_morphing_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_move.cpp | 81
-rw-r--r--  unsupported/test/cxx11_tensor_notification.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_of_complex.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_of_const_values.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_gpu.cu (renamed from unsupported/test/cxx11_tensor_of_float16_cuda.cu) | 82
-rw-r--r--  unsupported/test/cxx11_tensor_of_strings.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_padding.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_padding_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_patch.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_patch_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_random.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_random_gpu.cu (renamed from unsupported/test/cxx11_tensor_random_cuda.cu) | 31
-rw-r--r--  unsupported/test/cxx11_tensor_reduction.cpp | 43
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_gpu.cu (renamed from unsupported/test/cxx11_tensor_reduction_cuda.cu) | 10
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_ref.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_reverse.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_reverse_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_roundings.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_scan.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_scan_gpu.cu (renamed from unsupported/test/cxx11_tensor_scan_cuda.cu) | 25
-rw-r--r--  unsupported/test/cxx11_tensor_shuffling.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_shuffling_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_simple.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_striding.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_striding_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_sugar.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_sycl.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_symmetry.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_thread_pool.cpp | 58
-rw-r--r--  unsupported/test/cxx11_tensor_trace.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_uint128.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_volume_patch.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_volume_patch_sycl.cpp | 4
-rw-r--r--  unsupported/test/dgmres.cpp | 2
-rw-r--r--  unsupported/test/forward_adolc.cpp | 4
-rw-r--r--  unsupported/test/gmres.cpp | 2
-rw-r--r--  unsupported/test/kronecker_product.cpp | 4
-rw-r--r--  unsupported/test/levenberg_marquardt.cpp | 2
-rw-r--r--  unsupported/test/matrix_exponential.cpp | 2
-rw-r--r--  unsupported/test/matrix_function.cpp | 2
-rw-r--r--  unsupported/test/matrix_power.cpp | 2
-rw-r--r--  unsupported/test/matrix_square_root.cpp | 2
-rw-r--r--  unsupported/test/minres.cpp | 2
-rw-r--r--  unsupported/test/mpreal_support.cpp | 2
-rw-r--r--  unsupported/test/openglsupport.cpp | 2
-rw-r--r--  unsupported/test/polynomialsolver.cpp | 2
-rw-r--r--  unsupported/test/polynomialutils.cpp | 2
-rw-r--r--  unsupported/test/sparse_extra.cpp | 4
-rw-r--r--  unsupported/test/special_functions.cpp | 138
-rw-r--r--  unsupported/test/splines.cpp | 2
560 files changed, 21947 insertions, 6911 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a068739b..dbf0999ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
endif()
+
# Alias Eigen_*_DIR to Eigen3_*_DIR:
set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
@@ -41,10 +42,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
-# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
-# but won't stop CMake.
-execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
-execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+# only run hg if we are in a mercurial clone
+if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
+ # if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
+ # but won't stop CMake.
+ execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
+ execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+endif()
# if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
if(EIGEN_BRANCH_OUTPUT MATCHES "default")
@@ -104,7 +108,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
endif()
-set(CMAKE_INCLUDE_CURRENT_DIR ON)
+set(CMAKE_INCLUDE_CURRENT_DIR OFF)
option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
@@ -153,11 +157,7 @@ if(NOT MSVC)
ei_add_cxx_compiler_flag("-Wdouble-promotion")
# ei_add_cxx_compiler_flag("-Wconversion")
- # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
- # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
- if(NOT CMAKE_COMPILER_IS_GNUCXX)
- ei_add_cxx_compiler_flag("-Wshadow")
- endif()
+ ei_add_cxx_compiler_flag("-Wshadow")
ei_add_cxx_compiler_flag("-Wno-psabi")
ei_add_cxx_compiler_flag("-Wno-variadic-macros")
@@ -257,6 +257,12 @@ if(NOT MSVC)
message(STATUS "Enabling VSX in tests/examples")
endif()
+ option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
+ if(EIGEN_TEST_MSA)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
+ message(STATUS "Enabling MSA in tests/examples")
+ endif()
+
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
if(EIGEN_TEST_NEON)
if(EIGEN_TEST_FMA)
@@ -372,7 +378,7 @@ option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tens
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
-include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
if(EIGEN_INCLUDE_INSTALL_DIR)
diff --git a/CTestConfig.cmake b/CTestConfig.cmake
index 4c0027824..8b4cd798e 100644
--- a/CTestConfig.cmake
+++ b/CTestConfig.cmake
@@ -11,7 +11,7 @@ set(CTEST_DROP_METHOD "http")
set(CTEST_DROP_SITE "manao.inria.fr")
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
set(CTEST_DROP_SITE_CDASH TRUE)
-set(CTEST_PROJECT_SUBPROJECTS
-Official
-Unsupported
-)
+#set(CTEST_PROJECT_SUBPROJECTS
+#Official
+#Unsupported
+#)
diff --git a/CTestCustom.cmake.in b/CTestCustom.cmake.in
index 9fed9d327..89e487f05 100644
--- a/CTestCustom.cmake.in
+++ b/CTestCustom.cmake.in
@@ -1,3 +1,4 @@
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000")
+list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION @EIGEN_CTEST_ERROR_EXCEPTION@)
diff --git a/Eigen/Core b/Eigen/Core
index 5a6dec8cc..864bde551 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -14,82 +14,26 @@
// first thing Eigen does: stop the compiler from committing suicide
#include "src/Core/util/DisableStupidWarnings.h"
-#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
- #define EIGEN_CUDACC __CUDACC__
-#endif
-
-#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
- #define EIGEN_CUDA_ARCH __CUDA_ARCH__
-#endif
-
-// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
-#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_CUDACC_VER __CUDACC_VER__
-#else
-#define EIGEN_CUDACC_VER 0
-#endif
+// then include this file where all our macros are defined. It's really important to do it first because
+// it's where we do all the compiler/OS/arch detections and define most defaults.
+#include "src/Core/util/Macros.h"
-// Handle NVCC/CUDA/SYCL
-#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
- // Do not try asserts on CUDA and SYCL!
- #ifndef EIGEN_NO_DEBUG
- #define EIGEN_NO_DEBUG
- #endif
-
- #ifdef EIGEN_INTERNAL_DEBUGGING
- #undef EIGEN_INTERNAL_DEBUGGING
- #endif
-
- #ifdef EIGEN_EXCEPTIONS
- #undef EIGEN_EXCEPTIONS
- #endif
-
- // All functions callable from CUDA code must be qualified with __device__
- #ifdef EIGEN_CUDACC
- // Do not try to vectorize on CUDA and SYCL!
- #ifndef EIGEN_DONT_VECTORIZE
- #define EIGEN_DONT_VECTORIZE
- #endif
-
- #define EIGEN_DEVICE_FUNC __host__ __device__
- // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
- // works properly on the device side
- #include <cuda_runtime.h>
- #else
- #define EIGEN_DEVICE_FUNC
- #endif
-#else
- #define EIGEN_DEVICE_FUNC
-#endif
+// This detects SSE/AVX/NEON/etc. and configures alignment settings
+#include "src/Core/util/ConfigureVectorization.h"
-#ifdef __NVCC__
-#define EIGEN_DONT_VECTORIZE
+// We need cuda_runtime.h/hip_runtime.h to ensure that
+// the EIGEN_USING_STD_MATH macro works properly on the device side
+#if defined(EIGEN_CUDACC)
+ #include <cuda_runtime.h>
+#elif defined(EIGEN_HIPCC)
+ #include <hip/hip_runtime.h>
#endif
-// When compiling CUDA device code with NVCC, pull in math functions from the
-// global namespace. In host mode, and when device doee with clang, use the
-// std versions.
-#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
- #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
-#else
- #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
-#endif
-
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
- #define EIGEN_EXCEPTIONS
-#endif
#ifdef EIGEN_EXCEPTIONS
#include <new>
#endif
-// then include this file where all our macros are defined. It's really important to do it first because
-// it's where we do all the alignment settings (platform detection and honoring the user's will if he
-// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
-#include "src/Core/util/Macros.h"
-
// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
@@ -102,169 +46,9 @@
// and inclusion of their respective header files
#include "src/Core/util/MKL_support.h"
-// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
-// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
-#if EIGEN_MAX_ALIGN_BYTES==0
- #ifndef EIGEN_DONT_VECTORIZE
- #define EIGEN_DONT_VECTORIZE
- #endif
-#endif
-
-#if EIGEN_COMP_MSVC
- #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
- #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
- // Remember that usage of defined() in a #define is undefined by the standard.
- // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
- #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
- #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
- #endif
- #endif
-#else
- // Remember that usage of defined() in a #define is undefined by the standard
- #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
- #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
- #endif
-#endif
-#ifndef EIGEN_DONT_VECTORIZE
-
- #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
-
- // Defines symbols for compile-time detection of which instructions are
- // used.
- // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_SSE
- #define EIGEN_VECTORIZE_SSE2
-
- // Detect sse3/ssse3/sse4:
- // gcc and icc defines __SSE3__, ...
- // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
- // want to force the use of those instructions with msvc.
- #ifdef __SSE3__
- #define EIGEN_VECTORIZE_SSE3
- #endif
- #ifdef __SSSE3__
- #define EIGEN_VECTORIZE_SSSE3
- #endif
- #ifdef __SSE4_1__
- #define EIGEN_VECTORIZE_SSE4_1
- #endif
- #ifdef __SSE4_2__
- #define EIGEN_VECTORIZE_SSE4_2
- #endif
- #ifdef __AVX__
- #define EIGEN_VECTORIZE_AVX
- #define EIGEN_VECTORIZE_SSE3
- #define EIGEN_VECTORIZE_SSSE3
- #define EIGEN_VECTORIZE_SSE4_1
- #define EIGEN_VECTORIZE_SSE4_2
- #endif
- #ifdef __AVX2__
- #define EIGEN_VECTORIZE_AVX2
- #define EIGEN_VECTORIZE_AVX
- #define EIGEN_VECTORIZE_SSE3
- #define EIGEN_VECTORIZE_SSSE3
- #define EIGEN_VECTORIZE_SSE4_1
- #define EIGEN_VECTORIZE_SSE4_2
- #endif
- #ifdef __FMA__
- #define EIGEN_VECTORIZE_FMA
- #endif
- #if defined(__AVX512F__)
- #define EIGEN_VECTORIZE_AVX512
- #define EIGEN_VECTORIZE_AVX2
- #define EIGEN_VECTORIZE_AVX
- #define EIGEN_VECTORIZE_FMA
- #define EIGEN_VECTORIZE_SSE3
- #define EIGEN_VECTORIZE_SSSE3
- #define EIGEN_VECTORIZE_SSE4_1
- #define EIGEN_VECTORIZE_SSE4_2
- #ifdef __AVX512DQ__
- #define EIGEN_VECTORIZE_AVX512DQ
- #endif
- #endif
-
- // include files
-
- // This extern "C" works around a MINGW-w64 compilation issue
- // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
- // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
- // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
- // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
- // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
- // notice that since these are C headers, the extern "C" is theoretically needed anyways.
- extern "C" {
- // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
- // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
- #if EIGEN_COMP_ICC >= 1110
- #include <immintrin.h>
- #else
- #include <mmintrin.h>
- #include <emmintrin.h>
- #include <xmmintrin.h>
- #ifdef EIGEN_VECTORIZE_SSE3
- #include <pmmintrin.h>
- #endif
- #ifdef EIGEN_VECTORIZE_SSSE3
- #include <tmmintrin.h>
- #endif
- #ifdef EIGEN_VECTORIZE_SSE4_1
- #include <smmintrin.h>
- #endif
- #ifdef EIGEN_VECTORIZE_SSE4_2
- #include <nmmintrin.h>
- #endif
- #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
- #include <immintrin.h>
- #endif
- #endif
- } // end extern "C"
- #elif defined __VSX__
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_VSX
- #include <altivec.h>
- // We need to #undef all these ugly tokens defined in <altivec.h>
- // => use __vector instead of vector
- #undef bool
- #undef vector
- #undef pixel
- #elif defined __ALTIVEC__
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_ALTIVEC
- #include <altivec.h>
- // We need to #undef all these ugly tokens defined in <altivec.h>
- // => use __vector instead of vector
- #undef bool
- #undef vector
- #undef pixel
- #elif (defined __ARM_NEON) || (defined __ARM_NEON__)
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_NEON
- #include <arm_neon.h>
- #elif (defined __s390x__ && defined __VEC__)
- #define EIGEN_VECTORIZE
- #define EIGEN_VECTORIZE_ZVECTOR
- #include <vecintrin.h>
- #endif
-#endif
-
-#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
- // We can use the optimized fp16 to float and float to fp16 conversion routines
- #define EIGEN_HAS_FP16_C
-#endif
-
-#if defined EIGEN_CUDACC
- #define EIGEN_VECTORIZE_CUDA
- #include <vector_types.h>
- #if EIGEN_CUDACC_VER >= 70500
- #define EIGEN_HAS_CUDA_FP16
- #endif
-#endif
-
-#if defined EIGEN_HAS_CUDA_FP16
- #include <host_defines.h>
- #include <cuda_fp16.h>
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+ #define EIGEN_HAS_GPU_FP16
#endif
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@@ -296,6 +80,10 @@
// for min/max:
#include <algorithm>
+#if EIGEN_HAS_CXX11
+#include <array>
+#endif
+
// for std::is_nothrow_move_assignable
#ifdef EIGEN_INCLUDE_TYPE_TRAITS
#include <type_traits>
@@ -320,38 +108,6 @@
#include <SYCL/sycl.hpp>
#endif
-/** \brief Namespace containing all symbols from the %Eigen library. */
-namespace Eigen {
-
-inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_AVX512)
- return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_AVX)
- return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_2)
- return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_1)
- return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
-#elif defined(EIGEN_VECTORIZE_SSSE3)
- return "SSE, SSE2, SSE3, SSSE3";
-#elif defined(EIGEN_VECTORIZE_SSE3)
- return "SSE, SSE2, SSE3";
-#elif defined(EIGEN_VECTORIZE_SSE2)
- return "SSE, SSE2";
-#elif defined(EIGEN_VECTORIZE_ALTIVEC)
- return "AltiVec";
-#elif defined(EIGEN_VECTORIZE_VSX)
- return "VSX";
-#elif defined(EIGEN_VECTORIZE_NEON)
- return "ARM NEON";
-#elif defined(EIGEN_VECTORIZE_ZVECTOR)
- return "S390X ZVECTOR";
-#else
- return "None";
-#endif
-}
-
-} // end namespace Eigen
#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
// This will generate an error message:
@@ -360,7 +116,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
namespace Eigen {
-// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
// ensure QNX/QCC support
using std::size_t;
// gcc 4.6.0 wants std:: for ptrdiff_t
@@ -387,7 +143,6 @@ using std::ptrdiff_t;
#include "src/Core/util/IntegralConstant.h"
#include "src/Core/util/SymbolicIndex.h"
-
#include "src/Core/NumTraits.h"
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
@@ -410,6 +165,7 @@ using std::ptrdiff_t;
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX/TypeCasting.h"
+ #include "src/Core/arch/SSE/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
@@ -423,20 +179,25 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/PacketMath.h"
#include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h"
+ #include "src/Core/arch/NEON/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_ZVECTOR
#include "src/Core/arch/ZVector/PacketMath.h"
#include "src/Core/arch/ZVector/MathFunctions.h"
#include "src/Core/arch/ZVector/Complex.h"
+#elif defined EIGEN_VECTORIZE_MSA
+ #include "src/Core/arch/MSA/PacketMath.h"
+ #include "src/Core/arch/MSA/MathFunctions.h"
+ #include "src/Core/arch/MSA/Complex.h"
#endif
// Half float support
-#include "src/Core/arch/CUDA/Half.h"
-#include "src/Core/arch/CUDA/PacketMathHalf.h"
-#include "src/Core/arch/CUDA/TypeCasting.h"
+#include "src/Core/arch/GPU/Half.h"
+#include "src/Core/arch/GPU/PacketMathHalf.h"
+#include "src/Core/arch/GPU/TypeCasting.h"
-#if defined EIGEN_VECTORIZE_CUDA
- #include "src/Core/arch/CUDA/PacketMath.h"
- #include "src/Core/arch/CUDA/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_GPU
+ #include "src/Core/arch/GPU/PacketMath.h"
+ #include "src/Core/arch/GPU/MathFunctions.h"
#endif
#include "src/Core/arch/Default/Settings.h"
@@ -450,7 +211,9 @@ using std::ptrdiff_t;
// Specialized functors to enable the processing of complex numbers
// on CUDA devices
+#ifdef EIGEN_CUDACC
#include "src/Core/arch/CUDA/Complex.h"
+#endif
#include "src/Core/util/IndexedViewHelper.h"
#include "src/Core/ArithmeticSequence.h"
diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
index 340edf51f..340edf51f 100755..100644
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 13a8f6d14..2dfeac333 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -247,7 +247,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the factorization failed because of a zero pivot.
*/
ComputationInfo info() const
@@ -569,13 +569,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
// more precisely, use pseudo-inverse of D (see bug 241)
using std::abs;
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
- // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
- // as motivated by LAPACK's xGELSS:
+ // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+ // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
// RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
// However, LDLT is not rank revealing, and so adjusting the tolerance wrt the highest
// diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
- RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+ // Using numeric_limits::min() gives us more robustness to denormals.
+ RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
for (Index i = 0; i < vecD.size(); ++i)
{
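The switch from RealScalar(1)/highest() to numeric_limits::min() matters because, for IEEE double, 1/highest is approximately 5.56e-309, itself a subnormal number, while numeric_limits<double>::min() is approximately 2.23e-308, the smallest normal value. A standalone sketch (not part of the patch) printing the two candidate tolerances:

    #include <iostream>
    #include <limits>

    int main() {
      const double inv_highest = 1.0 / std::numeric_limits<double>::max();
      const double smallest_normal = (std::numeric_limits<double>::min)();
      std::cout << inv_highest << "\n";       // ~5.56e-309, a subnormal value
      std::cout << smallest_normal << "\n";   // ~2.23e-308, smallest normal value
      // With the new tolerance, diagonal entries whose magnitude falls below
      // the smallest normal value are treated as zero pivots (pseudo-inverse
      // of D, see the comment in the hunk above).
      return 0;
    }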
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 814174d47..22e4be75d 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -180,7 +180,7 @@ template<typename _MatrixType, int _UpLo> class LLT
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears not to be positive definite.
*/
ComputationInfo info() const
diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h
index ada1571f1..2083cf178 100644
--- a/Eigen/src/Core/ArithmeticSequence.h
+++ b/Eigen/src/Core/ArithmeticSequence.h
@@ -225,7 +225,8 @@ auto seq(FirstType f, LastType l, IncrType incr)
-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
CleanedIncrType(incr));
}
-#else
+
+#else // EIGEN_HAS_CXX11
template<typename FirstType,typename LastType>
typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
@@ -314,10 +315,39 @@ seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<Last
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
}
-#endif
+#endif // EIGEN_HAS_CXX11
#endif // EIGEN_PARSED_BY_DOXYGEN
+
+#if EIGEN_HAS_CXX11
+/** \cpp11
+ * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
+ *
+ * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
+ *
+ * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename SizeType,typename IncrType>
+auto lastN(SizeType size, IncrType incr)
+-> decltype(seqN(Eigen::placeholders::last-(size-fix<1>())*incr, size, incr))
+{
+ return seqN(Eigen::placeholders::last-(size-fix<1>())*incr, size, incr);
+}
+
+/** \cpp11
+ * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
+ *
+ * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
+ *
+ * \sa lastN(SizeType,IncrType), seqN(FirstType,SizeType), seq(FirstType,LastType) */
+template<typename SizeType>
+auto lastN(SizeType size)
+-> decltype(seqN(Eigen::placeholders::last+fix<1>()-size, size))
+{
+ return seqN(Eigen::placeholders::last+fix<1>()-size, size);
+}
+#endif
+
namespace internal {
// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
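A short usage sketch for the lastN helpers added above (assuming the C++11 indexed-view API of this development branch; illustrative only):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXd v(10);
      v.setLinSpaced(10, 0.0, 9.0);                    // 0 1 2 ... 9
      std::cout << v(Eigen::lastN(3)).transpose()      // last 3 coefficients: 7 8 9
                << "\n"
                << v(Eigen::lastN(3, 2)).transpose()   // every 2nd from the end: 5 7 9
                << "\n";
      return 0;
    }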
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 688aadd62..757b31825 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline void evalTo(Dest& dst) const { dst = m_expression; }
- const typename internal::remove_all<NestedExpressionType>::type&
EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<NestedExpressionType>::type&
nestedExpression() const
{
return m_expression;
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index ebf5590de..362d905d2 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
// AssignmentKind must define a Kind typedef.
template<typename DstShape, typename SrcShape> struct AssignmentKind;
-// Assignement kind defined in this file:
+// Assignment kind defined in this file:
struct Dense2Dense {};
struct EigenBase2EigenBase {};
@@ -899,7 +899,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
src.evalTo(dst);
}
- // NOTE The following two functions are templated to avoid their instanciation if not needed
+ // NOTE The following two functions are templated to avoid their instantiation if not needed
// This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
template<typename SrcScalarType>
EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 6c2ab9264..6866095bf 100755
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -84,7 +84,8 @@ class vml_assign_traits
struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
- static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
+ static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
+ resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
@@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \
const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \
- static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
+ static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
+ resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \
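Both Assign_MKL hunks above make the VML fast paths call resize_if_allowed before asserting that the destination matches the source size, as the generic assignment path already does; without it, assigning a VML-accelerated expression to a not-yet-sized destination trips the assertion. A hypothetical sketch of the pattern (assign_with_resize is an illustrative name, not an Eigen internal):

    #include <Eigen/Dense>
    #include <cassert>

    // Resize a resizable destination to the source's size *before* the size
    // check, then run the assignment kernel.
    template <typename Dst, typename Src>
    void assign_with_resize(Dst& dst, const Src& src) {
      dst.resize(src.rows(), src.cols());  // no-op when already correctly sized
      assert(dst.rows() == src.rows() && dst.cols() == src.cols());
      dst = src;                           // a VML kernel would run here instead
    }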
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 8419798c1..264446f65 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -1080,14 +1080,15 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
: m_argImpl(block.nestedExpression()),
m_startRow(block.startRow()),
m_startCol(block.startCol()),
- m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
+ m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
{ }
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
enum {
- RowsAtCompileTime = XprType::RowsAtCompileTime
+ RowsAtCompileTime = XprType::RowsAtCompileTime,
+ ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1099,7 +1100,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeff(Index index) const
{
- if (InnerPanel)
+ if (ForwardLinearAccess)
return m_argImpl.coeff(m_linear_offset.value() + index);
else
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
@@ -1114,7 +1115,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Scalar& coeffRef(Index index)
{
- if (InnerPanel)
+ if (ForwardLinearAccess)
return m_argImpl.coeffRef(m_linear_offset.value() + index);
else
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
@@ -1131,7 +1132,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
- if (InnerPanel)
+ if (ForwardLinearAccess)
return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
else
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
@@ -1149,7 +1150,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
- if (InnerPanel)
+ if (ForwardLinearAccess)
return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
else
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
@@ -1161,7 +1162,7 @@ protected:
evaluator<ArgType> m_argImpl;
const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
- const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
+ const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
};
// TODO: This evaluator does not actually use the child evaluator;
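The block evaluator above used to forward linear indexing to the nested expression only for InnerPanel blocks; the new ForwardLinearAccess condition also covers blocks whose storage order matches the nested expression and whose evaluator advertises LinearAccessBit, using the offset of the block's first coefficient within the parent. A standalone sketch of that offset computation for the column-major case (block_linear_offset is an illustrative name mirroring the m_linear_offset initializer above):

    #include <cassert>

    // Element (i,j) of a column-major parent with `parent_rows` rows lives at
    // linear index j*parent_rows + i, so a block starting at (start_row,
    // start_col) begins at:
    long block_linear_offset(long parent_rows, long start_row, long start_col) {
      return start_col * parent_rows + start_row;
    }

    int main() {
      // block starting at (1,2) of a 5x4 column-major matrix:
      assert(block_linear_offset(5, 1, 2) == 11);
      return 0;
    }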
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index fd933eed4..8b994f35c 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -157,6 +157,11 @@ template<typename Derived> class DenseBase
* we are dealing with a column-vector (if there is only one column) or with
* a row-vector (if there is only one row). */
+ NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+ /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+ * and 2 for matrices.
+ */
+
Flags = internal::traits<Derived>::Flags,
/**< This stores expression \ref flags flags which may or may not be inherited by new expressions
* constructed from this one. See the \ref flags "list of flags".
@@ -395,7 +400,7 @@ template<typename Derived> class DenseBase
* Notice that in the case of a plain matrix or vector (not an expression) this function just returns
* a const reference, in order to avoid a useless copy.
*
- * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
+ * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE EvalReturnType eval() const
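A quick compile-time illustration of the new NumDimensions member (a sketch that assumes this patch is applied; it is not part of the diff):

#include <Eigen/Dense>
static_assert(Eigen::Matrix<float, 1, 1>::NumDimensions == 0, "1x1 behaves like a scalar");
static_assert(Eigen::Vector3f::NumDimensions == 1, "compile-time vector");
static_assert(Eigen::MatrixXf::NumDimensions == 2, "general matrix");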
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 7958feeb9..3c02a1025 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -61,7 +61,7 @@ struct plain_array
#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
#elif EIGEN_GNUC_AT_LEAST(4,7)
- // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
+  // GCC 4.7 is too aggressive in its optimizations and removes the alignment test based on the fact that the array is declared to be aligned.
// See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
// Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
template<typename PtrType>
@@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
}
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+ numext::swap(m_data, other.m_data);
+ }
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
- { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+ {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_rows,other.m_rows);
+ numext::swap(m_cols,other.m_cols);
+ }
EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+ {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_rows,other.m_rows);
+ }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_cols,other.m_cols);
+ }
EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
- void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
- void resize(Index, Index, Index cols) { m_cols = cols; }
+ EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+ EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
};
@@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
- using std::swap;
- swap(m_data, other.m_data);
- swap(m_rows, other.m_rows);
- swap(m_cols, other.m_cols);
+ numext::swap(m_data, other.m_data);
+ numext::swap(m_rows, other.m_rows);
+ numext::swap(m_cols, other.m_cols);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
- { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+ {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_rows,other.m_rows);
+ numext::swap(m_cols,other.m_cols);
+ }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
void conservativeResize(Index size, Index rows, Index cols)
@@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
- using std::swap;
- swap(m_data, other.m_data);
- swap(m_cols, other.m_cols);
+ numext::swap(m_data, other.m_data);
+ numext::swap(m_cols, other.m_cols);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_cols,other.m_cols);
+ }
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
- using std::swap;
- swap(m_data, other.m_data);
- swap(m_rows, other.m_rows);
+ numext::swap(m_data, other.m_data);
+ numext::swap(m_rows, other.m_rows);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
- EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+ EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
+ numext::swap(m_data,other.m_data);
+ numext::swap(m_rows,other.m_rows);
+ }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
void conservativeResize(Index size, Index rows, Index)
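The switch from std::swap to numext::swap throughout DenseStorage matters because std::swap is not callable from CUDA/HIP device code. A minimal sketch of the underlying idea (device_swap is a hypothetical name, not Eigen's actual implementation):

#include <Eigen/Core>  // for EIGEN_DEVICE_FUNC
template <typename T>
EIGEN_DEVICE_FUNC inline void device_swap(T& a, T& b) {
  T tmp(a);  // plain three-assignment swap, usable in host and device code
  a = b;
  b = tmp;
}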
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index c62f5ff21..563135fb2 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
EIGEN_DEVICE_FUNC
- explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+ explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
+ {
+ eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
+ }
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
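With the new constructor assertion, an out-of-range diagonal index now fails loudly in debug builds. A small usage sketch (standalone, not part of the patch):

#include <Eigen/Dense>
int main() {
  Eigen::Matrix3f m = Eigen::Matrix3f::Identity();
  auto d = m.diagonal(2);  // fine: 2 <= m.cols()
  // m.diagonal(5);        // would trigger eigen_assert: 5 > m.cols()
  return d.size() == 1 ? 0 : 1;
}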
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index bb8e3fecc..11da432b2 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -31,7 +31,8 @@ struct dot_nocheck
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
- static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+ EIGEN_STRONG_INLINE
+ static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.template binaryExpr<conj_prod>(b).sum();
}
@@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
- static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+ EIGEN_STRONG_INLINE
+ static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.transpose().template binaryExpr<conj_prod>(b).sum();
}
@@ -65,6 +67,7 @@ struct dot_nocheck<T, U, true>
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE
typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
{
@@ -102,7 +105,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::trai
* \sa lpNorm(), dot(), squaredNorm()
*/
template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
{
return numext::sqrt(squaredNorm());
}
@@ -117,7 +120,7 @@ EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::
* \sa norm(), normalize()
*/
template<typename Derived>
-EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::normalized() const
{
typedef typename internal::nested_eval<Derived,2>::type _Nested;
@@ -139,7 +142,7 @@ MatrixBase<Derived>::normalized() const
* \sa norm(), normalized()
*/
template<typename Derived>
-EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
{
RealScalar z = squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -160,7 +163,7 @@ EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::normalize()
* \sa stableNorm(), stableNormalize(), normalized()
*/
template<typename Derived>
-EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::stableNormalized() const
{
typedef typename internal::nested_eval<Derived,3>::type _Nested;
@@ -185,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
* \sa stableNorm(), stableNormalized(), normalize()
*/
template<typename Derived>
-EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::stableNormalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
{
RealScalar w = cwiseAbs().maxCoeff();
RealScalar z = (derived()/w).squaredNorm();
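For reference, the pattern used by stableNormalize above, written as a standalone sketch (stable_normalize_ref is a hypothetical helper): pre-dividing by the largest magnitude keeps squaredNorm() away from overflow and underflow.

#include <Eigen/Dense>
#include <cmath>
void stable_normalize_ref(Eigen::VectorXd& v) {
  double w = v.cwiseAbs().maxCoeff();
  if (w == 0.0) return;              // null vector: nothing to normalize
  double z = (v / w).squaredNorm();  // safe: all scaled entries lie in [-1, 1]
  v /= w * std::sqrt(z);             // w * sqrt(z) is the stable norm
}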
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 694f7cbde..43f3b84c8 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -35,7 +35,7 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
template<int Size, int MaxSize> struct product_size_category
{
enum {
- #ifndef EIGEN_CUDA_ARCH
+ #ifndef EIGEN_GPU_COMPILE_PHASE
is_large = MaxSize == Dynamic ||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
@@ -163,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
template<typename Scalar,int Size,int MaxSize>
struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
{
- EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
};
template<typename Scalar,int Size>
struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
{
- EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
};
template<typename Scalar,int Size,int MaxSize>
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 30878eda6..b67c41d8a 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -82,7 +82,11 @@ struct default_packet_traits
HasPolygamma = 0,
HasErf = 0,
HasErfc = 0,
+ HasI0e = 0,
+ HasI1e = 0,
HasIGamma = 0,
+ HasIGammaDerA = 0,
+ HasGammaSampleDerAlpha = 0,
HasIGammac = 0,
HasBetaInc = 0,
@@ -299,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
{
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+ // do nothing
+#elif defined(EIGEN_CUDA_ARCH)
#if defined(__LP64__)
// 64-bit pointer operand constraint for inlined asm
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@@ -324,13 +330,13 @@ preduxp(const Packet* vecs) { return vecs[0]; }
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
{ return a; }
-/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+/** \internal \returns the sum of the elements of the upper and lower halves of \a a if the size of \a a is larger than 4.
 * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}.
 * For packet sizes smaller than or equal to 4, this boils down to a no-op.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
-predux_downto4(const Packet& a)
+predux_half_dowto4(const Packet& a)
{ return a; }
/** \internal \returns the product of the elements of \a a*/
@@ -526,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
***************************************************************************/
// Eigen+CUDA does not support complexes.
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_GPUCC)
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
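A scalar reference for what the renamed predux_half_dowto4 computes on an 8-wide packet (sketch only; real implementations use SIMD intrinsics):

#include <array>
std::array<float, 4> predux_half_dowto4_ref(const std::array<float, 8>& a) {
  // {a0,...,a7} -> {a0+a4, a1+a5, a2+a6, a3+a7}; a no-op for sizes <= 4
  return { a[0] + a[4], a[1] + a[5], a[2] + a[6], a[3] + a[7] };
}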
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 5ba5293a0..f16476a92 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T>
struct real_impl<std::complex<T> >
{
@@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T>
struct imag_impl<std::complex<T> >
{
@@ -238,7 +238,7 @@ struct imag_ref_retval
****************************************************************************/
template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct conj_impl
+struct conj_default_impl
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
@@ -248,7 +248,7 @@ struct conj_impl
};
template<typename Scalar>
-struct conj_impl<Scalar,true>
+struct conj_default_impl<Scalar,true>
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
@@ -258,6 +258,20 @@ struct conj_impl<Scalar,true>
}
};
+template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct conj_impl<std::complex<T> >
+{
+ EIGEN_DEVICE_FUNC
+ static inline std::complex<T> run(const std::complex<T>& x)
+ {
+ return std::complex<T>(x.real(), -x.imag());
+ }
+};
+#endif
+
template<typename Scalar>
struct conj_retval
{
@@ -347,31 +361,7 @@ struct norm1_retval
* Implementation of hypot *
****************************************************************************/
-template<typename Scalar>
-struct hypot_impl
-{
- typedef typename NumTraits<Scalar>::Real RealScalar;
- static inline RealScalar run(const Scalar& x, const Scalar& y)
- {
- EIGEN_USING_STD_MATH(abs);
- EIGEN_USING_STD_MATH(sqrt);
- RealScalar _x = abs(x);
- RealScalar _y = abs(y);
- Scalar p, qp;
- if(_x>_y)
- {
- p = _x;
- qp = _y / p;
- }
- else
- {
- p = _y;
- qp = _x / p;
- }
- if(p==RealScalar(0)) return RealScalar(0);
- return p * sqrt(RealScalar(1) + qp*qp);
- }
-};
+template<typename Scalar> struct hypot_impl;
template<typename Scalar>
struct hypot_retval
@@ -445,7 +435,12 @@ struct round_retval
struct arg_impl {
static inline Scalar run(const Scalar& x)
{
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  // HIP does not seem to have a native device-side implementation for the math routine "arg"
+ using std::arg;
+ #else
EIGEN_USING_STD_MATH(arg);
+ #endif
return arg(x);
}
};
@@ -497,11 +492,11 @@ namespace std_fallback {
EIGEN_USING_STD_MATH(exp);
Scalar u = exp(x);
- if (u == Scalar(1)) {
+ if (numext::equal_strict(u, Scalar(1))) {
return x;
}
Scalar um1 = u - RealScalar(1);
- if (um1 == Scalar(-1)) {
+ if (numext::equal_strict(um1, Scalar(-1))) {
return RealScalar(-1);
}
@@ -543,7 +538,7 @@ namespace std_fallback {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x;
- return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
+ return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
}
}
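The log1p fallback above relies on a classical compensation identity: with x1p = 1 + x rounded, x * log(x1p) / (x1p - 1) cancels most of the rounding error committed when forming x1p, and equal_strict makes the exact floating-point comparison explicit. A plain-double sketch of the same trick:

#include <cmath>
double log1p_fallback(double x) {
  double x1p = 1.0 + x;
  return (x1p == 1.0) ? x : x * (std::log(x1p) / (x1p - 1.0));
}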
@@ -749,7 +744,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
}
-// Implementatin of is* functions
+// Implementation of is* functions
// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
@@ -778,7 +773,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isfinite_impl(const T& x)
{
- #ifdef EIGEN_CUDA_ARCH
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
@@ -793,7 +788,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isinf_impl(const T& x)
{
- #ifdef EIGEN_CUDA_ARCH
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
@@ -808,7 +803,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isnan_impl(const T& x)
{
- #ifdef EIGEN_CUDA_ARCH
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
@@ -874,7 +869,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
namespace numext {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -891,19 +886,16 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
return max EIGEN_NOT_A_MACRO (x,y);
}
-
#elif defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
{
-
return y < x ? y : x;
}
template<typename T>
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
{
-
return x < y ? y : x;
}
@@ -947,7 +939,6 @@ EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned lo
return cl::sycl::max(x,y);
}
-
EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{
return cl::sycl::fmin(x,y);
@@ -981,6 +972,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{
return fminf(x, y);
}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
+ return fmin(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
+{
+#if defined(EIGEN_HIPCC)
+ // no "fminl" on HIP yet
+ return (x < y) ? x : y;
+#else
+ return fminl(x, y);
+#endif
+}
+
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -993,6 +1002,23 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
{
return fmaxf(x, y);
}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
+ return fmax(x, y);
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
+{
+#if defined(EIGEN_HIPCC)
+ // no "fmaxl" on HIP yet
+ return (x > y) ? x : y;
+#else
+ return fmaxl(x, y);
+#endif
+}
#endif
@@ -1088,7 +1114,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log1p(const float &x) { return ::log1pf(x); }
@@ -1146,7 +1172,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float floor(const float &x) { return ::floorf(x); }
@@ -1167,7 +1193,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float ceil(const float &x) { return ::ceilf(x); }
@@ -1225,7 +1251,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log(const float &x) { return ::logf(x); }
@@ -1253,7 +1279,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }
@@ -1283,12 +1309,28 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float exp(const float &x) { return ::expf(x); }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double exp(const double &x) { return ::exp(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<float> exp(const std::complex<float>& x) {
+ float com = ::expf(x.real());
+ float res_real = com * ::cosf(x.imag());
+ float res_imag = com * ::sinf(x.imag());
+ return std::complex<float>(res_real, res_imag);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<double> exp(const std::complex<double>& x) {
+ double com = ::exp(x.real());
+ double res_real = com * ::cos(x.imag());
+ double res_imag = com * ::sin(x.imag());
+ return std::complex<double>(res_real, res_imag);
+}
#endif
template<typename Scalar>
@@ -1303,7 +1345,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float expm1(const float &x) { return ::expm1f(x); }
@@ -1323,7 +1365,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cos(const float &x) { return ::cosf(x); }
@@ -1343,7 +1385,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sin(const float &x) { return ::sinf(x); }
@@ -1363,7 +1405,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tan(const float &x) { return ::tanf(x); }
@@ -1394,7 +1436,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float acos(const float &x) { return ::acosf(x); }
@@ -1425,7 +1467,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float asin(const float &x) { return ::asinf(x); }
@@ -1456,7 +1498,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float atan(const float &x) { return ::atanf(x); }
@@ -1477,7 +1519,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cosh(const float &x) { return ::coshf(x); }
@@ -1497,7 +1539,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sinh(const float &x) { return ::sinhf(x); }
@@ -1515,12 +1557,12 @@ T tanh(const T &x) {
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
-#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
+#elif (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
#endif
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(const float &x) { return ::tanhf(x); }
@@ -1540,7 +1582,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
#endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float fmod(const float& a, const float& b) {
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index ae1386b4c..a23e93ccb 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -66,6 +66,30 @@ T generic_fast_tanh_float(const T& a_x)
return pdiv(p, q);
}
+template<typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
+{
+ EIGEN_USING_STD_MATH(sqrt);
+ RealScalar p, qp;
+ p = numext::maxi(x,y);
+ if(p==RealScalar(0)) return RealScalar(0);
+ qp = numext::mini(y,x) / p;
+ return p * sqrt(RealScalar(1) + qp*qp);
+}
+
+template<typename Scalar>
+struct hypot_impl
+{
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ static EIGEN_DEVICE_FUNC
+ inline RealScalar run(const Scalar& x, const Scalar& y)
+ {
+ EIGEN_USING_STD_MATH(abs);
+ return positive_real_hypot<RealScalar>(abs(x), abs(y));
+ }
+};
+
} // end namespace internal
} // end namespace Eigen
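For reference, the overflow-safe evaluation now centralized in positive_real_hypot, as a standalone sketch (hypot_ref is a hypothetical name): factoring out p = max(|x|, |y|) keeps the squared ratio q*q within [0, 1].

#include <cmath>
double hypot_ref(double x, double y) {
  double ax = std::fabs(x), ay = std::fabs(y);
  double p = std::fmax(ax, ay);
  if (p == 0.0) return 0.0;
  double q = std::fmin(ax, ay) / p;
  return p * std::sqrt(1.0 + q * q);  // == sqrt(x*x + y*y), without overflow
}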
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 11435903b..6046c8bae 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -328,6 +328,7 @@ template<typename Derived> class MatrixBase
inline const PartialPivLU<PlainObject> lu() const;
+ EIGEN_DEVICE_FUNC
inline const Inverse<Derived> inverse() const;
template<typename ResultType>
@@ -337,12 +338,15 @@ template<typename Derived> class MatrixBase
bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const;
+
template<typename ResultType>
inline void computeInverseWithCheck(
ResultType& inverse,
bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const;
+
+ EIGEN_DEVICE_FUNC
Scalar determinant() const;
/////////// Cholesky module ///////////
@@ -414,15 +418,19 @@ template<typename Derived> class MatrixBase
////////// Householder module ///////////
+ EIGEN_DEVICE_FUNC
void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
template<typename EssentialPart>
+ EIGEN_DEVICE_FUNC
void makeHouseholder(EssentialPart& essential,
Scalar& tau, RealScalar& beta) const;
template<typename EssentialPart>
+ EIGEN_DEVICE_FUNC
void applyHouseholderOnTheLeft(const EssentialPart& essential,
const Scalar& tau,
Scalar* workspace);
template<typename EssentialPart>
+ EIGEN_DEVICE_FUNC
void applyHouseholderOnTheRight(const EssentialPart& essential,
const Scalar& tau,
Scalar* workspace);
diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h
index e94c8ee96..570283d90 100644
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@@ -75,10 +75,10 @@ class NoAlias
*
   * More precisely, noalias() allows one to bypass the EvalBeforeAssignBit flag.
* Currently, even though several expressions may alias, only product
- * expressions have this flag. Therefore, noalias() is only usefull when
+ * expressions have this flag. Therefore, noalias() is only useful when
* the source expression contains a matrix product.
*
- * Here are some examples where noalias is usefull:
+ * Here are some examples where noalias is useful:
* \code
* D.noalias() = A * B;
* D.noalias() += A.transpose() * B;
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 5567d4c90..b053cff07 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -21,12 +21,14 @@ template< typename T,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits10_impl
{
+ EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits10; }
};
template<typename T>
struct default_digits10_impl<T,false,false> // Floating point
{
+ EIGEN_DEVICE_FUNC
static int run() {
using std::log10;
using std::ceil;
@@ -38,6 +40,7 @@ struct default_digits10_impl<T,false,false> // Floating point
template<typename T>
struct default_digits10_impl<T,false,true> // Integer
{
+ EIGEN_DEVICE_FUNC
static int run() { return 0; }
};
@@ -49,12 +52,14 @@ template< typename T,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits_impl
{
+ EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits; }
};
template<typename T>
struct default_digits_impl<T,false,false> // Floating point
{
+ EIGEN_DEVICE_FUNC
static int run() {
using std::log;
using std::ceil;
@@ -66,6 +71,7 @@ struct default_digits_impl<T,false,false> // Floating point
template<typename T>
struct default_digits_impl<T,false,true> // Integer
{
+ EIGEN_DEVICE_FUNC
static int run() { return 0; }
};
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index b1fb455b9..acd085301 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
#endif
/** \returns the number of rows */
- inline Index rows() const { return Index(indices().size()); }
+ inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
/** \returns the number of columns */
- inline Index cols() const { return Index(indices().size()); }
+ inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
/** \returns the size of a side of the respective square matrix, i.e., the number of indices */
- inline Index size() const { return Index(indices().size()); }
+ inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 1dc7e223a..da329fd4f 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -780,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
resize(size);
}
- // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
+ // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@@ -921,13 +921,19 @@ namespace internal {
template <typename Derived, typename OtherDerived, bool IsVector>
struct conservative_resize_like_impl
{
+ #if EIGEN_HAS_TYPE_TRAITS
+ static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
+ #else
+ static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
+ #endif
static void run(DenseBase<Derived>& _this, Index rows, Index cols)
{
if (_this.rows() == rows && _this.cols() == cols) return;
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
- if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
- (!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns
+ if ( IsRelocatable
+ && (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
+ (!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns
{
internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
_this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@@ -955,8 +961,9 @@ struct conservative_resize_like_impl
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
- if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
- (!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns
+ if ( IsRelocatable &&
+ (( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
+ (!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns
{
const Index new_rows = other.rows() - _this.rows();
const Index new_cols = other.cols() - _this.cols();
@@ -984,13 +991,18 @@ template <typename Derived, typename OtherDerived>
struct conservative_resize_like_impl<Derived,OtherDerived,true>
: conservative_resize_like_impl<Derived,OtherDerived,false>
{
- using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
+ typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
+ using Base::run;
+ using Base::IsRelocatable;
static void run(DenseBase<Derived>& _this, Index size)
{
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
- _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+ if(IsRelocatable)
+ _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+ else
+ Base::run(_this.derived(), new_rows, new_cols);
}
static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@@ -1001,7 +1013,10 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
- _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+ if(IsRelocatable)
+ _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+ else
+ Base::run(_this.derived(), new_rows, new_cols);
if (num_new_elements > 0)
_this.tail(num_new_elements) = other.tail(num_new_elements);
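The IsRelocatable guard above boils down to a trivial-copyability test. A compile-time sketch of the distinction it draws (assumes C++11 <type_traits>):

#include <type_traits>
#include <string>
static_assert(std::is_trivially_copyable<double>::value,
              "raw in-place storage growth is safe");
static_assert(!std::is_trivially_copyable<std::string>::value,
              "needs element-wise copy into fresh storage instead");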
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index ae0c94b38..70790dbd4 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
- EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
- EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
@@ -116,7 +116,7 @@ class dense_product_base
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
{};
-/** Convertion to scalar for inner-products */
+/** Conversion to scalar for inner-products */
template<typename Lhs, typename Rhs, int Option>
class dense_product_base<Lhs, Rhs, Option, InnerProduct>
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@@ -127,7 +127,7 @@ public:
using Base::derived;
typedef typename Base::Scalar Scalar;
- operator const Scalar() const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
{
return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
}
@@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
public:
- EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
return internal::evaluator<Derived>(derived()).coeff(row,col);
}
- EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 86966abdb..0330b5741 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
typedef Product<Lhs, Rhs, Options> XprType;
typedef product_evaluator<XprType> Base;
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
};
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
@@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const Product<Lhs, Rhs, DefaultProduct> > XprType;
typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
{}
};
@@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
xpr.index() ))
@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
@@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
{
call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@@ -217,7 +217,7 @@ template<typename DstXprType, typename OtherXpr, typename ProductType, typename
struct assignment_from_xpr_op_product
{
template<typename SrcXprType, typename InitialFunc>
- static EIGEN_STRONG_INLINE
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
{
call_assignment_no_alias(dst, src.lhs(), Func1());
@@ -246,19 +246,19 @@ template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
{
template<typename Dst>
- static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
}
template<typename Dst>
- static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
}
template<typename Dst>
- static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
};
@@ -269,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
// Column major result
template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{
evaluator<Rhs> rhsEval(rhs);
- typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+ ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dst.cols();
@@ -282,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
// Row major result
template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
{
evaluator<Lhs> lhsEval(lhs);
- typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+ ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dst.rows();
@@ -300,37 +300,37 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
- struct set { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
- struct add { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
- struct sub { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+ struct set { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
+ struct add { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+ struct sub { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
struct adds {
Scalar m_scale;
explicit adds(const Scalar& s) : m_scale(s) {}
- template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+ template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
dst.const_cast_derived() += m_scale * src;
}
};
template<typename Dst>
- static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
}
template<typename Dst>
- static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
}
template<typename Dst>
- static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
}
template<typename Dst>
- static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
}
@@ -345,19 +345,19 @@ struct generic_product_impl_base
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst>
- static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
template<typename Dst>
- static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
template<typename Dst>
- static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
template<typename Dst>
- static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
};
@@ -373,7 +373,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
template<typename Dest>
- static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
LhsNested actual_lhs(lhs);
RhsNested actual_rhs(rhs);
@@ -390,26 +390,52 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst>
- static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
// but easier on the compiler side
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
}
-
+
template<typename Dst>
- static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// dst.noalias() += lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
}
template<typename Dst>
- static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// dst.noalias() -= lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
}
+
+ // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+ // dst {,+,-}= s * (A.lazyProduct(B))
+  // This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
+  // For them, this strategy is also faster than simply bypassing the heap allocation through
+  // stack allocation.
+  // For fixed-size matrices, this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior also depends a lot on the compiler... so let's be conservative and enable this for dynamic sizes only,
+  // that is, when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+ template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+ const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+ {
+ call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+ }
+
+  // Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
+ // overload more specialized.
+ template<typename Dst, typename LhsT, typename Func>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+ {
+ call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+ }
+
// template<typename Dst>
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
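To illustrate the new eval_dynamic overloads (a sketch with hypothetical sizes, not part of the patch): when the left factor is itself a scalar-times-expression node, the scalar is hoisted so that no temporary for s*A is materialized.

#include <Eigen/Dense>
int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64), B = A, C(64, 64);
  // Conceptually rewritten as 0.5 * A.lazyProduct(B) by the overload above,
  // instead of first evaluating the temporary (0.5 * A).
  C.noalias() = (0.5 * A) * B;
  return 0;
}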
@@ -741,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dest>
- static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC
+ void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
}
@@ -785,7 +812,11 @@ public:
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
- Alignment = evaluator<MatrixType>::Alignment
+ Alignment = evaluator<MatrixType>::Alignment,
+
+ AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
+ || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
+ || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
};
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@@ -797,7 +828,10 @@ public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{
- return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+ if(AsScalarProduct)
+ return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+ else
+ return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
}
protected:
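A small sketch of the degenerate case the new AsScalarProduct flag captures: a compile-time 1x1 diagonal acts as a uniform scalar, so every coefficient reads the same diagonal entry.

#include <Eigen/Dense>
int main() {
  Eigen::RowVector3d v(1, 2, 3);
  Eigen::Matrix<double, 1, 1> s;
  s << 2.0;
  Eigen::RowVector3d r = s.asDiagonal() * v;  // each coeff uses s(0)
  return r.isApprox(2.0 * v) ? 0 : 1;
}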
@@ -851,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
}
-#ifndef EIGEN_CUDACC
+#ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{
@@ -895,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
}
-#ifndef EIGEN_CUDACC
+#ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 2b5b73bf7..e449ef3ac 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -23,22 +23,22 @@ namespace internal {
* Part 1 : the logic deciding a strategy for vectorization and unrolling
***************************************************************************/
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
struct redux_traits
{
public:
- typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
+ typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
enum {
PacketSize = unpacket_traits<PacketType>::size,
- InnerMaxSize = int(Derived::IsRowMajor)
- ? Derived::MaxColsAtCompileTime
- : Derived::MaxRowsAtCompileTime
+ InnerMaxSize = int(Evaluator::IsRowMajor)
+ ? Evaluator::MaxColsAtCompileTime
+ : Evaluator::MaxRowsAtCompileTime
};
enum {
- MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+ MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
&& (functor_traits<Func>::PacketAccess),
- MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+ MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
};
@@ -51,8 +51,8 @@ public:
public:
enum {
- Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
- : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+ Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
+ : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
};
@@ -64,9 +64,9 @@ public:
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
- std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+ std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
std::cerr.setf(std::ios::hex, std::ios::basefield);
- EIGEN_DEBUG_VAR(Derived::Flags)
+ EIGEN_DEBUG_VAR(Evaluator::Flags)
std::cerr.unsetf(std::ios::hex);
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
@@ -87,88 +87,88 @@ public:
/*** no vectorization ***/
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
struct redux_novec_unroller
{
enum {
HalfLength = Length/2
};
- typedef typename Derived::Scalar Scalar;
+ typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+ static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
{
- return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
- redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+ return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+ redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
}
};
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
{
enum {
- outer = Start / Derived::InnerSizeAtCompileTime,
- inner = Start % Derived::InnerSizeAtCompileTime
+ outer = Start / Evaluator::InnerSizeAtCompileTime,
+ inner = Start % Evaluator::InnerSizeAtCompileTime
};
- typedef typename Derived::Scalar Scalar;
+ typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+ static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
{
- return mat.coeffByOuterInner(outer, inner);
+ return eval.coeffByOuterInner(outer, inner);
}
};
// This is actually dead code and will never be called. It is required
// only to prevent false warnings about failed inlining:
// for length 0, run() is never called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+template<typename Func, typename Evaluator, int Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
{
- typedef typename Derived::Scalar Scalar;
+ typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+ static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
};
/*** vectorization ***/
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
struct redux_vec_unroller
{
enum {
- PacketSize = redux_traits<Func, Derived>::PacketSize,
+ PacketSize = redux_traits<Func, Evaluator>::PacketSize,
HalfLength = Length/2
};
- typedef typename Derived::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
- static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+ static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
{
return func.packetOp(
- redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
- redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+ redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
+ redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
}
};
-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+template<typename Func, typename Evaluator, int Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
{
enum {
- index = Start * redux_traits<Func, Derived>::PacketSize,
- outer = index / int(Derived::InnerSizeAtCompileTime),
- inner = index % int(Derived::InnerSizeAtCompileTime),
- alignment = Derived::Alignment
+ index = Start * redux_traits<Func, Evaluator>::PacketSize,
+ outer = index / int(Evaluator::InnerSizeAtCompileTime),
+ inner = index % int(Evaluator::InnerSizeAtCompileTime),
+ alignment = Evaluator::Alignment
};
- typedef typename Derived::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
- static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+ static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
{
- return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+ return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
}
};
@@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
* Part 3 : implementation of all cases
***************************************************************************/
-template<typename Func, typename Derived,
- int Traversal = redux_traits<Func, Derived>::Traversal,
- int Unrolling = redux_traits<Func, Derived>::Unrolling
+template<typename Func, typename Evaluator,
+ int Traversal = redux_traits<Func, Evaluator>::Traversal,
+ int Unrolling = redux_traits<Func, Evaluator>::Unrolling
>
struct redux_impl;
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
{
- typedef typename Derived::Scalar Scalar;
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+ typedef typename Evaluator::Scalar Scalar;
+
+ template<typename XprType>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+ Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
- eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+ eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
Scalar res;
- res = mat.coeffByOuterInner(0, 0);
- for(Index i = 1; i < mat.innerSize(); ++i)
- res = func(res, mat.coeffByOuterInner(0, i));
- for(Index i = 1; i < mat.outerSize(); ++i)
- for(Index j = 0; j < mat.innerSize(); ++j)
- res = func(res, mat.coeffByOuterInner(i, j));
+ res = eval.coeffByOuterInner(0, 0);
+ for(Index i = 1; i < xpr.innerSize(); ++i)
+ res = func(res, eval.coeffByOuterInner(0, i));
+ for(Index i = 1; i < xpr.outerSize(); ++i)
+ for(Index j = 0; j < xpr.innerSize(); ++j)
+ res = func(res, eval.coeffByOuterInner(i, j));
return res;
}
};
-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
- : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
-{};
+template<typename Func, typename Evaluator>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
+ : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
+{
+ typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+ typedef typename Evaluator::Scalar Scalar;
+ template<typename XprType>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+ Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
+ {
+ return Base::run(eval,func);
+ }
+};
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
{
- typedef typename Derived::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
- static Scalar run(const Derived &mat, const Func& func)
+ template<typename XprType>
+ static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
- const Index size = mat.size();
+ const Index size = xpr.size();
- const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+ const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
enum {
- alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
- alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+ alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+ alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
};
- const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+ const Index alignedStart = internal::first_default_aligned(xpr);
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
Scalar res;
if(alignedSize)
{
- PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+ PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
{
- PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+ PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
{
- packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
- packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+ packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
+ packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
}
packet_res0 = func.packetOp(packet_res0,packet_res1);
if(alignedEnd>alignedEnd2)
- packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+ packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
}
res = func.predux(packet_res0);
for(Index index = 0; index < alignedStart; ++index)
- res = func(res,mat.coeff(index));
+ res = func(res,eval.coeff(index));
for(Index index = alignedEnd; index < size; ++index)
- res = func(res,mat.coeff(index));
+ res = func(res,eval.coeff(index));
}
else // too small to vectorize anything.
// since this is dynamic-size, hence inefficient anyway for such small sizes, don't try to optimize.
{
- res = mat.coeff(0);
+ res = eval.coeff(0);
for(Index index = 1; index < size; ++index)
- res = func(res,mat.coeff(index));
+ res = func(res,eval.coeff(index));
}
return res;
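The traversal above is the classic peel-and-unroll pattern: a scalar head up to the first aligned element, a packet-wide main body (two packet accumulators in the real code), then a scalar tail. A hedged scalar sketch of the same control flow, with hypothetical names:

    float reduce_sum_sketch(const float* x, int n, int alignedStart, int packetSize) {
      const int alignedSize = ((n - alignedStart) / packetSize) * packetSize;
      const int alignedEnd  = alignedStart + alignedSize;
      float res = 0.f;
      for (int i = 0; i < alignedStart; ++i)           // unaligned head, scalar
        res += x[i];
      for (int i = alignedStart; i < alignedEnd; ++i)  // main body, SIMD in the real code
        res += x[i];
      for (int i = alignedEnd; i < n; ++i)             // leftover tail, scalar
        res += x[i];
      return res;
    }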
@@ -265,130 +277,106 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
};
// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
+template<typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
{
- typedef typename Derived::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketType;
+ typedef typename Evaluator::Scalar Scalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
- EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+ template<typename XprType>
+ EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
- eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
- const Index innerSize = mat.innerSize();
- const Index outerSize = mat.outerSize();
+ eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+ const Index innerSize = xpr.innerSize();
+ const Index outerSize = xpr.outerSize();
enum {
- packetSize = redux_traits<Func, Derived>::PacketSize
+ packetSize = redux_traits<Func, Evaluator>::PacketSize
};
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
Scalar res;
if(packetedInnerSize)
{
- PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+ PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
for(Index j=0; j<outerSize; ++j)
for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
- packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+ packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
res = func.predux(packet_res);
for(Index j=0; j<outerSize; ++j)
for(Index i=packetedInnerSize; i<innerSize; ++i)
- res = func(res, mat.coeffByOuterInner(j,i));
+ res = func(res, eval.coeffByOuterInner(j,i));
}
else // too small to vectorize anything.
// since this is dynamic-size, hence inefficient anyway for such small sizes, don't try to optimize.
{
- res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+ res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
}
return res;
}
};
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
{
- typedef typename Derived::Scalar Scalar;
+ typedef typename Evaluator::Scalar Scalar;
- typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+ typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
enum {
- PacketSize = redux_traits<Func, Derived>::PacketSize,
- Size = Derived::SizeAtCompileTime,
+ PacketSize = redux_traits<Func, Evaluator>::PacketSize,
+ Size = Evaluator::SizeAtCompileTime,
VectorizedSize = (Size / PacketSize) * PacketSize
};
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
+ template<typename XprType>
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
+ Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
{
- eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+ EIGEN_ONLY_USED_FOR_DEBUG(xpr)
+ eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
if (VectorizedSize > 0) {
- Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+ Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
if (VectorizedSize != Size)
- res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+ res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
return res;
}
else {
- return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+ return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
}
}
};
// evaluator adaptor
template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
{
+ typedef internal::evaluator<_XprType> Base;
public:
typedef _XprType XprType;
- EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+ EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
- typedef typename XprType::PacketReturnType PacketReturnType;
enum {
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
// TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
- Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+ Flags = Base::Flags & ~DirectAccessBit,
IsRowMajor = XprType::IsRowMajor,
SizeAtCompileTime = XprType::SizeAtCompileTime,
- InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
- CoeffReadCost = evaluator<XprType>::CoeffReadCost,
- Alignment = evaluator<XprType>::Alignment
+ InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
};
- EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
- EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
- EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
- EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
- EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
-
- EIGEN_DEVICE_FUNC
- CoeffReturnType coeff(Index row, Index col) const
- { return m_evaluator.coeff(row, col); }
-
- EIGEN_DEVICE_FUNC
- CoeffReturnType coeff(Index index) const
- { return m_evaluator.coeff(index); }
-
- template<int LoadMode, typename PacketType>
- PacketType packet(Index row, Index col) const
- { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
-
- template<int LoadMode, typename PacketType>
- PacketType packet(Index index) const
- { return m_evaluator.template packet<LoadMode,PacketType>(index); }
-
EIGEN_DEVICE_FUNC
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
- { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+ { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
template<int LoadMode, typename PacketType>
PacketType packetByOuterInner(Index outer, Index inner) const
- { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+ { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
- const XprType & nestedExpression() const { return m_xpr; }
-
-protected:
- internal::evaluator<XprType> m_evaluator;
- const XprType &m_xpr;
};
} // end namespace internal
@@ -407,7 +395,7 @@ protected:
*/
template<typename Derived>
template<typename Func>
-EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::redux(const Func& func) const
{
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
@@ -415,7 +403,9 @@ DenseBase<Derived>::redux(const Func& func) const
typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());
- return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+ // The initial expression is passed to the reducer as an additional argument instead of
+ // being stored as a member of redux_evaluator, to help keep the evaluator lightweight.
+ return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
}
/** \returns the minimum of all coefficients of \c *this.
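For reference, the redux() entry point rewired above folds all coefficients with an arbitrary binary functor; a hedged usage sketch (assuming C++11; a lambda is accepted through the default internal::functor_traits):

    #include <Eigen/Dense>
    #include <iostream>
    int main() {
      Eigen::Matrix2d m;
      m << 1, 2,
           3, 4;
      // sum(), prod(), minCoeff(), ... are thin wrappers around redux()
      double s = m.redux([](double a, double b) { return a + b; });
      std::cout << s << std::endl;  // prints 10
    }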
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index abb1e5121..ac9502bc4 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -95,6 +95,8 @@ protected:
template<typename Expression>
EIGEN_DEVICE_FUNC void construct(Expression& expr)
{
+ EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
+
if(PlainObjectType::RowsAtCompileTime==1)
{
eigen_assert(expr.rows()==1 || expr.cols()==1);
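A hedged illustration of what the new static assertion catches (takes_vec3 is a hypothetical helper): binding a Ref to an expression whose fixed dimensions cannot match is now rejected at compile time instead of by a runtime assert:

    void takes_vec3(Eigen::Ref<const Eigen::Vector3f> v);
    // takes_vec3(Eigen::Vector4f::Zero());  // now fails to compile via
    //                                       // EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE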
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 7e71fe3c0..2cf3fa1ef 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -71,7 +71,9 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
EIGEN_DEVICE_FUNC
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
- {}
+ {
+ EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
+ }
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
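Illustration (hedged sketch): only the plain Lower and Upper modes are meaningful for a self-adjoint view, which the new assertion enforces:

    Eigen::Matrix3d m;
    m.selfadjointView<Eigen::Lower>();        // fine
    // m.selfadjointView<Eigen::UnitLower>(); // now a compile-time error:
    // SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY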
@@ -189,7 +191,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
}
- typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
+ typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC
inline const ConjugateReturnType conjugate() const
diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index 50099df82..7c89c2e23 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -17,7 +17,6 @@ namespace Eigen {
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{
- typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
return derived();
}
@@ -25,7 +24,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(co
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{
- typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
return derived();
}
@@ -33,7 +31,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(co
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{
- typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
return derived();
}
@@ -41,7 +38,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(co
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{
- typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
return derived();
}
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index a8daea511..2bf940a26 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -181,7 +181,7 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
}
};
-} // end namepsace internal
+} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index 8a4adc229..702a5485c 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -56,7 +56,8 @@ class SolverBase : public EigenBase<Derived>
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
internal::traits<Derived>::MaxColsAtCompileTime>::ret),
IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
- || internal::traits<Derived>::MaxColsAtCompileTime == 1
+ || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+ NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
};
/** Default constructor */
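A hedged reading of the new member: NumDimensions follows the 0/1/2 convention (1x1, vector, matrix) used elsewhere for Tensor interoperability; for instance (hypothetical check, assuming C++11):

    static_assert(Eigen::FullPivLU<Eigen::MatrixXd>::NumDimensions == 2,
                  "a solver over a dynamic matrix is 2-dimensional");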
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index e0e7a0c8f..77ea3c261 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
ssq += (bl*invScale).squaredNorm();
}
+template<typename VectorType, typename RealScalar>
+void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
+{
+ typedef typename VectorType::Scalar Scalar;
+ const Index blockSize = 4096;
+
+ typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
+ typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
+ const VectorTypeCopy copy(vec);
+
+ enum {
+ CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
+ || (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+ ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+ && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
+ };
+ typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
+ typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
+ Index n = vec.size();
+
+ Index bi = internal::first_default_aligned(copy);
+ if (bi>0)
+ internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
+ for (; bi<n; bi+=blockSize)
+ internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
+}
+
+template<typename VectorType>
+typename VectorType::RealScalar
+stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
+{
+ using std::sqrt;
+ using std::abs;
+
+ Index n = vec.size();
+
+ if(n==1)
+ return abs(vec.coeff(0));
+
+ typedef typename VectorType::RealScalar RealScalar;
+ RealScalar scale(0);
+ RealScalar invScale(1);
+ RealScalar ssq(0); // sum of squares
+
+ stable_norm_impl_inner_step(vec, ssq, scale, invScale);
+
+ return scale * sqrt(ssq);
+}
+
+template<typename MatrixType>
+typename MatrixType::RealScalar
+stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
+{
+ using std::sqrt;
+
+ typedef typename MatrixType::RealScalar RealScalar;
+ RealScalar scale(0);
+ RealScalar invScale(1);
+ RealScalar ssq(0); // sum of squares
+
+ for(Index j=0; j<mat.outerSize(); ++j)
+ stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
+ return scale * sqrt(ssq);
+}
+
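The kernel these helpers drive accumulates a scaled sum of squares so that squaring can neither overflow nor underflow. A hedged standalone sketch of the same rescaling idea (classic LAPACK-style, not the patch's exact kernel):

    #include <cmath>
    #include <vector>
    double stable_norm_sketch(const std::vector<double>& v) {
      double scale = 0.0, ssq = 1.0;  // invariant: sum of squares so far == scale*scale*ssq
      for (double x : v) {
        double ax = std::abs(x);
        if (ax == 0.0) continue;
        if (ax > scale) {             // grow the scale up to ax, rescaling ssq
          double r = scale / ax;
          ssq = 1.0 + ssq * r * r;
          scale = ax;
        } else {
          double r = ax / scale;
          ssq += r * r;
        }
      }
      return scale * std::sqrt(ssq);
    }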
template<typename Derived>
inline typename NumTraits<typename traits<Derived>::Scalar>::Real
blueNorm_impl(const EigenBase<Derived>& _vec)
@@ -98,12 +163,16 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
RealScalar asml = RealScalar(0);
RealScalar amed = RealScalar(0);
RealScalar abig = RealScalar(0);
- for(typename Derived::InnerIterator it(vec, 0); it; ++it)
+
+ for(Index j=0; j<vec.outerSize(); ++j)
{
- RealScalar ax = abs(it.value());
- if(ax > ab2) abig += numext::abs2(ax*s2m);
- else if(ax < b1) asml += numext::abs2(ax*s1m);
- else amed += numext::abs2(ax);
+ for(typename Derived::InnerIterator it(vec, j); it; ++it)
+ {
+ RealScalar ax = abs(it.value());
+ if(ax > ab2) abig += numext::abs2(ax*s2m);
+ else if(ax < b1) asml += numext::abs2(ax*s1m);
+ else amed += numext::abs2(ax);
+ }
}
if(amed!=amed)
return amed; // we got a NaN
@@ -156,36 +225,7 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::stableNorm() const
{
- using std::sqrt;
- using std::abs;
- const Index blockSize = 4096;
- RealScalar scale(0);
- RealScalar invScale(1);
- RealScalar ssq(0); // sum of square
-
- typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
- typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
- const DerivedCopy copy(derived());
-
- enum {
- CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
- || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
- ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
- && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
- };
- typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
- typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
- Index n = size();
-
- if(n==1)
- return abs(this->coeff(0));
-
- Index bi = internal::first_default_aligned(copy);
- if (bi>0)
- internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
- for (; bi<n; bi+=blockSize)
- internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
- return scale * sqrt(ssq);
+ return internal::stable_norm_impl(derived());
}
/** \returns the \em l2 norm of \c *this using Blue's algorithm.
@@ -213,7 +253,10 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::hypotNorm() const
{
- return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+ if(size()==1)
+ return numext::abs(coeff(0,0));
+ else
+ return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
}
} // end namespace Eigen
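For context, a hedged sketch of why these alternatives to norm() exist; they trade speed for a squaring step that cannot overflow:

    #include <Eigen/Dense>
    #include <iostream>
    int main() {
      Eigen::Vector2d v(1e300, 1e300);
      std::cout << v.norm()       << "\n"   // squaredNorm() overflows: inf
                << v.stableNorm() << "\n"   // ~1.414e300
                << v.hypotNorm()  << "\n";  // ~1.414e300
    }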
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index ba7d6e629..d7c204579 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
nestedExpression() { return m_matrix; }
/** \internal */
+ EIGEN_DEVICE_FUNC
void resize(Index nrows, Index ncols) {
m_matrix.resize(ncols,nrows);
}
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index 19c17bb4a..81a4a5855 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -84,7 +84,7 @@ class TranspositionsBase
}
// FIXME: do we want such methods ?
- // might be usefull when the target matrix expression is complex, e.g.:
+ // might be useful when the target matrix expression is complex, e.g.:
// object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
/*
template<typename MatrixType>
@@ -384,7 +384,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
const Product<OtherDerived, Transpose, AliasFreeProduct>
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
{
- return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
+ return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
}
/** \returns the \a matrix with the inverse transpositions applied to the rows.
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index ed80da36a..521de6160 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
inline Index innerStride() const { return derived().innerStride(); }
// dummy resize function
+ EIGEN_DEVICE_FUNC
void resize(Index rows, Index cols)
{
EIGEN_UNUSED_VARIABLE(rows);
@@ -470,7 +471,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
* \a Side==OnTheRight.
*
- * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+ * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
*
* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
@@ -496,7 +497,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
- * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+ * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
*
* See TriangularView::solve() for the details.
*/
@@ -716,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
{
typedef TriangularView<MatrixType,Mode> XprType;
typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+ EIGEN_DEVICE_FUNC
unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
};
diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h
index d72fbf7e9..0ede5d58e 100644
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h
@@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
* most of the time this is the only way it is used.
*
- * However, if you want to directly maniputate sub-vector expressions,
+ * However, if you want to directly manipulate sub-vector expressions,
* for instance if you want to write a function returning such an expression, you
* will need to use this class.
*
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 636230944..774e64981 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
}
#ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
@@ -343,9 +343,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
__m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
-
+ #if 0
+ // This version is unlikely to be faster as _mm256_shuffle_pd and _mm256_permute_pd
+ // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
return _mm256_permute_pd(swap_halves,5);
+ #endif
}
// pabs should be ok
@@ -412,7 +415,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}
-template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
+template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
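The rename above keeps the semantics of the half-size reduction; a hedged usage sketch (demo is a hypothetical function; assumes AVX is enabled, and predux_half_dowto4 is spelled exactly as the patch introduces it):

    #include <Eigen/Core>
    using namespace Eigen::internal;
    void demo(const float* eight) {
      Packet8f a = ploadu<Packet8f>(eight);  // {a0, ..., a7}
      Packet4f h = predux_half_dowto4(a);    // {a0+a4, a1+a5, a2+a6, a3+a7}
      EIGEN_UNUSED_VARIABLE(h);
    }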
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 399be0ee4..ba1246722 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
// x = x + x - 1.0;
// } else { x = x - 1.0; }
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
- Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
+ Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
x = psub(x, p16f_1);
- e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
+ e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
x = padd(x, tmp);
Packet16f x2 = pmul(x, x);
@@ -119,8 +119,9 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y2);
// Filter out invalid inputs, i.e. a negative arg yields NaN and a zero arg yields -INF.
- return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
- _mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
+ return _mm512_mask_blend_ps(iszero_mask,
+ _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
+ p16f_minus_inf);
}
#endif
@@ -257,50 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
- _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
- _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
-
- Packet16f neg_half = pmul(_x, p16f_minus_half);
+ Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+ __mmask16 denormal_mask = _mm512_kand(
+ _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+ _CMP_LT_OQ),
+ _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
- Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
- _mm512_setzero_ps());
+ Packet16f x = _mm512_rsqrt14_ps(_x);
// Do a single step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+ x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
- // Multiply the original _x by it's reciprocal square root to extract the
- // square root.
- return pmul(_x, x);
+ // Flush results for denormals to zero.
+ return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
- _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
- _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
- _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
-
- Packet8d neg_half = pmul(_x, p8d_minus_half);
+ Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
+ __mmask16 denormal_mask = _mm512_kand(
+ _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+ _CMP_LT_OQ),
+ _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
- Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
- _mm512_setzero_pd());
+ Packet8d x = _mm512_rsqrt14_pd(_x);
- // Do a first step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+ // Do a single step of Newton's iteration.
+ x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
// Do a second step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+ x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
- // Multiply the original _x by it's reciprocal square root to extract the
- // square root.
- return pmul(_x, x);
+ return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
}
#else
template <>
@@ -333,20 +323,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
- Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
- _mm512_rsqrt14_ps(_x));
+ Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps(
- neg_mask, p16f_nan,
- _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
+ neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places.
- return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
+ return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
}
template <>
@@ -363,14 +351,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
- Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
- _mm512_rsqrt14_pd(_x));
+ Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd(
- neg_mask, p8d_nan,
- _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
+ neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@@ -379,9 +365,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places.
- return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
+ return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
}
-#else
+#elif defined(EIGEN_VECTORIZE_AVX512ER)
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x);
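The psqrt/prsqrt rewrites above all follow the rsqrt-estimate-plus-Newton recipe; a hedged scalar model of one refinement step as it appears in the patch:

    // y0 is a coarse estimate of 1/sqrt(x), e.g. from _mm512_rsqrt14_ps.
    float sqrt_via_rsqrt_sketch(float x, float y0) {
      float neg_half = -0.5f * x;
      float y = y0 * (neg_half * y0 * y0 + 1.5f);  // one Newton step, the pmadd above
      return x * y;                                // sqrt(x) == x * (1/sqrt(x))
    }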
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 12b897572..4e2e916de 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -54,6 +54,7 @@ template<> struct packet_traits<float> : default_packet_traits
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 1,
+ HasBlend = 0,
#if EIGEN_GNUC_AT_LEAST(5, 3)
#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
@@ -470,6 +471,8 @@ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
return pairs;
}
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
// Loads 4 doubles from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3}
template <>
@@ -481,6 +484,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
return x;
}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+ __m512d x = _mm512_setzero_pd();
+ x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
+ return x;
+}
+#endif
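A hedged reading of the fallback path: each mask 0x3<<2k covers lanes 2k and 2k+1, so from[k] is broadcast into one adjacent pair of lanes, producing {a0, a0, a1, a1, a2, a2, a3, a3} without AVX512DQ's _mm512_insertf64x2.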
// Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
@@ -590,9 +604,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
pstore(to, pa);
}
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@@ -874,7 +888,7 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
}
template <>
-EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
@@ -890,7 +904,7 @@ EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
+EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_add_pd(lane0, lane1);
@@ -1272,11 +1286,38 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
return Packet16f();
}
template <>
-EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
- const Packet8d& /*thenPacket*/,
- const Packet8d& /*elsePacket*/) {
- assert(false && "To be implemented");
- return Packet8d();
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
+ const Packet8d& thenPacket,
+ const Packet8d& elsePacket) {
+ __mmask8 m = (ifPacket.select[0] )
+ | (ifPacket.select[1]<<1)
+ | (ifPacket.select[2]<<2)
+ | (ifPacket.select[3]<<3)
+ | (ifPacket.select[4]<<4)
+ | (ifPacket.select[5]<<5)
+ | (ifPacket.select[6]<<6)
+ | (ifPacket.select[7]<<7);
+ return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pinsertfirst(const Packet16f& a, float b)
+{
+ return _mm512_mask_broadcastss_ps(a, (1), _mm_load_ss(&b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d pinsertfirst(const Packet8d& a, double b)
+{
+ return _mm512_mask_broadcastsd_pd(a, (1), _mm_load_sd(&b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pinsertlast(const Packet16f& a, float b)
+{
+ return _mm512_mask_broadcastss_ps(a, (1<<15), _mm_load_ss(&b));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b)
+{
+ return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b));
}
} // end namespace internal
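A hedged reading of the new helpers above: pblend packs the Selector into an 8-bit mask where a set bit k takes lane k from thenPacket (e.g. select = {1,0,1,0,1,0,1,0} gives m == 0x55, so lanes 0, 2, 4 and 6 come from thenPacket); pinsertfirst(a, b) yields {b, a1, ..., a15} and pinsertlast(a, b) yields {a0, ..., a14, b}.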
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index b3f1ea199..7f4e90f75 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@@ -388,10 +388,30 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ #ifdef __VSX__
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+ Packet4f ret;
+ __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ return ret;
+ #else
+ return vec_min(a, b);
+ #endif
+}
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+ #ifdef __VSX__
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+ Packet4f ret;
+ __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ return ret;
+ #else
+ return vec_max(a, b);
+ #endif
+}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@@ -434,7 +454,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
}
#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
@@ -500,7 +520,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
}
#else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
EIGEN_DEBUG_ALIGNED_STORE
@@ -764,7 +784,7 @@ typedef __vector __bool long Packet2bl;
static Packet2l p2l_ONE = { 1, 1 };
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2d p2d_ONE = { 1.0, 1.0 };
+static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_MZERO = { -0.0, -0.0 };
@@ -910,9 +930,21 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+ // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+ Packet2d ret;
+ __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ return ret;
+}
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+ // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+ Packet2d ret;
+ __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ return ret;
+}
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
@@ -969,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
Packet2d v[2], sum;
v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
-
+
#ifdef _BIG_ENDIAN
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
#else
@@ -1022,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
- Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+ Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
return vec_sel(elsePacket, thenPacket, mask);
}
#endif // __VSX__
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/GPU/Half.h
index bfda39df5..ab9d27591 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/GPU/Half.h
@@ -26,15 +26,15 @@
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// type Eigen::half (inheriting from CUDA's __half struct) with
+// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
-#ifndef EIGEN_HALF_CUDA_H
-#define EIGEN_HALF_CUDA_H
+#ifndef EIGEN_HALF_GPU_H
+#define EIGEN_HALF_GPU_H
#if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@@ -49,16 +49,41 @@ struct half;
namespace half_impl {
-#if !defined(EIGEN_HAS_CUDA_FP16)
+#if !defined(EIGEN_HAS_GPU_FP16)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
-#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+#elif defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+// Make a __half_raw definition that is
+// ++ compatible with that of Eigen and
+// ++ adds an implicit conversion to the native __half of the old HIP implementation.
+//
+// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementations.
+//
+// In the old HIP implementation,
+// ++ __half is a typedef of __fp16
+// ++ the "__h*" routines take "__half" arguments
+// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
+// that conversion in each call to a "__h*" routine; that is why we provide the "operator __half" routine.
+struct __half_raw {
+ EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+ union {
+ unsigned short x;
+ __half data;
+ };
+ operator __half(void) const { return data; }
+};
+ #endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-typedef __half __half_raw;
+ typedef __half __half_raw;
+ #endif
#endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
@@ -69,8 +94,19 @@ struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+ EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
+ #else
+ EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
+ #endif
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+ #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+ #endif
+ #endif
#endif
};
@@ -78,17 +114,38 @@ struct half_base : public __half_raw {
// Class definition.
struct half : public half_impl::half_base {
- #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
- typedef half_impl::__half_raw __half_raw;
- #endif
+
+ // Writing this out as separate #if-else blocks makes the code easier to follow.
+ // The same applies to most #if-else blocks in this file.
+#if !defined(EIGEN_HAS_GPU_FP16)
+ typedef half_impl::__half_raw __half_raw;
+#elif defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+ typedef half_impl::__half_raw __half_raw;
+ #endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+ // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
+ // That is why this check must stay inside the #if defined(EIGEN_HAS_CUDA_FP16) block.
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ typedef half_impl::__half_raw __half_raw;
+ #endif
+#endif
EIGEN_DEVICE_FUNC half() {}
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ #endif
+ #endif
#endif
+
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -145,9 +202,64 @@ struct half : public half_impl::half_base {
}
};
+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+ static const bool is_specialized = true;
+ static const bool is_signed = true;
+ static const bool is_integer = false;
+ static const bool is_exact = false;
+ static const bool has_infinity = true;
+ static const bool has_quiet_NaN = true;
+ static const bool has_signaling_NaN = true;
+ static const float_denorm_style has_denorm = denorm_present;
+ static const bool has_denorm_loss = false;
+ static const std::float_round_style round_style = std::round_to_nearest;
+ static const bool is_iec559 = false;
+ static const bool is_bounded = false;
+ static const bool is_modulo = false;
+ static const int digits = 11;
+ static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int radix = 2;
+ static const int min_exponent = -13;
+ static const int min_exponent10 = -4;
+ static const int max_exponent = 16;
+ static const int max_exponent10 = 4;
+ static const bool traps = true;
+ static const bool tinyness_before = false;
+
+ static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+ static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+ static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+ static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+ static Eigen::half round_error() { return Eigen::half(0.5); }
+ static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+ static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
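A small sketch of what the relocated specialization provides; nothing here is CUDA- or HIP-specific:

    #include <Eigen/Core>
    #include <iostream>
    #include <limits>

    int main() {
      typedef std::numeric_limits<Eigen::half> lim;
      std::cout << "digits10 = " << lim::digits10 << "\n";        // 3
      std::cout << "epsilon  = " << float(lim::epsilon()) << "\n";
      std::cout << "max      = " << float((lim::max)()) << "\n";  // 65504
      return 0;
    }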
+
+namespace Eigen {
+
namespace half_impl {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
@@ -208,7 +320,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
#else // Emulate support for half floats
-// Definitions for CPUs and older CUDA, mostly working through conversion
+// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
// to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
@@ -245,10 +357,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
- return float(a) == float(b);
+ return numext::equal_strict(float(a),float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
- return float(a) != float(b);
+ return numext::not_equal_strict(float(a), float(b));
}
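The strict helpers compare exactly, as == and != on floats would, while keeping the intent explicit and avoiding float-equality warnings. A hedged sketch of the idea (Eigen's actual numext helpers may differ):

    template <typename X> bool equal_strict_sketch(const X& a, const X& b) { return a == b; }
    template <typename X> bool not_equal_strict_sketch(const X& a, const X& b) { return a != b; }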
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
return float(a) < float(b);
@@ -288,7 +400,8 @@ union FP32 {
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
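For reference, a compact sketch of the round-to-nearest-even step performed by the software fallback; it assumes a finite, normal float whose magnitude lands in the fp16 normal range (the real routine also handles zero, denormals, Inf and NaN):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint16_t f32_to_f16_rtne_sketch(float f) {
      uint32_t x;
      std::memcpy(&x, &f, sizeof(x));
      uint16_t sign = (uint16_t)((x >> 16) & 0x8000u);
      int32_t  e    = (int32_t)((x >> 23) & 0xffu) - 127 + 15;  // rebias the exponent
      uint32_t m    = (x & 0x7fffffu) >> 13;                    // keep 10 mantissa bits
      uint32_t rem  = x & 0x1fffu;                              // the 13 discarded bits
      if (rem > 0x1000u || (rem == 0x1000u && (m & 1u))) {      // nearest, ties to even
        if (++m == 0x400u) { m = 0; ++e; }                      // mantissa carry bumps exponent
      }
      return (uint16_t)(sign | ((uint32_t)e << 10) | m);
    }

    int main() { std::printf("0x%04x\n", f32_to_f16_rtne_sketch(1.0f)); }  // prints 0x3c00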
@@ -344,7 +457,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
@@ -378,7 +492,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
@@ -394,7 +509,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a));
#else
return half(::expf(float(a)));
@@ -404,7 +520,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return half(::hlog(a));
#else
return half(::logf(float(a)));
@@ -417,7 +534,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
@@ -439,14 +557,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a));
#else
return half(::ceilf(float(a)));
@@ -454,7 +574,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -463,7 +584,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -501,49 +623,6 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
} // end namespace internal
-} // end namespace Eigen
-
-namespace std {
-template<>
-struct numeric_limits<Eigen::half> {
- static const bool is_specialized = true;
- static const bool is_signed = true;
- static const bool is_integer = false;
- static const bool is_exact = false;
- static const bool has_infinity = true;
- static const bool has_quiet_NaN = true;
- static const bool has_signaling_NaN = true;
- static const float_denorm_style has_denorm = denorm_present;
- static const bool has_denorm_loss = false;
- static const std::float_round_style round_style = std::round_to_nearest;
- static const bool is_iec559 = false;
- static const bool is_bounded = false;
- static const bool is_modulo = false;
- static const int digits = 11;
- static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
- static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
- static const int radix = 2;
- static const int min_exponent = -13;
- static const int min_exponent10 = -4;
- static const int max_exponent = 16;
- static const int max_exponent10 = 4;
- static const bool traps = true;
- static const bool tinyness_before = false;
-
- static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
- static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
- static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
- static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
- static Eigen::half round_error() { return Eigen::half(0.5); }
- static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
- static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
- static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
- static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
-};
-}
-
-namespace Eigen {
-
template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half>
{
@@ -584,7 +663,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return Eigen::half(::hlog(a));
#else
return Eigen::half(::logf(float(a)));
@@ -618,9 +698,12 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
+
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
- #if EIGEN_CUDACC_VER < 90000
+ #if (EIGEN_CUDACC_VER < 90000) || \
+ defined(EIGEN_HAS_HIP_FP16)
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
#else
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
@@ -629,7 +712,8 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
#endif
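A hypothetical device-side use of the overload above, summing a value across a warp with a butterfly pattern (CUDA/HIP device code, not host-compilable):

    __device__ Eigen::half warp_sum_sketch(Eigen::half v) {
      for (int mask = warpSize / 2; mask > 0; mask >>= 1)
        v = v + __shfl_xor(v, mask);  // uses the Eigen::half overload above
      return v;
    }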
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
@@ -637,7 +721,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
namespace Eigen {
namespace numext {
@@ -663,4 +747,4 @@ bool (isfinite)(const Eigen::half& h) {
} // namespace numext
#endif
-#endif // EIGEN_HALF_CUDA_H
+#endif // EIGEN_HALF_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/GPU/MathFunctions.h
index ff6256ce0..d2b3a2568 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/GPU/MathFunctions.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
-#define EIGEN_MATH_FUNCTIONS_CUDA_H
+#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
+#define EIGEN_MATH_FUNCTIONS_GPU_H
namespace Eigen {
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{
@@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a)
} // end namespace Eigen
-#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
+#endif // EIGEN_MATH_FUNCTIONS_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index 97a8abe59..ddf37b9c1 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_PACKET_MATH_CUDA_H
-#define EIGEN_PACKET_MATH_CUDA_H
+#ifndef EIGEN_PACKET_MATH_GPU_H
+#define EIGEN_PACKET_MATH_GPU_H
namespace Eigen {
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
@@ -44,7 +44,11 @@ template<> struct packet_traits<float> : default_packet_traits
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
+ HasI0e = 1,
+ HasI1e = 1,
HasIGamma = 1,
+ HasIGammaDerA = 1,
+ HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
@@ -73,7 +77,11 @@ template<> struct packet_traits<double> : default_packet_traits
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
+ HasI0e = 1,
+ HasI1e = 1,
HasIGamma = 1,
+ HasIGammaDerA = 1,
+ HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
@@ -330,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) {
} // end namespace Eigen
-#endif // EIGEN_PACKET_MATH_CUDA_H
+#endif // EIGEN_PACKET_MATH_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 8a6f209c4..b0a72e1f9 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -7,15 +7,16 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
-#define EIGEN_PACKET_MATH_HALF_CUDA_H
+#ifndef EIGEN_PACKET_MATH_HALF_GPU_H
+#define EIGEN_PACKET_MATH_HALF_GPU_H
namespace Eigen {
namespace internal {
// Most of the following operations require arch >= 3.0
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
template<> struct is_arithmetic<half2> { enum { value = true }; };
@@ -43,7 +44,18 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return half2half2(from);
+#else
+ return __half2half2(from);
+#endif
+
+#else // EIGEN_CUDA_ARCH
return __half2half2(from);
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
@@ -69,20 +81,46 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half*
template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return __halves2half2((*(from+0)), (*(from+1)));
+#else
+ return __ldg((const half2*)from);
+#endif
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 350
return __ldg((const half2*)from);
#else
return __halves2half2(*(from+0), *(from+1));
#endif
+
+#endif
}
template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return __halves2half2((*(from+0)), (*(from+1)));
+#else
+ return __halves2half2(__ldg(from+0), __ldg(from+1));
+#endif
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
return __halves2half2(*(from+0), *(from+1));
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
@@ -117,15 +155,29 @@ ptranspose(PacketBlock<half2,2>& kernel) {
}
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
return __halves2half2(a, __float2half(f));
#endif
+
+#endif
}
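For reference, plset<half2>(a) produces the lanes {a, a+1}; e.g. plset<half2>(Eigen::half(2.0f)) yields {2.0, 3.0}. The HIP branch carries no arch gate because fp16 arithmetic intrinsics are assumed available whenever EIGEN_HIP_DEVICE_COMPILE is defined.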
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hadd2(a, b);
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
return __hadd2(a, b);
#else
@@ -137,9 +189,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hsub2(a, b);
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
return __hsub2(a, b);
#else
@@ -151,9 +211,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
float r2 = a2 - b2;
return __floats2half2_rn(r1, r2);
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hneg2(a);
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
return __hneg2(a);
#else
@@ -161,11 +229,19 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
float a2 = __high2float(a);
return __floats2half2_rn(-a1, -a2);
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hmul2(a, b);
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
return __hmul2(a, b);
#else
@@ -177,9 +253,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hfma2(a, b, c);
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
return __hfma2(a, b, c);
#else
@@ -193,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
float r2 = a2 * b2 + c2;
return __floats2half2_rn(r1, r2);
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+ return h2div(a, b);
+#else
+ return __h2div(a, b);
+#endif
+
+#else // EIGEN_CUDA_ARCH
+
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -203,6 +299,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
@@ -226,6 +324,12 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hadd(__low2half(a), __high2half(a));
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
return __hadd(__low2half(a), __high2half(a));
#else
@@ -233,9 +337,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 + a2));
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ __half first = __low2half(a);
+ __half second = __high2half(a);
+ return __hgt(first, second) ? first : second;
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
@@ -245,9 +359,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
float a2 = __high2float(a);
return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ __half first = __low2half(a);
+ __half second = __high2half(a);
+ return __hlt(first, second) ? first : second;
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
@@ -257,9 +381,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
float a2 = __high2float(a);
return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ return __hmul(__low2half(a), __high2half(a));
+
+#else // EIGEN_CUDA_ARCH
+
#if EIGEN_CUDA_ARCH >= 530
return __hmul(__low2half(a), __high2half(a));
#else
@@ -267,6 +399,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 * a2));
#endif
+
+#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
@@ -285,7 +419,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+ defined(EIGEN_HIP_DEVICE_COMPILE)
template<> __device__ EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {
@@ -362,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 0,
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -415,6 +550,21 @@ template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet1
}
template<> EIGEN_STRONG_INLINE Packet16h
+ploaddup<Packet16h>(const Eigen::half* from) {
+ Packet16h result;
+ unsigned short a = from[0].x;
+ unsigned short b = from[1].x;
+ unsigned short c = from[2].x;
+ unsigned short d = from[3].x;
+ unsigned short e = from[4].x;
+ unsigned short f = from[5].x;
+ unsigned short g = from[6].x;
+ unsigned short h = from[7].x;
+ result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+ return result;
+}
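The lane layout produced by the new ploaddup, shown for clarity (illustration only):

    // from           = {h0, h1, h2, h3, h4, h5, h6, h7}
    // ploaddup(from) = {h0, h0, h1, h1, h2, h2, h3, h3, h4, h4, h5, h5, h6, h6, h7, h7}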
+
+template<> EIGEN_STRONG_INLINE Packet16h
ploadquad(const Eigen::half* from) {
Packet16h result;
unsigned short a = from[0].x;
@@ -486,6 +636,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+ // FIXME we could do that with bit manipulation
+ Packet16f af = half2float(a);
+ Packet16f rf = pnegate(af);
+ return float2half(rf);
+}
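A sketch of the bit-manipulation alternative the FIXME hints at: IEEE fp16 negation only flips the sign bit, so a single vector XOR over the __m256i payload would do (assumes AVX2; not part of the patch):

    #include <immintrin.h>

    static inline __m256i fp16_negate_bits_sketch(__m256i a) {
      return _mm256_xor_si256(a, _mm256_set1_epi16((short)0x8000));  // flip 16 sign bits at once
    }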
+
template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
@@ -493,6 +650,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, con
return float2half(rf);
}
+template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = psub(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
@@ -505,6 +669,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
return half(predux(from_float));
}
+template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+ Packet16f from_float = half2float(from);
+ return half(predux_mul(from_float));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
+ Packet16f pf[16];
+ pf[0] = half2float(p[0]);
+ pf[1] = half2float(p[1]);
+ pf[2] = half2float(p[2]);
+ pf[3] = half2float(p[3]);
+ pf[4] = half2float(p[4]);
+ pf[5] = half2float(p[5]);
+ pf[6] = half2float(p[6]);
+ pf[7] = half2float(p[7]);
+ pf[8] = half2float(p[8]);
+ pf[9] = half2float(p[9]);
+ pf[10] = half2float(p[10]);
+ pf[11] = half2float(p[11]);
+ pf[12] = half2float(p[12]);
+ pf[13] = half2float(p[13]);
+ pf[14] = half2float(p[14]);
+ pf[15] = half2float(p[15]);
+ Packet16f reduced = preduxp<Packet16f>(pf);
+ return float2half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
+{
+ __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+ Packet16h res;
+ res.x = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
+ _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
+{
+ Packet16h res;
+ res.x = _mm256_insert_epi16(a.x,b.x,0);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
+{
+ Packet16h res;
+ res.x = _mm256_insert_epi16(a.x,b.x,15);
+ return res;
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
{
Packet16h result;
@@ -612,20 +827,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
// NOTE: no unpacklo/hi instr in this case, so using permute instr.
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
- __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
- __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
- __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
- __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
- __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
- __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
- __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
- __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
- __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
- __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
- __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
- __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
- __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
- __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+ __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+ __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+ __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+ __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+ __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+ __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+ __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+ __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+ __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+ __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+ __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+ __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+ __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+ __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
kernel.packet[0].x = a_p_0;
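A note on the reordering above: selector 0x20 of _mm256_permute2x128_si256 combines the low 128-bit lanes of its two inputs and 0x31 the high lanes, so the corrected code routes all low-lane combinations to transpose rows 0-7 and all high-lane combinations to rows 8-15.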
@@ -730,10 +945,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
AlignedOnScalar = 1,
size = 8,
HasHalfPacket = 0,
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@@ -783,6 +998,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const
}
template<> EIGEN_STRONG_INLINE Packet8h
+ploaddup<Packet8h>(const Eigen::half* from) {
+ Packet8h result;
+ unsigned short a = from[0].x;
+ unsigned short b = from[1].x;
+ unsigned short c = from[2].x;
+ unsigned short d = from[3].x;
+ result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
+ return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h
ploadquad<Packet8h>(const Eigen::half* from) {
Packet8h result;
unsigned short a = from[0].x;
@@ -835,6 +1061,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+ // FIXME we could do that with bit manipulation
+ Packet8f af = half2float(a);
+ Packet8f rf = pnegate(af);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
@@ -842,6 +1075,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const
return float2half(rf);
}
+template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = psub(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
@@ -894,6 +1134,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h&
return Eigen::half(reduced);
}
+template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
+ Packet8f pf[8];
+ pf[0] = half2float(p[0]);
+ pf[1] = half2float(p[1]);
+ pf[2] = half2float(p[2]);
+ pf[3] = half2float(p[3]);
+ pf[4] = half2float(p[4]);
+ pf[5] = half2float(p[5]);
+ pf[6] = half2float(p[6]);
+ pf[7] = half2float(p[7]);
+ Packet8f reduced = preduxp<Packet8f>(pf);
+ return float2half(reduced);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
+{
+ __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+ Packet8h res;
+ res.x = _mm_shuffle_epi8(a.x,m);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
+{
+ Packet8h res;
+ res.x = _mm_insert_epi16(a.x,int(b.x),0);
+ return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
+{
+ Packet8h res;
+ res.x = _mm_insert_epi16(a.x,int(b.x),7);
+ return res;
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet8h>
+{
+ static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
+ {
+ if (Offset!=0)
+ first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
+ }
+};
+
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,8>& kernel) {
__m128i a = kernel.packet[0].x;
@@ -1130,4 +1416,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
}
}
-#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
+#endif // EIGEN_PACKET_MATH_HALF_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
index 30f870c3d..57a55d08b 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_TYPE_CASTING_CUDA_H
-#define EIGEN_TYPE_CASTING_CUDA_H
+#ifndef EIGEN_TYPE_CASTING_GPU_H
+#define EIGEN_TYPE_CASTING_GPU_H
namespace Eigen {
@@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(a);
#else
return Eigen::half(a);
@@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(static_cast<float>(a));
#else
return Eigen::half(static_cast<float>(a));
@@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef float result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
- #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+ #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(a);
#else
return static_cast<float>(a);
@@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
template <>
struct type_casting_traits<Eigen::half, float> {
@@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f
} // end namespace Eigen
-#endif // EIGEN_TYPE_CASTING_CUDA_H
+#endif // EIGEN_TYPE_CASTING_GPU_H
diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
new file mode 100644
index 000000000..25375a0a4
--- /dev/null
+++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
@@ -0,0 +1,23 @@
+/*
+ * math_constants.h -
+ * HIP equivalent of the CUDA header of the same name
+ */
+
+#ifndef __MATH_CONSTANTS_H__
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants */
+
+#define HIPRT_INF_F __int_as_float(0x7f800000)
+#define HIPRT_NAN_F __int_as_float(0x7fffffff)
+#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
+#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
+#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
+#define HIPRT_ZERO_F 0.0f
+#define HIPRT_ONE_F 1.0f
+
+/* double precision constants */
+#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
+#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
+
+#endif
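A hypothetical device-side usage sketch for these constants:

    __device__ float safe_reciprocal(float x) {
      return (x == HIPRT_ZERO_F) ? HIPRT_INF_F : 1.0f / x;
    }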
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
new file mode 100644
index 000000000..9a45cf51e
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -0,0 +1,759 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_MSA_H
+#define EIGEN_COMPLEX_MSA_H
+
+#include <iostream>
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf {
+ EIGEN_STRONG_INLINE Packet2cf() {
+ }
+ EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
+ const std::complex<float>& b) {
+ Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
+ v = t;
+ }
+ EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
+ }
+ EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
+ v = b.v;
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
+ return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+ Packet4f v1, v2;
+
+ // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+ v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
+ // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
+ v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
+ // Multiply the real a with b
+ v1 = pmul(v1, b.v);
+ // Multiply the imag a with b
+ v2 = pmul(v2, b.v);
+ // Conjugate v2
+ v2 = Packet2cf(v2).conjugate().v;
+ // Swap real/imag elements in v2.
+ v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+ // Add and return the result
+ v = padd(v1, v2);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
+ return Packet2cf(*this) *= b;
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+ v = padd(v, b.v);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
+ return Packet2cf(*this) += b;
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+ v = psub(v, b.v);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
+ return Packet2cf(*this) -= b;
+ }
+ EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
+ *this *= b.conjugate();
+ Packet4f s = pmul<Packet4f>(b.v, b.v);
+ s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ v = pdiv(v, s);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
+ return Packet2cf(*this) /= b;
+ }
+ EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
+ return Packet2cf(pnegate(v));
+ }
+
+ Packet4f v;
+};
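A host-only sanity sketch of the lane algebra above (no MSA needed): multiplication splits a into broadcast real and imaginary parts, and division multiplies by the conjugate and scales by 1/|b|^2:

    #include <cassert>
    #include <complex>

    int main() {
      std::complex<float> a(1, 2), b(3, 4);
      assert(a * b == std::complex<float>(-5, 10));  // re = 1*3 - 2*4, im = 1*4 + 2*3
      // Division via the conjugate trick used in operator/=:
      std::complex<float> q = (a * std::conj(b)) / std::norm(b);  // (0.44, 0.08)
      (void)q;
      return 0;
    }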
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
+  os << "[ (" << value.v[0] << ", " << value.v[1] << "i), ("
+     << value.v[2] << ", " << value.v[3] << "i) ]";
+  return os;
+}
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+ typedef Packet2cf type;
+ typedef Packet2cf half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 2,
+ HasHalfPacket = 0,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0,
+ HasBlend = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet2cf> {
+ typedef std::complex<float> type;
+ enum { size = 2, alignment = Aligned16 };
+ typedef Packet2cf half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+ EIGEN_MSA_DEBUG;
+
+ float f0 = from.real(), f1 = from.imag();
+ Packet4f v0 = { f0, f0, f0, f0 };
+ Packet4f v1 = { f1, f1, f1, f1 };
+ return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_MSA_DEBUG;
+
+ return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
+ const Packet2cf& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
+ const Packet2cf& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
+ const std::complex<float>* from, Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf(from[0 * stride], from[1 * stride]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
+ const Packet2cf& from,
+ Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ *to = std::complex<float>(from.v[0], from.v[1]);
+ to += stride;
+ *to = std::complex<float>(from.v[2], from.v[3]);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+ EIGEN_MSA_DEBUG;
+
+ prefetch(reinterpret_cast<const float*>(addr));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return std::complex<float>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f value = (Packet4f)preverse((Packet2d)a.v);
+ value += a.v;
+ return std::complex<float>(value[0], value[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f sum1, sum2, sum;
+
+ // Pair the low complex elements of vecs[0] and vecs[1] in sum1 and the high ones in sum2, then add.
+ sum1 = (Packet4f)__builtin_msa_ilvr_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
+ sum2 = (Packet4f)__builtin_msa_ilvl_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
+ sum = padd(sum1, sum2);
+
+ return Packet2cf(sum);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+ EIGEN_MSA_DEBUG;
+
+ return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
+ (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
+}
+
+template <int Offset>
+struct palign_impl<Offset, Packet2cf> {
+ EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) {
+ if (Offset == 1) {
+ first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8);
+ }
+ }
+};
+
+template <>
+struct conj_helper<Packet2cf, Packet2cf, false, true> {
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+ const Packet2cf& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template <>
+struct conj_helper<Packet2cf, Packet2cf, true, false> {
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+ const Packet2cf& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template <>
+struct conj_helper<Packet2cf, Packet2cf, true, true> {
+ EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
+ const Packet2cf& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a / b;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
+ os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
+ return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f tmp =
+ (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+ kernel.packet[0].v =
+ (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+ kernel.packet[1].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+ const Packet2cf& elsePacket) {
+ return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
+ (Packet2d)elsePacket.v);
+}
+
+//---------- double ----------
+
+struct Packet1cd {
+ EIGEN_STRONG_INLINE Packet1cd() {
+ }
+ EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
+ v[0] = std::real(a);
+ v[1] = std::imag(a);
+ }
+ EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
+ }
+ EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
+ v = b.v;
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
+ static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
+ return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+ Packet2d v1, v2;
+
+ // Get the real values of a | a1_re | a1_re
+ v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
+ // Get the imag values of a | a1_im | a1_im
+ v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
+ // Multiply the real a with b
+ v1 = pmul(v1, b.v);
+ // Multiply the imag a with b
+ v2 = pmul(v2, b.v);
+ // Conjugate v2
+ v2 = Packet1cd(v2).conjugate().v;
+ // Swap real/imag elements in v2.
+ v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+ // Add and return the result
+ v = padd(v1, v2);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
+ return Packet1cd(*this) *= b;
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+ v = padd(v, b.v);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
+ return Packet1cd(*this) += b;
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+ v = psub(v, b.v);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
+ return Packet1cd(*this) -= b;
+ }
+ EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
+ *this *= b.conjugate();
+ Packet2d s = pmul<Packet2d>(b.v, b.v);
+ s = padd(s, preverse<Packet2d>(s));
+ v = pdiv(v, s);
+ return *this;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
+ return Packet1cd(*this) /= b;
+ }
+ EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
+ return Packet1cd(pnegate(v));
+ }
+
+ Packet2d v;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
+ os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
+ return os;
+}
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+ typedef Packet1cd type;
+ typedef Packet1cd half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 0,
+ size = 1,
+ HasHalfPacket = 0,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0
+ };
+};
+
+template <>
+struct unpacket_traits<Packet1cd> {
+ typedef std::complex<double> type;
+ enum { size = 1, alignment = Aligned16 };
+ typedef Packet1cd half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_MSA_DEBUG;
+
+ return pset1<Packet1cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
+ const Packet1cd& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
+ const Packet1cd& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+ EIGEN_MSA_DEBUG;
+
+ prefetch(reinterpret_cast<const double*>(addr));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
+ const std::complex<double>* from, Index stride __attribute__((unused))) {
+ EIGEN_MSA_DEBUG;
+
+ Packet1cd res;
+ res.v[0] = std::real(from[0]);
+ res.v[1] = std::imag(from[0]);
+ return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
+ const Packet1cd& from,
+ Index stride
+ __attribute__((unused))) {
+ EIGEN_MSA_DEBUG;
+
+ pstore(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return std::complex<double>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
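+ // A Packet1cd holds a single complex number, so there is nothing to reverse.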
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ return vecs[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+ EIGEN_MSA_DEBUG;
+
+ return pfirst(a);
+}
+
+template <int Offset>
+struct palign_impl<Offset, Packet1cd> {
+ static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) {
+ // FIXME: is it certain that we never have to align a Packet1cd?
+ // Even though a std::complex<double> occupies 16 bytes, it is not
+ // necessarily aligned on a 16-byte boundary...
+ }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, false, true> {
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+ const Packet1cd& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, true, false> {
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+ const Packet1cd& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, true, true> {
+ EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+ const Packet1cd& c) const {
+ return padd(pmul(x, y), c);
+ }
+
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+ return pconj(internal::pmul(a, b));
+ }
+};
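+
+// The three conj_helper specializations above compute a * conj(b),
+// conj(a) * b and conj(a * b), respectively, as selected by the two boolean
+// template parameters. Illustrative sketch (the names a, b and acc are just
+// examples):
+//   Packet1cd acc = pset1<Packet1cd>(std::complex<double>(0.0, 0.0));
+//   acc = conj_helper<Packet1cd, Packet1cd, false, true>().pmadd(a, b, acc);
+//   // acc now holds a * conj(b).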
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ EIGEN_MSA_DEBUG;
+
+ return a / b;
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+ EIGEN_MSA_DEBUG;
+
+ return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
+ os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
+ return os;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d v1, v2;
+
+ // Interleave the real parts (the even-indexed doubles) of the two packets.
+ v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+ // Interleave the imaginary parts (the odd-indexed doubles) of the two packets.
+ v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+
+ kernel.packet[0].v = v1;
+ kernel.packet[1].v = v2;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_MSA_H
diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
new file mode 100644
index 000000000..98e23e36f
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+/* The tanh function of this file is an adaptation of
+ * template<typename T> T generic_fast_tanh_float(const T&)
+ * from MathFunctionsImpl.h.
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
+#define EIGEN_MATH_FUNCTIONS_MSA_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+plog<Packet4f>(const Packet4f& _x) {
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+ // Convert negative argument into NAN (quiet negative, to be specific).
+ Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
+ Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
+ Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
+ Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN.
+ Packet4f x = non_neg_x_or_nan;
+
+ // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
+ // N.B. the exponent is one less than what frexpf() would return.
+ Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
+ // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
+ x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
+
+ /*
+ if (x < SQRTHF) {
+ x = x + x - 1.0;
+ } else {
+ e += 1;
+ x = x - 1.0;
+ }
+ */
+ Packet4f xx = padd(x, x);
+ Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
+ e_int = psub(e_int, ge_mask);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
+ x = psub(x, p4f_1);
+ Packet4f e = __builtin_msa_ffint_s_w(e_int);
+
+ Packet4f x2 = pmul(x, x);
+ Packet4f x3 = pmul(x2, x);
+
+ Packet4f y, y1, y2;
+ y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+ y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+ y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+ y = pmadd(y, x, p4f_cephes_log_p2);
+ y1 = pmadd(y1, x, p4f_cephes_log_p5);
+ y2 = pmadd(y2, x, p4f_cephes_log_p8);
+ y = pmadd(y, x3, y1);
+ y = pmadd(y, x3, y2);
+ y = pmul(y, x3);
+
+ y = pmadd(e, p4f_cephes_log_q1, y);
+ x = __builtin_msa_fmsub_w(x, x2, p4f_half);
+ x = padd(x, y);
+ x = pmadd(e, p4f_cephes_log_q2, x);
+
+ // x is now the logarithm result candidate. We still need to handle the
+ // extreme arguments of zero and positive infinity, though.
+ // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
+ // contain infinities of both signs (see the coefficients and code above).
+ // INFINITY - INFINITY is NAN.
+
+ // If the argument is +INFINITY, make it the new result candidate.
+ // To achieve that we choose the smaller of the result candidate and the
+ // argument.
+ // This is correct for all finite pairs of values (the logarithm is smaller
+ // than the argument).
+ // This is also correct in the special case when the argument is +INFINITY
+ // and the result candidate is NAN. This is because the fmin.df instruction
+ // prefers non-NANs to NANs.
+ x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
+
+ // If the argument is zero (including -0.0), the result becomes -INFINITY.
+ Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
+
+ return x;
+}
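+
+// N.B. in plog above, cephes_log_q2 + cephes_log_q1 == 0.693359375 -
+// 2.12194440e-4 == log(2), so the tail of the function effectively computes
+// log(x) = log(mantissa) + exponent * log(2), with log(2) split into a high
+// and a low part to limit rounding error.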
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pexp<Packet4f>(const Packet4f& _x) {
+ // Limiting single-precision pexp's argument to [-128, +128] lets pexp
+ // reach 0 and INFINITY naturally.
+ static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
+ static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+ Packet4f x = _x;
+
+ // Clamp x.
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
+ (v16u8)p4f_exp_lo);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
+ (v16u8)p4f_exp_hi);
+
+ // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+ Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
+ Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
+ Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
+ Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
+
+ x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
+ x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
+
+ Packet4f z = pmul(x, x);
+
+ Packet4f y = p4f_cephes_exp_p0;
+ y = pmadd(y, x, p4f_cephes_exp_p1);
+ y = pmadd(y, x, p4f_cephes_exp_p2);
+ y = pmadd(y, x, p4f_cephes_exp_p3);
+ y = pmadd(y, x, p4f_cephes_exp_p4);
+ y = pmadd(y, x, p4f_cephes_exp_p5);
+ y = pmadd(y, z, x);
+ y = padd(y, p4f_1);
+
+ // y *= 2**exponent.
+ y = __builtin_msa_fexp2_w(y, x2_int);
+
+ return y;
+}
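+
+// N.B. pexp above follows the classic Cephes scheme: n = round(x * log2(e)),
+// r = x - n * log(2) (with log(2) split into exp_C1 + exp_C2 for accuracy),
+// exp(r) approximated by a polynomial in r, and the result scaled by 2**n
+// with a single fexp2.w.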
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& _x) {
+ static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
+ // The monomial coefficients of the numerator polynomial (odd).
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
+ static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
+ // The monomial coefficients of the denominator polynomial (even).
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
+
+ Packet4f x = pabs(_x);
+ Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
+
+ // Clamp the inputs to the range [-9, 9] since anything outside
+ // this range is -/+1.0f in single-precision.
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
+ (v16u8)p4f_tanh_hi);
+
+ // Since the polynomials are odd/even, we need x**2.
+ Packet4f x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial p.
+ Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
+ p = pmadd(x2, p, p4f_alpha_9);
+ p = pmadd(x2, p, p4f_alpha_7);
+ p = pmadd(x2, p, p4f_alpha_5);
+ p = pmadd(x2, p, p4f_alpha_3);
+ p = pmadd(x2, p, p4f_alpha_1);
+ p = pmul(x, p);
+
+ // Evaluate the denominator polynomial q.
+ Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
+ q = pmadd(x2, q, p4f_beta_2);
+ q = pmadd(x2, q, p4f_beta_0);
+
+ // Divide the numerator by the denominator.
+ p = pdiv(p, q);
+
+ // Reinstate the sign.
+ p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
+
+ // When the argument is very small in magnitude it's more accurate to just return it.
+ p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
+
+ return p;
+}
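+
+// N.B. ptanh above evaluates tanh(x) as the rational approximation
+// x * P(x^2) / Q(x^2) on the clamped range [-9, +9]; the sign of the
+// original argument is reinstated with a sign-bit insert, and very small
+// arguments are returned unchanged since tanh(x) ~= x near zero.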
+
+template <bool sine>
+Packet4f psincos_inner_msa_float(const Packet4f& _x) {
+ static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi).
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
+ static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
+ static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi.
+ static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+ static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+ Packet4f x = pabs(_x);
+
+ // Translate infinite arguments into NANs.
+ Packet4f zero_or_nan_if_inf = psub(_x, _x);
+ x = padd(x, zero_or_nan_if_inf);
+ // Prevent sin/cos from generating values larger than 1.0 in magnitude
+ // for very large arguments by setting x to 0.0.
+ Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
+ x = pand(x, (Packet4f)small_or_nan_mask);
+
+ // Scale x by 4/Pi to find x's octant.
+ Packet4f y = pmul(x, p4f_cephes_FOPI);
+ // Get the octant. We'll reduce x by this number of octants or by one more than it.
+ Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
+ // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
+ // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
+ // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
+ Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
+ Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
+ y = __builtin_msa_ffint_s_w(y_int2);
+
+ // Compute the sign to apply to the polynomial.
+ Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
+ : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
+
+ // Get the polynomial selection mask.
+ // We'll calculate both (sin and cos) polynomials and then select from the two.
+ Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
+
+ // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
+ // The magic pass: "Extended precision modular arithmetic"
+ // x = ((x - y * DP1) - y * DP2) - y * DP3
+ Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
+ Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
+ Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
+ x = padd(x, tmp1);
+ x = padd(x, tmp2);
+ x = padd(x, tmp3);
+
+ // Evaluate the cos(x) polynomial.
+ y = p4f_coscof_p0;
+ Packet4f z = pmul(x, x);
+ y = pmadd(y, z, p4f_coscof_p1);
+ y = pmadd(y, z, p4f_coscof_p2);
+ y = pmul(y, z);
+ y = pmul(y, z);
+ y = __builtin_msa_fmsub_w(y, z, p4f_half);
+ y = padd(y, p4f_1);
+
+ // Evaluate the sin(x) polynomial.
+ Packet4f y2 = p4f_sincof_p0;
+ y2 = pmadd(y2, z, p4f_sincof_p1);
+ y2 = pmadd(y2, z, p4f_sincof_p2);
+ y2 = pmul(y2, z);
+ y2 = pmadd(y2, x, x);
+
+ // Select the correct result from the two polynomials.
+ y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
+ : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
+
+ // Update the sign.
+ sign_mask = pxor(sign_mask, (Packet4i)y);
+ y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
+ return y;
+}
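+
+// N.B. psincos_inner_msa_float follows the usual Cephes recipe: pick the
+// octant from x * 4/Pi, reduce x to [-Pi/4, +Pi/4] with a three-part
+// representation of Pi/4 (DP1/DP2/DP3), evaluate both the sin and cos
+// polynomials, and select the polynomial and the sign from the octant.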
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+psin<Packet4f>(const Packet4f& x) {
+ return psincos_inner_msa_float</* sine */ true>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+pcos<Packet4f>(const Packet4f& x) {
+ return psincos_inner_msa_float</* sine */ false>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
+pexp<Packet2d>(const Packet2d& _x) {
+ // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
+ // reach 0 and INFINITY naturally.
+ static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
+ static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+ static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+ static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+ static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
+
+ Packet2d x = _x;
+
+ // Clamp x.
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
+ (v16u8)p2d_exp_lo);
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
+ (v16u8)p2d_exp_hi);
+
+ // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+ Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
+ Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
+ Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
+ Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
+
+ x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
+ x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
+
+ x2 = pmul(x, x);
+
+ Packet2d px = p2d_cephes_exp_p0;
+ px = pmadd(px, x2, p2d_cephes_exp_p1);
+ px = pmadd(px, x2, p2d_cephes_exp_p2);
+ px = pmul(px, x);
+
+ Packet2d qx = p2d_cephes_exp_q0;
+ qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+ qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+ qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+ x = pdiv(px, psub(qx, px));
+ x = pmadd(p2d_2, x, p2d_1);
+
+ // x *= 2**exponent.
+ x = __builtin_msa_fexp2_d(x, x2_long);
+
+ return x;
+}
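+
+// N.B. the double-precision pexp above uses a Pade-style rational
+// approximation instead of a plain polynomial: with reduced argument r,
+// px = r * P(r^2) and qx = Q(r^2), it computes
+// exp(r) ~= 1 + 2 * px / (qx - px), then scales by 2**n via fexp2.d.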
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_MSA_H
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
new file mode 100644
index 000000000..94c15d132
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -0,0 +1,1317 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+// Chris Larsen
+// Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_MSA_H
+#define EIGEN_PACKET_MATH_MSA_H
+
+#include <iostream>
+#include <string>
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+#if 0
+#define EIGEN_MSA_DEBUG \
+ static bool firstTime = true; \
+ do { \
+ if (firstTime) { \
+ std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
+ firstTime = false; \
+ } \
+ } while (0)
+#else
+#define EIGEN_MSA_DEBUG
+#endif
+
+#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
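+// The macro above packs four 2-bit element indices (a, b, c, d for result
+// elements 0, 1, 2, 3) into the 8-bit immediate expected by shf.w; e.g.
+// EIGEN_MSA_SHF_I8(2, 3, 0, 1) swaps the 64-bit halves of a 4 x 32-bit
+// vector.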
+
+typedef v4f32 Packet4f;
+typedef v4i32 Packet4i;
+typedef v4u32 Packet4ui;
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
+#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
+ os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
+ os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
+ os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+ return os;
+}
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+ typedef Packet4f type;
+ typedef Packet4f half; // Packet2f intrinsics not implemented yet
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 4,
+ HasHalfPacket = 0, // Packet2f intrinsics not implemented yet
+ // FIXME check the Has*
+ HasDiv = 1,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasTanh = EIGEN_FAST_MATH,
+ HasLog = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasBlend = 1
+ };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+ typedef Packet4i type;
+ typedef Packet4i half; // Packet2i intrinsics not implemented yet
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 4,
+ HasHalfPacket = 0, // Packet2i intrinsics not implemented yet
+ // FIXME check the Has*
+ HasDiv = 1,
+ HasBlend = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet4f> {
+ typedef float type;
+ enum { size = 4, alignment = Aligned16 };
+ typedef Packet4f half;
+};
+
+template <>
+struct unpacket_traits<Packet4i> {
+ typedef int32_t type;
+ enum { size = 4, alignment = Aligned16 };
+ typedef Packet4i half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f v = { from, from, from, from };
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fill_w(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+ EIGEN_MSA_DEBUG;
+
+ float f = *from;
+ Packet4f v = { f, f, f, f };
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fill_w(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fadd_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_addv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+ EIGEN_MSA_DEBUG;
+
+ static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
+ return padd(pset1<Packet4f>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+ EIGEN_MSA_DEBUG;
+
+ static const Packet4i countdown = { 0, 1, 2, 3 };
+ return padd(pset1<Packet4i>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fsub_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_subv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fmul_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_mulv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fdiv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_div_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fmadd_w(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+ EIGEN_MSA_DEBUG;
+
+ // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
+ Packet4i value = c;
+ __asm__("maddv.w %w[value], %w[a], %w[b]\n"
+ // Outputs
+ : [value] "+f"(value)
+ // Inputs
+ : [a] "f"(a), [b] "f"(b));
+ return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+ return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ // This prefers numbers to NaNs.
+ return __builtin_msa_fmin_w(a, b);
+#else
+ // This prefers NaNs to numbers.
+ Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+ Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
+ return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_min_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ // This prefers numbers to NaNs.
+ return __builtin_msa_fmax_w(a, b);
+#else
+ // This prefers NaNs to numbers.
+ Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+ Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
+ return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_max_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
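+// ploaddup loads two scalars and duplicates each of them in place, i.e. the
+// result is { from[0], from[0], from[1], from[1] }.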
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+ EIGEN_MSA_DEBUG;
+
+ float f0 = from[0], f1 = from[1];
+ Packet4f v0 = { f0, f0, f0, f0 };
+ Packet4f v1 = { f1, f1, f1, f1 };
+ return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+ EIGEN_MSA_DEBUG;
+
+ int32_t i0 = from[0], i1 = from[1];
+ Packet4i v0 = { i0, i0, i0, i0 };
+ Packet4i v1 = { i1, i1, i1, i1 };
+ return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ float f = *from;
+ Packet4f v = { f, f, f, f };
+ v[1] = from[stride];
+ v[2] = from[2 * stride];
+ v[3] = from[3 * stride];
+ return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ int32_t i = *from;
+ Packet4i v = { i, i, i, i };
+ v[1] = from[stride];
+ v[2] = from[2 * stride];
+ v[3] = from[3 * stride];
+ return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
+ Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ *to = from[0];
+ to += stride;
+ *to = from[1];
+ to += stride;
+ *to = from[2];
+ to += stride;
+ *to = from[3];
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+ Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ *to = from[0];
+ to += stride;
+ *to = from[1];
+ to += stride;
+ *to = from[2];
+ to += stride;
+ *to = from[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+ EIGEN_MSA_DEBUG;
+
+ __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+ EIGEN_MSA_DEBUG;
+
+ __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i zero = __builtin_msa_ldi_w(0);
+ return __builtin_msa_add_a_w(zero, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return s[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ v4i32 tmp1, tmp2, tmp3, tmp4;
+ Packet4f sum;
+
+ tmp1 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]);
+ tmp2 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]);
+ tmp3 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]);
+ tmp4 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]);
+
+ sum = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+ sum = padd(sum, (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1));
+ sum = padd(sum, (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3));
+ sum = padd(sum, (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3));
+
+ return sum;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ v4i32 tmp1, tmp2, tmp3, tmp4;
+ Packet4i sum;
+
+ tmp1 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]);
+ tmp2 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]);
+ tmp3 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]);
+ tmp4 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]);
+
+ sum = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+ sum = padd(sum, (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1));
+ sum = padd(sum, (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3));
+ sum = padd(sum, (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3));
+
+ return sum;
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return s[0];
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return p[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ // Swap 64-bit halves of a.
+ Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+ // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+ // masks of all zeroes/ones in low 64 bits.
+ v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+ // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+ unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+ // Continue with min computation.
+ Packet4f v = __builtin_msa_fmin_w(a, swapped);
+ v = __builtin_msa_fmin_w(
+ v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+ // Based on the mask, select between v and 4 qNaNs.
+ v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+ v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+ return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return m[0];
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ // Swap 64-bit halves of a.
+ Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+ // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+ // masks of all zeroes/ones in low 64 bits.
+ v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+ // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+ unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+ // Continue with max computation.
+ Packet4f v = __builtin_msa_fmax_w(a, swapped);
+ v = __builtin_msa_fmax_w(
+ v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+ // Based on the mask, select between v and 4 qNaNs.
+ v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+ v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+ return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+ m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ return m[0];
+}
+
+#define PALIGN_MSA(Offset, Type, Command) \
+ template <> \
+ struct palign_impl<Offset, Type> { \
+ EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \
+ if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 4)); \
+ } \
+ };
+
+PALIGN_MSA(0, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(2, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(3, Packet4f, __builtin_msa_sldi_b)
+PALIGN_MSA(0, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(2, Packet4i, __builtin_msa_sldi_b)
+PALIGN_MSA(3, Packet4i, __builtin_msa_sldi_b)
+
+#undef PALIGN_MSA
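+
+// N.B. per Eigen's palign contract, palign<Offset>(first, second) shifts
+// first down by Offset elements and fills the tail from the front of second;
+// e.g. for Packet4f and Offset == 1 the result is
+// { first[1], first[2], first[3], second[0] }.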
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
+ os << "[ " << value.packet[0] << "," << std::endl
+ << " " << value.packet[1] << "," << std::endl
+ << " " << value.packet[2] << "," << std::endl
+ << " " << value.packet[3] << " ]";
+ return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ v4i32 tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+ tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+ tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+ tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+
+ kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+ kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+ kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+ kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
+ os << "[ " << value.packet[0] << "," << std::endl
+ << " " << value.packet[1] << "," << std::endl
+ << " " << value.packet[2] << "," << std::endl
+ << " " << value.packet[3] << " ]";
+ return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ v4i32 tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
+ tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
+ tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
+ tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
+
+ kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+ kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+ kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+ kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fsqrt_w(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ return __builtin_msa_frsqrt_w(a);
+#else
+ Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
+ return pdiv(ones, psqrt(a));
+#endif
+}
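+
+// pfloor, pceil and pround below have no single-instruction MSA equivalent
+// for a fixed rounding mode, so each one temporarily rewrites the
+// rounding-mode (RM) bits of the MSACSR control register with cfcmsa/ctcmsa,
+// rounds with frint.w, then restores the original mode.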
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.w %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.w %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+ Packet4f v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.w %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+ const Packet4f& elsePacket) {
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+ ifPacket.select[3] };
+ Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+ return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+ const Packet4i& elsePacket) {
+ Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
+ ifPacket.select[3] };
+ Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+ return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
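+
+// Illustrative pblend semantics: with Selector<4> s = { { 1, 0, 1, 0 } },
+// pblend(s, t, e) yields { t[0], e[1], t[2], e[3] } -- a nonzero selector
+// entry picks the corresponding element of the "then" packet.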
+
+//---------- double ----------
+
+typedef v2f64 Packet2d;
+typedef v2i64 Packet2l;
+typedef v2u64 Packet2ul;
+
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
+#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
+ os << "[ " << value[0] << ", " << value[1] << " ]";
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
+ os << "[ " << value[0] << ", " << value[1] << " ]";
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
+ os << "[ " << value[0] << ", " << value[1] << " ]";
+ return os;
+}
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+ typedef Packet2d type;
+ typedef Packet2d half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 2,
+ HasHalfPacket = 0,
+ // FIXME check the Has*
+ HasDiv = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasBlend = 1
+ };
+};
+
+template <>
+struct unpacket_traits<Packet2d> {
+ typedef double type;
+ enum { size = 2, alignment = Aligned16 };
+ typedef Packet2d half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d value = { from, from };
+ return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fadd_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+ EIGEN_MSA_DEBUG;
+
+ static const Packet2d countdown = { 0.0, 1.0 };
+ return padd(pset1<Packet2d>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fsub_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fmul_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fdiv_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fmadd_d(c, a, b);
+}
+
+// Logical operations are not supported for floating-point types, so we have
+// to reinterpret the operands as integer vectors and use MSA integer intrinsics.
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+ return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ // This prefers numbers to NaNs.
+ return __builtin_msa_fmin_d(a, b);
+#else
+ // This prefers NaNs to numbers.
+ v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+ v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
+ return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ // This prefers numbers to NaNs.
+ return __builtin_msa_fmax_d(a, b);
+#else
+ // This prefers NaNs to numbers.
+ v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+ v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
+ return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d value = { *from, *from };
+ return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+ EIGEN_MSA_DEBUG;
+
+ EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d value;
+ value[0] = *from;
+ from += stride;
+ value[1] = *from;
+ return value;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
+ Index stride) {
+ EIGEN_MSA_DEBUG;
+
+ *to = from[0];
+ to += stride;
+ *to = from[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+ EIGEN_MSA_DEBUG;
+
+ __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d s = padd(a, preverse(a));
+ return s[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d v0 = (Packet2d)__builtin_msa_ilvev_d((v2i64)vecs[1], (v2i64)vecs[0]);
+ Packet2d v1 = (Packet2d)__builtin_msa_ilvod_d((v2i64)vecs[1], (v2i64)vecs[0]);
+
+ return padd(v0, v1);
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d p = pmul(a, preverse(a));
+ return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+ Packet2d v = __builtin_msa_fmin_d(a, swapped);
+ return v[0];
+#else
+ double a0 = a[0], a1 = a[1];
+ return ((std::isnan)(a0) || a0 < a1) ? a0 : a1;
+#endif
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+ Packet2d v = __builtin_msa_fmax_d(a, swapped);
+ return v[0];
+#else
+ double a0 = a[0], a1 = a[1];
+ return ((std::isnan)(a0) || a0 > a1) ? a0 : a1;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+ return __builtin_msa_fsqrt_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+ EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+ return __builtin_msa_frsqrt_d(a);
+#else
+ Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
+ return pdiv(ones, psqrt(a));
+#endif
+}
+
+#define PALIGN_MSA(Offset, Type, Command) \
+ template <> \
+ struct palign_impl<Offset, Type> { \
+ EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \
+ if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 8)); \
+ } \
+ };
+
+PALIGN_MSA(0, Packet2d, __builtin_msa_sldi_b)
+PALIGN_MSA(1, Packet2d, __builtin_msa_sldi_b)
+
+#undef PALIGN_MSA
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
+ os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]";
+ return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+ EIGEN_MSA_DEBUG;
+
+ Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+ Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+ kernel.packet[0] = trn1;
+ kernel.packet[1] = trn2;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+ Packet2d v = a;
+ int32_t old_mode, new_mode;
+ asm volatile(
+ "cfcmsa %[old_mode], $1\n"
+ "ori %[new_mode], %[old_mode], 3\n"
+ "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even.
+ "ctcmsa $1, %[new_mode]\n"
+ "frint.d %w[v], %w[v]\n"
+ "ctcmsa $1, %[old_mode]\n"
+ : // outputs
+ [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+ [v] "+f"(v)
+ : // inputs
+ : // clobbers
+ );
+ return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+ const Packet2d& elsePacket) {
+ Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+ Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
+ return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_MSA_H
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index ef50ba303..306a309be 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
float32x2_t r64;
- r64 = vld1_f32((float *)&from);
+ r64 = vld1_f32((const float *)&from);
return Packet2cf(vcombine_f32(r64, r64));
}
@@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -277,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
s = vmulq_f32(b.v, b.v);
rev_s = vrev64q_f32(s);
- return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+ return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
}
EIGEN_DEVICE_FUNC inline void
@@ -383,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 6283b6cfa..010739380 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -36,12 +36,43 @@ namespace internal {
#endif
#endif
+#if EIGEN_COMP_MSVC
+
+// In MSVC's arm_neon.h header file, all NEON vector types
+// are aliases to the same underlying type __n128.
+// We thus have to wrap them to make them different C++ types.
+// (See also bug 1428)
+
+template<typename T,int unique_id>
+struct eigen_packet_wrapper
+{
+ operator T&() { return m_val; }
+ operator const T&() const { return m_val; }
+ eigen_packet_wrapper() {}
+ eigen_packet_wrapper(const T &v) : m_val(v) {}
+ eigen_packet_wrapper& operator=(const T &v) {
+ m_val = v;
+ return *this;
+ }
+
+ T m_val;
+};
+typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
+typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
+typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
+typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
+typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
+
+#else
+
typedef float32x2_t Packet2f;
typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i;
typedef int32x2_t Packet2i;
typedef uint32x4_t Packet4ui;
+#endif // EIGEN_COMP_MSVC
+
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
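
The wrapper above works around MSVC collapsing every NEON vector type into __n128: template specializations such as pset1<Packet4f> and pset1<Packet4ui> must name distinct C++ types, otherwise they become redefinitions of one another. A self-contained sketch of the failure mode and the fix (the *_sim typedefs below are stand-ins, not the real arm_neon.h aliases):

    typedef int __n128_sim;            // stands in for MSVC's single NEON carrier type
    typedef __n128_sim float32x4_sim;  // on MSVC, float32x4_t ...
    typedef __n128_sim uint32x4_sim;   // ... and uint32x4_t are the same alias

    template<typename T, int unique_id>
    struct packet_wrapper {
      operator T&() { return m_val; }              // implicit conversions keep
      operator const T&() const { return m_val; }  // intrinsic-style calls working
      T m_val;
    };

    // Specializing templates on the raw aliases would define the same thing twice;
    // the unique_id tag makes each wrapper a distinct C++ type:
    typedef packet_wrapper<float32x4_sim, 0> Packet4fSim;
    typedef packet_wrapper<uint32x4_sim,  1> Packet4uiSim;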
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
new file mode 100644
index 000000000..95d1fd0e4
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_NEON_H
+#define EIGEN_TYPE_CASTING_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return vcvtq_s32_f32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return vcvtq_f32_s32(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_NEON_H
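
The new traits advertise a 1:1 vectorized float<->int cast to Eigen's evaluators, so a cast expression can be lowered to one vcvtq per packet; note that vcvtq_s32_f32 truncates toward zero, matching scalar static_cast<int>. A small usage sketch of what this enables (ordinary user code, nothing NEON-specific in it):

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::Array4f f(1.9f, -2.9f, 3.5f, -4.5f);
      // With the traits above, this cast vectorizes to one vcvtq_s32_f32 on NEON;
      // truncation toward zero yields 1 -2 3 -4.
      Eigen::Array4i i = f.cast<int>();
      std::cout << i.transpose() << std::endl;
      return 0;
    }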
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 23e717f28..d075043ce 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -324,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 7b5f948e1..4af2c6cae 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -242,7 +242,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
}
-/* evaluation of 4 sines at onces, using SSE2 intrinsics.
+/* evaluation of 4 sines at once, using SSE2 intrinsics.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 03c8a2c13..2e815c0c5 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -461,10 +461,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}
+#if EIGEN_COMP_PGI
+typedef const void * SsePrefetchPtrType;
+#else
+typedef const char * SsePrefetchPtrType;
+#endif
+
#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
@@ -657,7 +663,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
- return (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+ return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
// min
@@ -928,4 +934,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
} // end namespace Eigen
+#if EIGEN_COMP_PGI
+// PGI++ does not define the following intrinsics in C++ mode.
+static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
+static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
+static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
+static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
+#endif
+
#endif // EIGEN_PACKET_MATH_SSE_H
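
The PGI shims above rebuild the SSE bitwise-cast intrinsics via reinterpret_cast through a reference, i.e. a pure bit-pattern reinterpretation. That is deliberately different from a value conversion; a short sketch of the distinction (x86 with SSE2 assumed):

    #include <emmintrin.h>

    static inline void cast_vs_convert() {
      __m128i ones = _mm_set1_epi32(1);
      __m128 bits = _mm_castsi128_ps(ones);  // same bits: each lane becomes the denormal ~1.4e-45f
      __m128 vals = _mm_cvtepi32_ps(ones);   // converted values: each lane becomes 1.0f
      (void)bits; (void)vals;                // silence unused-variable warnings
    }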
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index c84893230..c6ca8c716 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -14,6 +14,7 @@ namespace Eigen {
namespace internal {
+#ifndef EIGEN_VECTORIZE_AVX
template <>
struct type_casting_traits<float, int> {
enum {
@@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
- return _mm_cvttps_epi32(a);
-}
-
-
template <>
struct type_casting_traits<int, float> {
enum {
@@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
- return _mm_cvtepi32_ps(a);
-}
-
-
template <>
struct type_casting_traits<double, float> {
enum {
@@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
- return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
-}
-
template <>
struct type_casting_traits<float, double> {
enum {
@@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
TgtCoeffRatio = 2
};
};
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return _mm_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return _mm_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+ return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
// Simply discard the second half of the input
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 1077d8eb0..9765cc763 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
{
-#ifdef EIGEN_CUDACC
+#ifdef EIGEN_GPUCC
// FIXME is there some kind of cuda::swap?
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
#else
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 96747bac7..401d597d8 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -255,7 +255,7 @@ struct scalar_cmp_op<LhsScalar,RhsScalar, cmp_NEQ> : binary_op_base<LhsScalar,Rh
/** \internal
- * \brief Template functor to compute the hypot of two scalars
+ * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
*
* \sa MatrixBase::stableNorm(), class Redux
*/
@@ -263,22 +263,15 @@ template<typename Scalar>
struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
{
EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-// typedef typename NumTraits<Scalar>::Real result_type;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
{
- EIGEN_USING_STD_MATH(sqrt)
- Scalar p, qp;
- if(_x>_y)
- {
- p = _x;
- qp = _y / p;
- }
- else
- {
- p = _y;
- qp = _x / p;
- }
- return p * sqrt(Scalar(1) + qp*qp);
+ // This functor is only used by hypotNorm, for which it is faster to first apply abs
+ // to all coefficients and then reduce through hypot.
+ // That way the functor never has to call abs itself on entries that are already
+ // positive and real, and it seamlessly handles complexes too; otherwise we would
+ // have to handle real and complex inputs through the same functor...
+ return internal::positive_real_hypot(x,y);
}
};
template<typename Scalar>
@@ -443,7 +436,7 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
- bind1st_op(const first_argument_type &val) : m_value(val) {}
+ EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
@@ -462,7 +455,7 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
- bind2nd_op(const second_argument_type &val) : m_value(val) {}
+ EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h
index 6df3fa501..9c1d75850 100644
--- a/Eigen/src/Core/functors/StlFunctors.h
+++ b/Eigen/src/Core/functors/StlFunctors.h
@@ -83,13 +83,17 @@ struct functor_traits<std::binder1st<T> >
{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
#endif
+#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910)
+// std::unary_negate is deprecated since c++17 and will be removed in c++20
template<typename T>
struct functor_traits<std::unary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+// std::binary_negate is deprecated since c++17 and will be removed in c++20
template<typename T>
struct functor_traits<std::binary_negate<T> >
{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+#endif
#ifdef EIGEN_STDEXT_SUPPORT
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 45230bce5..3ec8eb082 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -580,7 +580,7 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
}
template<typename Packet>
-const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
+const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet> &a)
{
return a;
}
@@ -972,7 +972,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
internal::prefetch(blA+(3*K+16)*LhsProgress); \
- if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
@@ -1523,13 +1523,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
- // The following piece of code wont work for 512 bit registers
+ // The following piece of code won't work for 512 bit registers
// Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
// as nr (which is currently 4) for the return type.
- typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
+ const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
if ((SwappedTraits::LhsProgress % 4) == 0 &&
(SwappedTraits::LhsProgress <= 8) &&
- (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
+ (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
{
SAccPacket C0, C1, C2, C3;
straits.initAcc(C0);
@@ -1596,13 +1596,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
SRhsPacketHalf b0;
straits.loadLhsUnaligned(blB, a0);
straits.loadRhs(blA, b0);
- SAccPacketHalf c0 = predux_downto4(C0);
+ SAccPacketHalf c0 = predux_half_dowto4(C0);
straits.madd(a0,b0,c0,b0);
straits.acc(c0, alphav, R);
}
else
{
- straits.acc(predux_downto4(C0), alphav, R);
+ straits.acc(predux_half_dowto4(C0), alphav, R);
}
res.scatterPacket(i, j2, R);
}
@@ -1924,7 +1924,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
// const Scalar* b6 = &rhs[(j2+6)*rhsStride];
// const Scalar* b7 = &rhs[(j2+7)*rhsStride];
// Index k=0;
-// if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4
+// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
// {
// for(; k<peeled_k; k+=PacketSize) {
// PacketBlock<Packet> kernel;
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index ed4d3182b..bd7b6ff2a 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -108,7 +108,7 @@ static void run(Index rows, Index cols, Index depth,
// i.e., we test that info[tid].users equals 0.
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {}
- info[tid].users += threads;
+ info[tid].users = threads;
pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
@@ -146,7 +146,9 @@ static void run(Index rows, Index cols, Index depth,
// Release all the sub blocks A'_i of A' for the current thread,
// i.e., we simply decrement the number of users by 1
for(Index i=0; i<threads; ++i)
+#if !EIGEN_HAS_CXX11_ATOMIC
#pragma omp atomic
+#endif
info[i].users -= 1;
}
}
@@ -431,10 +433,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
// to determine the following heuristic.
// EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
// unless it has been specialized by the user or for a given architecture.
- // Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
+ // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
// I'm not sure it is still required.
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
- lazyproduct::evalTo(dst, lhs, rhs);
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
else
{
dst.setZero();
@@ -446,7 +448,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
- lazyproduct::addTo(dst, lhs, rhs);
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
}
@@ -455,7 +457,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
- lazyproduct::subTo(dst, lhs, rhs);
+ lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
}
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 9176a1382..49565c070 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -37,7 +37,7 @@ namespace Eigen {
namespace internal {
-template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
struct general_matrix_matrix_rankupdate :
general_matrix_matrix_triangular_product<
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
@@ -52,7 +52,7 @@ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,Con
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
{ \
- if ( lhs==rhs && ((UpLo&(Lower|Upper)==UpLo)) ) { \
+ if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
} else { \
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 41d8242e1..767feb99d 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -48,7 +48,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-EIGEN_DONT_INLINE static void run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
@@ -57,7 +57,7 @@ EIGEN_DONT_INLINE static void run(
};
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsMapper& alhs,
const RhsMapper& rhs,
@@ -201,7 +201,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
}
/* Optimized row-major matrix * vector product:
- * This algorithm processes 4 rows at onces that allows to both reduce
+ * This algorithm processes 4 rows at once that allows to both reduce
* the number of load/stores of the result by a factor 4 and to reduce
* the instruction dependency. Moreover, we know that all bands have the
* same alignment pattern.
@@ -231,7 +231,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-EIGEN_DONT_INLINE static void run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsMapper& lhs,
const RhsMapper& rhs,
@@ -240,7 +240,7 @@ EIGEN_DONT_INLINE static void run(
};
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsMapper& alhs,
const RhsMapper& rhs,
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index c2f084c82..92e9b0d9f 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -10,6 +10,10 @@
#ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H
+#if EIGEN_HAS_CXX11_ATOMIC
+#include <atomic>
+#endif
+
namespace Eigen {
namespace internal {
@@ -75,8 +79,17 @@ template<typename Index> struct GemmParallelInfo
{
GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+ // volatile is not enough on all architectures (see bug 1572)
+ // to guarantee that when thread A tells thread B that it is
+ // done packing a block, all of A's writes are actually visible
+ // to B... The C++11 memory model plus atomics guarantees this.
+#if EIGEN_HAS_CXX11_ATOMIC
+ std::atomic<Index> sync;
+ std::atomic<int> users;
+#else
Index volatile sync;
int volatile users;
+#endif
Index lhs_start;
Index lhs_length;
@@ -87,11 +100,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
{
// TODO when EIGEN_USE_BLAS is defined,
// we should still enable OMP for other scalar types
-#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
+ // Without C++11 atomics, we have to disable GEMM's parallelization on
+ // non-x86 architectures because volatile is not enough there for our purpose.
+ // See bug 1572.
+#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
// FIXME the transpose variable is only needed to properly split
// the matrix product when multithreading is enabled. This is a temporary
// fix to support row-major destination matrices. This whole
- // parallelizer mechanism has to be redisigned anyway.
+ // parallelizer mechanism has to be redesigned anyway.
EIGEN_UNUSED_VARIABLE(depth);
EIGEN_UNUSED_VARIABLE(transpose);
func(0,rows, 0,cols);
@@ -117,7 +133,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
// compute the number of threads we are going to use
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
- // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session,
+ // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
// then abort multi-threading
// FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
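
The std::atomic switch above matters because volatile only prevents the compiler from caching the variable; it provides no inter-thread ordering, so a peer thread could observe users change before the packed block's contents become visible. Sequentially consistent atomic operations (the std::atomic default) provide the needed publish/acquire behavior. A minimal standalone sketch of the pattern, not Eigen code:

    #include <atomic>
    #include <cassert>
    #include <thread>

    static int packed_block[4];        // stands in for a packed lhs block
    static std::atomic<int> users(0);  // stands in for GemmParallelInfo::users

    static void packer() {
      for (int k = 0; k < 4; ++k) packed_block[k] = k;  // "pack" the block
      users.store(2);  // publishes the writes above to other threads
    }

    static void consumer() {
      while (users.load() == 0) {}   // spins until the block is published
      assert(packed_block[3] == 3);  // guaranteed with atomics; not with volatile
    }

    int main() {
      std::thread a(packer), b(consumer);
      a.join(); b.join();
      return 0;
    }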
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 3fd180e6c..d38fd72b2 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -15,7 +15,7 @@ namespace Eigen {
namespace internal {
/* Optimized selfadjoint matrix * vector product:
- * This algorithm processes 2 columns at onces that allows to both reduce
+ * This algorithm processes 2 columns at once that allows to both reduce
* the number of load/stores of the result by a factor 2 and to reduce
* the instruction dependency.
*/
@@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
struct selfadjoint_matrix_vector_product
{
-static EIGEN_DONT_INLINE void run(
+static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void run(
Index size,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs,
@@ -36,7 +37,8 @@ static EIGEN_DONT_INLINE void run(
};
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
+void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
Index size,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs,
@@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
-
- Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
+ Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
if (FirstTriangular)
bound = size - bound;
@@ -175,7 +176,8 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
enum { LhsUpLo = LhsMode&(Upper|Lower) };
template<typename Dest>
- static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC
+ void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
{
typedef typename Dest::Scalar ResScalar;
typedef typename Rhs::Scalar RhsScalar;
diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h
index d395888e5..09209f733 100644
--- a/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector;
template<typename Scalar, typename Index, typename UType, typename VType>
struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
{
- static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+ static EIGEN_DEVICE_FUNC
+ void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
{
const Index size = u.size();
for (Index i=0; i<size; ++i)
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index b994759b2..647317016 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
{
// let's directly call the low level product function because:
// 1 - it is faster to compile
- // 2 - it is slighlty faster at runtime
+ // 2 - it is slightly faster at runtime
Index startRow = IsLower ? pi : pi-actualPanelWidth;
Index startCol = IsLower ? 0 : pi;
@@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
if (k>0)
rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
- if(!(Mode & UnitDiag))
+ if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
rhs[i] /= cjLhs(i,i);
}
}
@@ -114,20 +114,23 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
for(Index k=0; k<actualPanelWidth; ++k)
{
Index i = IsLower ? pi+k : pi-k-1;
- if(!(Mode & UnitDiag))
- rhs[i] /= cjLhs.coeff(i,i);
-
- Index r = actualPanelWidth - k - 1; // remaining size
- Index s = IsLower ? i+1 : i-r;
- if (r>0)
- Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+ if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+ {
+ if(!(Mode & UnitDiag))
+ rhs[i] /= cjLhs.coeff(i,i);
+
+ Index r = actualPanelWidth - k - 1; // remaining size
+ Index s = IsLower ? i+1 : i-r;
+ if (r>0)
+ Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+ }
}
Index r = IsLower ? size - endBlock : startBlock; // remaining size
if (r > 0)
{
// let's directly call the low level product function because:
// 1 - it is faster to compile
- // 2 - it is slighlty faster at runtime
+ // 2 - it is slightly faster at runtime
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
r, actualPanelWidth,
LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index b1791fb3a..705925984 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -289,8 +289,8 @@ template<typename XprType> struct blas_traits
ExtractType,
typename _ExtractType::PlainObject
>::type DirectLinearAccessType;
- static inline ExtractType extract(const XprType& x) { return x; }
- static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+ static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+ static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
};
// pop conjugate
@@ -318,8 +318,8 @@ struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp
typedef blas_traits<NestedXpr> Base;
typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
typedef typename Base::ExtractType ExtractType;
- static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
- static inline Scalar extractScalarFactor(const XprType& x)
+ static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+ static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
{ return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
};
template<typename Scalar, typename NestedXpr, typename Plain>
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
new file mode 100644
index 000000000..f30503b33
--- /dev/null
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -0,0 +1,432 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
+#define EIGEN_CONFIGURE_VECTORIZATION_H
+
+// FIXME: not sure why this is needed, perhaps it is not needed anymore.
+#ifdef __NVCC__
+ #ifndef EIGEN_DONT_VECTORIZE
+ #define EIGEN_DONT_VECTORIZE
+ #endif
+#endif
+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+//
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be aligned, respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+//
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+ * so that vectorization doesn't affect binary compatibility.
+ *
+ * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
+ * vectorized and non-vectorized code.
+ */
+#if (defined EIGEN_CUDACC)
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+#elif EIGEN_COMP_MSVC
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
+#elif EIGEN_COMP_SUNCC
+ // FIXME not sure about this one:
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+#else
+ #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
+#endif
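
A usage sketch of the macro just defined; because the attribute is emitted even when vectorization is off, translation units compiled with and without EIGEN_VECTORIZE stay ABI-compatible:

    // 16-byte boundary, suitable for aligned SIMD loads/stores such as _mm_load_ps:
    EIGEN_ALIGN_TO_BOUNDARY(16) float buf[4] = {0.f, 1.f, 2.f, 3.f};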
+
+// If the user explicitly disables vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+#elif defined(__AVX512F__)
+ // 64 bytes static alignment is preferred only if really required
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
+#elif defined(__AVX__)
+ // 32 bytes static alignment is preferred only if really required
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defines the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+ #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+ #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+ #endif
+ #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+ // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+
+ // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+ // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+ // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+ // certain common platforms (compiler+architecture combinations) to avoid these problems.
+ // Only static alignment is really problematic (it relies on nonstandard compiler extensions),
+ // so we try to keep heap alignment even when we have to disable static alignment.
+ #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
+ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+ #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+ // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+ // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+ // 4.8 and newer seem definitely unaffected.
+ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+ #else
+ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+ #endif
+
+ // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+ #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+ && !EIGEN_GCC3_OR_OLDER \
+ && !EIGEN_COMP_SUNCC \
+ && !EIGEN_OS_QNX
+ #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+ #else
+ #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+ #endif
+
+ #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+ #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+ #else
+ #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+ #endif
+
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+ #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
+
+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+ #ifdef EIGEN_MAX_ALIGN_BYTES
+ #undef EIGEN_MAX_ALIGN_BYTES
+ #endif
+ #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+ #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+//----------------------------------------------------------------------
+
+
+
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
+ #ifndef EIGEN_DONT_VECTORIZE
+ #define EIGEN_DONT_VECTORIZE
+ #endif
+#endif
+
+
+// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
+// removed as gcc 4.1 and msvc 2008 are not supported anyway.
+#if EIGEN_COMP_MSVC
+ #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+ #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
+ // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+ #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+ #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+ #endif
+ #endif
+#else
+ #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
+ #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+ #endif
+#endif
+
+
+#ifndef EIGEN_DONT_VECTORIZE
+
+ #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+ // Defines symbols for compile-time detection of which instructions are
+ // used.
+ // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_SSE
+ #define EIGEN_VECTORIZE_SSE2
+
+ // Detect sse3/ssse3/sse4:
+ // gcc and icc defines __SSE3__, ...
+ // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+ // want to force the use of those instructions with msvc.
+ #ifdef __SSE3__
+ #define EIGEN_VECTORIZE_SSE3
+ #endif
+ #ifdef __SSSE3__
+ #define EIGEN_VECTORIZE_SSSE3
+ #endif
+ #ifdef __SSE4_1__
+ #define EIGEN_VECTORIZE_SSE4_1
+ #endif
+ #ifdef __SSE4_2__
+ #define EIGEN_VECTORIZE_SSE4_2
+ #endif
+ #ifdef __AVX__
+ #define EIGEN_VECTORIZE_AVX
+ #define EIGEN_VECTORIZE_SSE3
+ #define EIGEN_VECTORIZE_SSSE3
+ #define EIGEN_VECTORIZE_SSE4_1
+ #define EIGEN_VECTORIZE_SSE4_2
+ #endif
+ #ifdef __AVX2__
+ #define EIGEN_VECTORIZE_AVX2
+ #define EIGEN_VECTORIZE_AVX
+ #define EIGEN_VECTORIZE_SSE3
+ #define EIGEN_VECTORIZE_SSSE3
+ #define EIGEN_VECTORIZE_SSE4_1
+ #define EIGEN_VECTORIZE_SSE4_2
+ #endif
+ #ifdef __FMA__
+ #define EIGEN_VECTORIZE_FMA
+ #endif
+ #if defined(__AVX512F__)
+ #define EIGEN_VECTORIZE_AVX512
+ #define EIGEN_VECTORIZE_AVX2
+ #define EIGEN_VECTORIZE_AVX
+ #define EIGEN_VECTORIZE_FMA
+ #define EIGEN_VECTORIZE_SSE3
+ #define EIGEN_VECTORIZE_SSSE3
+ #define EIGEN_VECTORIZE_SSE4_1
+ #define EIGEN_VECTORIZE_SSE4_2
+ #ifdef __AVX512DQ__
+ #define EIGEN_VECTORIZE_AVX512DQ
+ #endif
+ #ifdef __AVX512ER__
+ #define EIGEN_VECTORIZE_AVX512ER
+ #endif
+ #endif
+
+ // include files
+
+ // This extern "C" works around a MINGW-w64 compilation issue
+ // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+ // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+ // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+ // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+ // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+ // Notice that since these are C headers, the extern "C" is theoretically needed anyway.
+ extern "C" {
+ // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+ // Doing so triggers some issues with ICC. However, old gcc versions seem not to have this file, thus:
+ #if EIGEN_COMP_ICC >= 1110
+ #include <immintrin.h>
+ #else
+ #include <mmintrin.h>
+ #include <emmintrin.h>
+ #include <xmmintrin.h>
+ #ifdef EIGEN_VECTORIZE_SSE3
+ #include <pmmintrin.h>
+ #endif
+ #ifdef EIGEN_VECTORIZE_SSSE3
+ #include <tmmintrin.h>
+ #endif
+ #ifdef EIGEN_VECTORIZE_SSE4_1
+ #include <smmintrin.h>
+ #endif
+ #ifdef EIGEN_VECTORIZE_SSE4_2
+ #include <nmmintrin.h>
+ #endif
+ #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+ #include <immintrin.h>
+ #endif
+ #endif
+ } // end extern "C"
+
+ #elif defined __VSX__
+
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_VSX
+ #include <altivec.h>
+ // We need to #undef all these ugly tokens defined in <altivec.h>
+ // => use __vector instead of vector
+ #undef bool
+ #undef vector
+ #undef pixel
+
+ #elif defined __ALTIVEC__
+
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_ALTIVEC
+ #include <altivec.h>
+ // We need to #undef all these ugly tokens defined in <altivec.h>
+ // => use __vector instead of vector
+ #undef bool
+ #undef vector
+ #undef pixel
+
+ #elif (defined __ARM_NEON) || (defined __ARM_NEON__)
+
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_NEON
+ #include <arm_neon.h>
+
+ #elif (defined __s390x__ && defined __VEC__)
+
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_ZVECTOR
+ #include <vecintrin.h>
+
+ #elif defined __mips_msa
+
+ // Limit MSA optimizations to little-endian CPUs for now.
+ // TODO: Perhaps eventually support MSA optimizations on big-endian CPUs?
+ #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #if defined(__LP64__)
+ #define EIGEN_MIPS_64
+ #else
+ #define EIGEN_MIPS_32
+ #endif
+ #define EIGEN_VECTORIZE
+ #define EIGEN_VECTORIZE_MSA
+ #include <msa.h>
+ #endif
+
+ #endif
+#endif
+
+#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
+ // We can use the optimized fp16 to float and float to fp16 conversion routines
+ #define EIGEN_HAS_FP16_C
+#endif
+
+#if defined EIGEN_CUDACC
+ #define EIGEN_VECTORIZE_GPU
+ #include <vector_types.h>
+ #if EIGEN_CUDACC_VER >= 70500
+ #define EIGEN_HAS_CUDA_FP16
+ #endif
+#endif
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+ #include <host_defines.h>
+ #include <cuda_fp16.h>
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+ #define EIGEN_VECTORIZE_GPU
+ #include <hip/hip_vector_types.h>
+
+ #define EIGEN_HAS_HIP_FP16
+ #include <hip/hip_fp16.h>
+
+ #define HIP_PATCH_WITH_NEW_FP16 18215
+ #if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16)
+ #define EIGEN_HAS_OLD_HIP_FP16
+ // The old HIP implementation does not have an explicit typedef for "half2"
+ typedef __half2 half2;
+ #endif
+
+#endif
+
+
+/** \brief Namespace containing all symbols from the %Eigen library. */
+namespace Eigen {
+
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_AVX512)
+ return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+ return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
+ return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+ return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+ return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+ return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+ return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+ return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+ return "VSX";
+#elif defined(EIGEN_VECTORIZE_NEON)
+ return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+ return "S390X ZVECTOR";
+#elif defined(EIGEN_VECTORIZE_MSA)
+ return "MIPS MSA";
+#else
+ return "None";
+#endif
+}
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_CONFIGURE_VECTORIZATION_H
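
SimdInstructionSetsInUse() records, per translation unit, which of the EIGEN_VECTORIZE_* paths above was compiled in; a trivial usage sketch:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      // Prints e.g. "SSE, SSE2", "ARM NEON", or "MIPS MSA" depending on build flags.
      std::cout << "Eigen SIMD: " << Eigen::SimdInstructionSetsInUse() << std::endl;
      return 0;
    }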
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 5d37e5d04..f1afb4db9 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -468,6 +468,7 @@ namespace Architecture
AltiVec = 0x2,
VSX = 0x3,
NEON = 0x4,
+ MSA = 0x5,
#if defined EIGEN_VECTORIZE_SSE
Target = SSE
#elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -476,6 +477,8 @@ namespace Architecture
Target = VSX
#elif defined EIGEN_VECTORIZE_NEON
Target = NEON
+#elif defined EIGEN_VECTORIZE_MSA
+ Target = MSA
#else
Target = Generic
#endif
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 8ef0f3594..af832ae77 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -45,12 +45,16 @@
#pragma clang diagnostic ignored "-Wabsolute-value"
#endif
-#elif defined __GNUC__ && __GNUC__>=6
+#elif defined __GNUC__
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
#pragma GCC diagnostic push
#endif
- #pragma GCC diagnostic ignored "-Wignored-attributes"
+ // g++ warns about local variables shadowing member functions, which is too strict
+ #pragma GCC diagnostic ignored "-Wshadow"
+ #if __GNUC__>=6
+ #pragma GCC diagnostic ignored "-Wignored-attributes"
+ #endif
#endif
diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h
index ab01c857f..dcba731f0 100644
--- a/Eigen/src/Core/util/IndexedViewHelper.h
+++ b/Eigen/src/Core/util/IndexedViewHelper.h
@@ -117,7 +117,7 @@ template<> struct get_compile_time_incr<SingleRange> {
enum { value = 1 }; // 1 or 0 ??
};
-// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operatro[](int) methods)
+// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods)
template<typename T, int XprSize>
struct IndexedViewCompatibleType<T,XprSize,typename internal::enable_if<internal::is_integral<T>::value>::type> {
// Here we could simply use Array, but maybe it's less work for the compiler to use
diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index 78a4705cd..bf99cd8ab 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -192,7 +192,7 @@ inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); }
// The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload.
// This way a code like fix<N> can only refer to the previous function.
template<int N,typename T>
-inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(val); }
+inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); }
#endif
#else // EIGEN_PARSED_BY_DOXYGEN
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index e351b7ad9..b15819f7d 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -11,6 +11,10 @@
#ifndef EIGEN_MACROS_H
#define EIGEN_MACROS_H
+//------------------------------------------------------------------------------------------
+// Eigen version and basic defaults
+//------------------------------------------------------------------------------------------
+
#define EIGEN_WORLD_VERSION 3
#define EIGEN_MAJOR_VERSION 3
#define EIGEN_MINOR_VERSION 90
@@ -19,7 +23,40 @@
(EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
EIGEN_MINOR_VERSION>=z))))
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#else
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
+// Upper bound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
+/** Allows disabling some optimizations which might affect the accuracy of the result.
+ * Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them.
+ * They currently include:
+ * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
+ */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
+#endif
+
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#endif
+
+//------------------------------------------------------------------------------------------
// Compiler identification, EIGEN_COMP_*
+//------------------------------------------------------------------------------------------
/// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
#ifdef __GNUC__
@@ -73,12 +110,17 @@
// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
// name ver MSC_VER
-// 2008 9 1500
-// 2010 10 1600
-// 2012 11 1700
-// 2013 12 1800
-// 2015 14 1900
-// "15" 15 1900
+// 2008 9 1500
+// 2010 10 1600
+// 2012 11 1700
+// 2013 12 1800
+// 2015 14 1900
+// "15" 15 1900
+// 2017-14.1 15.0 1910
+// 2017-14.11 15.3 1911
+// 2017-14.12 15.5 1912
+// 2017-14.13 15.6 1913
+// 2017-14.14 15.7 1914
/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC or clang-cl
#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
@@ -142,7 +184,11 @@
#endif
+
+//------------------------------------------------------------------------------------------
// Architecture identification, EIGEN_ARCH_*
+//------------------------------------------------------------------------------------------
+
#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define EIGEN_ARCH_x86_64 1
@@ -212,7 +258,9 @@
+//------------------------------------------------------------------------------------------
// Operating system identification, EIGEN_OS_*
+//------------------------------------------------------------------------------------------
/// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
#if defined(__unix__) || defined(__unix)
@@ -314,26 +362,114 @@
#endif
+//------------------------------------------------------------------------------------------
+// Detect GPU compilers and architectures
+//------------------------------------------------------------------------------------------
-#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
- // see bug 89
- #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
-#else
- #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+ #error "NVCC as the target platform for HIPCC is currently not supported."
#endif
-// This macro can be used to prevent from macro expansion, e.g.:
-// std::max EIGEN_NOT_A_MACRO(a,b)
-#define EIGEN_NOT_A_MACRO
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+ // Means the compiler is either nvcc or clang with CUDA enabled
+ #define EIGEN_CUDACC __CUDACC__
+#endif
-#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+ // Means we are generating code for the device
+ #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+ #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+ #define EIGEN_CUDACC_VER __CUDACC_VER__
#else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+ #define EIGEN_CUDACC_VER 0
#endif
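
Worked example of the composite encoding: for CUDA 9.2, __CUDACC_VER_MAJOR__ is 9 and __CUDACC_VER_MINOR__ is 2, so EIGEN_CUDACC_VER = 9*10000 + 2*100 = 90200; CUDA 7.5 encodes as 70500, which is exactly the value tested by the EIGEN_CUDACC_VER >= 70500 fp16 check in ConfigureVectorization.h above.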
-#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+ // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
+ #define EIGEN_HIPCC __HIPCC__
+
+ // We need to include hip_runtime.h here because it pulls in
+ // ++ hip_common.h which contains the define for __HIP_DEVICE_COMPILE__
+ // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+ #include <hip/hip_runtime.h>
+
+ #if defined(__HIP_DEVICE_COMPILE__)
+ // analogous to EIGEN_CUDA_ARCH, but for HIP
+ #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+ #endif
+#endif
+
+// Unify CUDA/HIPCC
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
+//
+#define EIGEN_GPUCC
+//
+// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
+// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
+//
+// In most cases, the same tweaks are required in the Eigen code to enable it in both the HIP and CUDA kernels.
+// For those cases, the corresponding code should be guarded with
+// #if defined(EIGEN_GPUCC)
+// instead of
+// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+// #if defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+// #if defined(EIGEN_CUDACC)
+//
+#endif
+
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
+//
+#define EIGEN_GPU_COMPILE_PHASE
+//
+// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
+// + one to compile the source for the "host" (i.e. the CPU)
+// + another to compile the source for the "device" (i.e. the GPU)
+//
+// Code that needs to be enabled only during either the "host" or the "device" compilation phase
+// needs to be guarded with a macro that indicates the current compilation phase.
+//
+// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
+// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
+//
+// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
+// For those cases, the code should be guarded with
+// #if defined(EIGEN_GPU_COMPILE_PHASE)
+// instead of
+// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+// #if defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+// #if defined(EIGEN_CUDA_ARCH)
+//
+#endif
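+// For a concrete instance of this pattern, see numext::swap in
+// Eigen/src/Core/util/Meta.h below: it hand-rolls the swap under
+// EIGEN_GPU_COMPILE_PHASE (std::swap is not usable in device code)
+// and defers to std::swap otherwise.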
+
+//------------------------------------------------------------------------------------------
+// Detect Compiler/Architecture/OS specific features
+//------------------------------------------------------------------------------------------
+
+#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
+ // see bug 89
+ #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+#else
+ #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
#endif
// Cross compiler wrapper around LLVM's __has_builtin
@@ -357,13 +493,6 @@
#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
#endif
-// Upperbound on the C++ version to use.
-// Expected values are 03, 11, 14, 17, etc.
-// By default, let's use an arbitrarily large C++ version.
-#ifndef EIGEN_MAX_CPP_VER
-#define EIGEN_MAX_CPP_VER 99
-#endif
-
#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
#define EIGEN_HAS_CXX11 1
#else
@@ -389,6 +518,8 @@
#endif
// Does the compiler support C99?
+// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
+#include <cmath>
#ifndef EIGEN_HAS_C99_MATH
#if EIGEN_MAX_CPP_VER>=11 && \
((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \
@@ -410,9 +541,13 @@
#endif
#endif
-// Does the compiler support type_trais?
+// Does the compiler support type_traits?
+// - full support of type traits was added only to GCC 5.1.0.
+// - 20150626 corresponds to the last release of 4.x libstdc++
#ifndef EIGEN_HAS_TYPE_TRAITS
-#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700)
+#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \
+ && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \
+ && ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626)
#define EIGEN_HAS_TYPE_TRAITS 1
#define EIGEN_INCLUDE_TYPE_TRAITS
#else
@@ -437,22 +572,22 @@
// Does the compiler fully support const expressions? (as in c++14)
#ifndef EIGEN_HAS_CONSTEXPR
-#if defined(EIGEN_CUDACC)
-// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
+ #if defined(EIGEN_CUDACC)
+ // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+ #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
+ #define EIGEN_HAS_CONSTEXPR 1
+ #endif
+ #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
+ (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) || \
+ (EIGEN_COMP_CLANG >= 306 && (__cplusplus > 199711L)))
#define EIGEN_HAS_CONSTEXPR 1
-#endif
-#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
- (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) || \
- (EIGEN_COMP_CLANG >= 306 && (__cplusplus > 199711L)))
-#define EIGEN_HAS_CONSTEXPR 1
-#endif
+ #endif
-#ifndef EIGEN_HAS_CONSTEXPR
-#define EIGEN_HAS_CONSTEXPR 0
-#endif
+ #ifndef EIGEN_HAS_CONSTEXPR
+ #define EIGEN_HAS_CONSTEXPR 0
+ #endif
-#endif
+#endif // EIGEN_HAS_CONSTEXPR
// Does the compiler support C++11 math?
// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
@@ -490,15 +625,39 @@
#endif
#endif
-/** Allows to disable some optimizations which might affect the accuracy of the result.
- * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
- * They currently include:
- * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
- */
-#ifndef EIGEN_FAST_MATH
-#define EIGEN_FAST_MATH 1
+#ifndef EIGEN_HAS_CXX11_ATOMIC
+ #if EIGEN_MAX_CPP_VER>=11 && \
+ (__has_feature(cxx_atomic) \
+ || (__cplusplus > 201103L) \
+ || ((__cplusplus >= 201103L) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700)))
+ #define EIGEN_HAS_CXX11_ATOMIC 1
+ #else
+ #define EIGEN_HAS_CXX11_ATOMIC 0
+ #endif
#endif
+#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR
+ // Although already available with c++11, this is mostly useful starting with c++14 and its relaxed constexpr rules
+ #if defined(__NVCC__)
+ // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+ #ifdef __CUDACC_RELAXED_CONSTEXPR__
+ #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+ #endif
+ #elif defined(__clang__) && defined(__CUDA__)
+ // clang++ always considers constexpr functions as implicitly __host__ __device__
+ #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+ #endif
+#endif
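+// Illustrative sketch (hypothetical helper): when EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+// is defined, a plain constexpr function such as
+//   constexpr int twice(int x) { return 2*x; }
+// is callable from device code without an explicit __device__ qualifier
+// (with nvcc this requires compiling with --expt-relaxed-constexpr).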
+
+
+//------------------------------------------------------------------------------------------
+// Preprocessor programming helpers
+//------------------------------------------------------------------------------------------
+
+// This macro can be used to prevent macro expansion, e.g.:
+// std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
#define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
// concatenate two tokens
@@ -550,6 +709,36 @@
#define EIGEN_PERMISSIVE_EXPR
#endif
+// GPU stuff
+
+// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
+#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC)
+ // Do not try asserts on device code
+ #ifndef EIGEN_NO_DEBUG
+ #define EIGEN_NO_DEBUG
+ #endif
+
+ #ifdef EIGEN_INTERNAL_DEBUGGING
+ #undef EIGEN_INTERNAL_DEBUGGING
+ #endif
+
+ #ifdef EIGEN_EXCEPTIONS
+ #undef EIGEN_EXCEPTIONS
+ #endif
+#endif
+
+// All functions callable from CUDA/HIP code must be qualified with __device__
+#ifdef EIGEN_GPUCC
+ #ifndef EIGEN_DONT_VECTORIZE
+ #define EIGEN_DONT_VECTORIZE
+ #endif
+
+ #define EIGEN_DEVICE_FUNC __host__ __device__
+#else
+ #define EIGEN_DEVICE_FUNC
+#endif
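+
+// Usage sketch (with a hypothetical function): any function that must be callable
+// from device code gets the qualifier, e.g.
+//   EIGEN_DEVICE_FUNC inline float plus(float a, float b) { return a+b; }
+// Under EIGEN_GPUCC this expands to "__host__ __device__ inline float plus(...)",
+// otherwise the qualifier vanishes and a plain inline function remains.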
+
+
// this macro allows one to get rid of linking errors about multiply defined functions.
// - static is not very good because it prevents definitions from different object files from being merged.
// So static causes the resulting linked executable to be bloated with multiple copies of the same function.
@@ -661,169 +850,6 @@ namespace Eigen {
# define EIGEN_CONST_CONDITIONAL(cond) cond
#endif
-//------------------------------------------------------------------------------------------
-// Static and dynamic alignment control
-//
-// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
-// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
-// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
-// a default value is automatically computed based on architecture, compiler, and OS.
-//
-// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
-// to be used to declare statically aligned buffers.
-//------------------------------------------------------------------------------------------
-
-
-/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
- * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
- * so that vectorization doesn't affect binary compatibility.
- *
- * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
- * vectorized and non-vectorized code.
- */
-#if (defined EIGEN_CUDACC)
- #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
-#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
- #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#elif EIGEN_COMP_MSVC
- #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
-#elif EIGEN_COMP_SUNCC
- // FIXME not sure about this one:
- #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#else
- #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
-#endif
-
-// If the user explicitly disable vectorization, then we also disable alignment
-#if defined(EIGEN_DONT_VECTORIZE)
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
-#elif defined(EIGEN_VECTORIZE_AVX512)
- // 64 bytes static alignment is preferred only if really required
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
-#elif defined(__AVX__)
- // 32 bytes static alignment is preferred only if really required
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
-#else
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
-#endif
-
-
-// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
-#define EIGEN_MIN_ALIGN_BYTES 16
-
-// Defined the boundary (in bytes) on which the data needs to be aligned. Note
-// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
-// aligned at all regardless of the value of this #define.
-
-#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
-#endif
-
-// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated
-// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
-#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
- #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
- #undef EIGEN_MAX_STATIC_ALIGN_BYTES
- #endif
- #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
-#endif
-
-#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
-
- // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
-
- // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
- // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
- // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
- // certain common platform (compiler+architecture combinations) to avoid these problems.
- // Only static alignment is really problematic (relies on nonstandard compiler extensions),
- // try to keep heap alignment even when we have to disable static alignment.
- #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
- #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
- #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
- // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
- // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
- // 4.8 and newer seem definitely unaffected.
- #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
- #else
- #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
- #endif
-
- // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
- #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
- && !EIGEN_GCC3_OR_OLDER \
- && !EIGEN_COMP_SUNCC \
- && !EIGEN_OS_QNX
- #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
- #else
- #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
- #endif
-
- #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
- #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
- #else
- #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
- #endif
-
-#endif
-
-// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES
-#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
-#undef EIGEN_MAX_STATIC_ALIGN_BYTES
-#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
- #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-#endif
-
-// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
-// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES)
-// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
-// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
-
-
-// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
-#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
-#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
-#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
-#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
-#else
-#define EIGEN_ALIGN_MAX
-#endif
-
-
-// Dynamic alignment control
-
-#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
-#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
-#endif
-
-#ifdef EIGEN_DONT_ALIGN
- #ifdef EIGEN_MAX_ALIGN_BYTES
- #undef EIGEN_MAX_ALIGN_BYTES
- #endif
- #define EIGEN_MAX_ALIGN_BYTES 0
-#elif !defined(EIGEN_MAX_ALIGN_BYTES)
- #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#else
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-
-#ifndef EIGEN_UNALIGNED_VECTORIZE
-#define EIGEN_UNALIGNED_VECTORIZE 1
-#endif
-
-//----------------------------------------------------------------------
-
-
#ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
#define EIGEN_RESTRICT
#endif
@@ -831,10 +857,6 @@ namespace Eigen {
#define EIGEN_RESTRICT __restrict
#endif
-#ifndef EIGEN_STACK_ALLOCATION_LIMIT
-// 131072 == 128 KB
-#define EIGEN_STACK_ALLOCATION_LIMIT 131072
-#endif
#ifndef EIGEN_DEFAULT_IO_FORMAT
#ifdef EIGEN_MAKING_DOCS
@@ -849,6 +871,18 @@ namespace Eigen {
// just an empty macro!
#define EIGEN_EMPTY
+
+// When compiling CUDA/HIP device code with NVCC or HIPCC,
+// pull in math functions from the global namespace.
+// In host mode, and when device code is compiled with clang,
+// use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+ #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
+#else
+ #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+#endif
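+// Usage sketch: inside a generic function, write
+//   EIGEN_USING_STD_MATH(sqrt);
+//   Scalar r = sqrt(x);
+// so that the unqualified call resolves to ::sqrt in NVCC/HIPCC device code
+// and to std::sqrt everywhere else.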
+
+
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
// for older MSVC versions, as well as MSVC 1900 combined with CUDA 8, using the base operator is sufficient (cf. Bugs 1000, 1324)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
@@ -997,15 +1031,23 @@ namespace Eigen {
EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+ #define EIGEN_EXCEPTIONS
+#endif
+
+
#ifdef EIGEN_EXCEPTIONS
# define EIGEN_THROW_X(X) throw X
# define EIGEN_THROW throw
# define EIGEN_TRY try
# define EIGEN_CATCH(X) catch (X)
#else
-# ifdef EIGEN_CUDA_ARCH
+# if defined(EIGEN_CUDA_ARCH)
# define EIGEN_THROW_X(X) asm("trap;")
# define EIGEN_THROW asm("trap;")
+# elif defined(EIGEN_HIP_DEVICE_COMPILE)
+# define EIGEN_THROW_X(X) asm("s_trap 0")
+# define EIGEN_THROW asm("s_trap 0")
# else
# define EIGEN_THROW_X(X) std::abort()
# define EIGEN_THROW std::abort()
@@ -1025,7 +1067,13 @@ namespace Eigen {
# define EIGEN_NOEXCEPT
# define EIGEN_NOEXCEPT_IF(x)
# define EIGEN_NO_THROW throw()
-# define EIGEN_EXCEPTION_SPEC(X) throw(X)
+# if EIGEN_COMP_MSVC
+ // MSVC does not support exception specifications (warning C4290),
+ // and they are deprecated in c++11 anyway.
+# define EIGEN_EXCEPTION_SPEC(X) throw()
+# else
+# define EIGEN_EXCEPTION_SPEC(X) throw(X)
+# endif
#endif
#endif // EIGEN_MACROS_H
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index c455f92a1..8dbd4d93b 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -70,7 +70,20 @@ inline void throw_std_bad_alloc()
throw std::bad_alloc();
#else
std::size_t huge = static_cast<std::size_t>(-1);
+ #if defined(EIGEN_HIPCC)
+ //
+ // with hipcc, calls to "::operator new" are treated as opaque function calls (i.e. no inlining),
+ // and as a consequence the code in the #else block triggers the hipcc warning:
+ // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+ //
+ // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+ // the same on "operator new".
+ // We therefore revert to the old version in this #if branch for the hipcc compiler.
+ //
new int[huge];
+ #else
+ ::operator new(huge);
+ #endif
#endif
}
@@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
void *result;
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
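+ // Note: the split below is presumably needed because hipcc device code only
+ // provides the global-scope ::malloc/::free, not their std:: counterparts;
+ // the same pattern recurs for free and memcpy in the following hunks.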
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ result = ::malloc(size);
+ #else
result = std::malloc(size);
+ #endif
+
#if EIGEN_DEFAULT_ALIGN_BYTES==16
eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fall back to the handmade aligned memory allocator.");
#endif
@@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
{
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ ::free(ptr);
+ #else
std::free(ptr);
+ #endif
+
#else
handmade_aligned_free(ptr);
#endif
@@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
{
check_that_malloc_is_allowed();
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ void *result = ::malloc(size);
+ #else
void *result = std::malloc(size);
+ #endif
+
if(!result && size)
throw_std_bad_alloc();
return result;
@@ -232,7 +262,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
{
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ ::free(ptr);
+ #else
std::free(ptr);
+ #endif
}
template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@@ -493,7 +527,11 @@ template<typename T> struct smart_copy_helper<T,true> {
IntPtr size = IntPtr(end)-IntPtr(start);
if(size==0) return;
eigen_internal_assert(start!=0 && end!=0 && target!=0);
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ ::memcpy(target, start, size);
+ #else
std::memcpy(target, start, size);
+ #endif
}
};
@@ -542,7 +580,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
// you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
// to the appropriate stack allocation function
-#ifndef EIGEN_ALLOCA
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
#if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
#define EIGEN_ALLOCA alloca
#elif EIGEN_COMP_MSVC
@@ -550,6 +588,15 @@ template<typename T> struct smart_memmove_helper<T,false> {
#endif
#endif
+// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
+// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work either,
+// because the compiler still emits bad code: the stack allocation checks use "<=".
+// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
+// is fixed.
+#if defined(__clang__) && defined(__thumb__)
+ #undef EIGEN_ALLOCA
+#endif
+
// This helper class constructs the allocated memory, and takes care of destructing and freeing the handled data
// at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
template<typename T> class aligned_stack_memory_handler : noncopyable
@@ -561,12 +608,14 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
* In this case, the buffer elements will also be destructed when this handler is destructed.
* Finally, if \a dealloc is true, then the pointer \a ptr is freed.
**/
+ EIGEN_DEVICE_FUNC
aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
: m_ptr(ptr), m_size(size), m_deallocate(dealloc)
{
if(NumTraits<T>::RequireInitialization && m_ptr)
Eigen::internal::construct_elements_of_array(m_ptr, size);
}
+ EIGEN_DEVICE_FUNC
~aligned_stack_memory_handler()
{
if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -580,6 +629,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
bool m_deallocate;
};
+#ifdef EIGEN_ALLOCA
+
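+// Helper for the ei_declare_local_nested_eval macro defined further below:
+// the primary template evaluates Xpr into a plain temporary, while the
+// MapExternalBuffer==true specialization maps a caller-provided (possibly
+// alloca-backed) buffer and constructs/destructs its elements manually.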
+template<typename Xpr, int NbEvaluations,
+ bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+ >
+struct local_nested_eval_wrapper
+{
+ static const bool NeedExternalBuffer = false;
+ typedef typename Xpr::Scalar Scalar;
+ typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+ ObjectType object;
+
+ EIGEN_DEVICE_FUNC
+ local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+ {
+ EIGEN_UNUSED_VARIABLE(ptr);
+ eigen_internal_assert(ptr==0);
+ }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+ static const bool NeedExternalBuffer = true;
+ typedef typename Xpr::Scalar Scalar;
+ typedef typename plain_object_eval<Xpr>::type PlainObject;
+ typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+ ObjectType object;
+
+ EIGEN_DEVICE_FUNC
+ local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+ : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+ m_deallocate(ptr==0)
+ {
+ if(NumTraits<Scalar>::RequireInitialization && object.data())
+ Eigen::internal::construct_elements_of_array(object.data(), object.size());
+ object = xpr;
+ }
+
+ EIGEN_DEVICE_FUNC
+ ~local_nested_eval_wrapper()
+ {
+ if(NumTraits<Scalar>::RequireInitialization && object.data())
+ Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+ if(m_deallocate)
+ Eigen::internal::aligned_free(object.data());
+ }
+
+private:
+ bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
template<typename T> class scoped_array : noncopyable
{
T* m_ptr;
@@ -607,9 +710,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
} // end namespace internal
/** \internal
- * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
- * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
- * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+ *
+ * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+ * and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+ * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+ * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
* The allocated buffer is automatically deleted when exiting the scope of this declaration.
* If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
* Here is an example:
@@ -620,6 +725,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
* }
* \endcode
* The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token.
+ *
+ * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogous to
+ * \code
+ * typename internal::nested_eval<XPR_T,N>::type NAME(XPR);
+ * \endcode
+ * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+ * This is accomplished through alloca if the latter is supported and if the required number of bytes
+ * is below EIGEN_STACK_ALLOCATION_LIMIT.
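+ * For instance, with a hypothetical expression \c xpr of type \c XprType:
+ * \code
+ * ei_declare_local_nested_eval(XprType,xpr,2,x_eval);
+ * // x_eval can now be read several times without re-evaluating xpr
+ * \endcode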
*/
#ifdef EIGEN_ALLOCA
@@ -639,6 +752,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
: Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
+
+ #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+ Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+ ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+ ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+ typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
#else
#define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -646,6 +766,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR)
+
#endif
@@ -703,7 +826,7 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
* - 32 bytes alignment if AVX is enabled.
* - 64 bytes alignment if AVX512 is enabled.
*
-* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
* \link TopicPreprocessorDirectivesPerformance there \endlink.
*
* Example:
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 0fa818008..658cfa9eb 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -11,9 +11,18 @@
#ifndef EIGEN_META_H
#define EIGEN_META_H
-#if defined(EIGEN_CUDA_ARCH)
-#include <cfloat>
-#include <math_constants.h>
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+ #include <cfloat>
+
+ #if defined(EIGEN_CUDA_ARCH)
+ #include <math_constants.h>
+ #endif
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+ #endif
+
#endif
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
@@ -98,6 +107,8 @@ template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
#if EIGEN_HAS_CXX11
+template<> struct is_arithmetic<signed long long> { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
using std::is_integral;
#else
template<typename T> struct is_integral { enum { value = false }; };
@@ -111,6 +122,10 @@ template<> struct is_integral<signed int> { enum { value = true }; }
template<> struct is_integral<unsigned int> { enum { value = true }; };
template<> struct is_integral<signed long> { enum { value = true }; };
template<> struct is_integral<unsigned long> { enum { value = true }; };
+#if EIGEN_COMP_MSVC
+template<> struct is_integral<signed __int64> { enum { value = true }; };
+template<> struct is_integral<unsigned __int64> { enum { value = true }; };
+#endif
#endif
@@ -139,16 +154,19 @@ private:
struct yes {int a[1];};
struct no {int a[2];};
- static yes test(const To&, int);
+ template<typename T>
+ static yes test(T, int);
+
+ template<typename T>
static no test(any_conversion, ...);
public:
- static From ms_from;
+ static typename internal::remove_reference<From>::type* ms_from;
#ifdef __INTEL_COMPILER
#pragma warning push
#pragma warning ( disable : 2259 )
#endif
- enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
+ enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
#ifdef __INTEL_COMPILER
#pragma warning pop
#endif
@@ -157,8 +175,7 @@ public:
template<typename From, typename To>
struct is_convertible
{
- enum { value = is_convertible_impl<typename remove_all<From>::type,
- typename remove_all<To >::type>::value };
+ enum { value = is_convertible_impl<From,To>::value };
};
/** \internal Allows to enable/disable an overload
@@ -169,7 +186,7 @@ template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T>
{ typedef T type; };
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON
@@ -191,13 +208,31 @@ template<> struct numeric_limits<float>
EIGEN_DEVICE_FUNC
static float epsilon() { return __FLT_EPSILON__; }
EIGEN_DEVICE_FUNC
- static float (max)() { return CUDART_MAX_NORMAL_F; }
+ static float (max)() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_MAX_NORMAL_F;
+ #else
+ return HIPRT_MAX_NORMAL_F;
+ #endif
+ }
EIGEN_DEVICE_FUNC
static float (min)() { return FLT_MIN; }
EIGEN_DEVICE_FUNC
- static float infinity() { return CUDART_INF_F; }
+ static float infinity() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_INF_F;
+ #else
+ return HIPRT_INF_F;
+ #endif
+ }
EIGEN_DEVICE_FUNC
- static float quiet_NaN() { return CUDART_NAN_F; }
+ static float quiet_NaN() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_NAN_F;
+ #else
+ return HIPRT_NAN_F;
+ #endif
+ }
};
template<> struct numeric_limits<double>
{
@@ -208,9 +243,21 @@ template<> struct numeric_limits<double>
EIGEN_DEVICE_FUNC
static double (min)() { return DBL_MIN; }
EIGEN_DEVICE_FUNC
- static double infinity() { return CUDART_INF; }
+ static double infinity() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_INF;
+ #else
+ return HIPRT_INF;
+ #endif
+ }
EIGEN_DEVICE_FUNC
- static double quiet_NaN() { return CUDART_NAN; }
+ static double quiet_NaN() {
+ #if defined(EIGEN_CUDA_ARCH)
+ return CUDART_NAN;
+ #else
+ return HIPRT_NAN;
+ #endif
+ }
};
template<> struct numeric_limits<int>
{
@@ -272,7 +319,7 @@ template<> struct numeric_limits<unsigned long long>
#endif
/** \internal
- * A base class do disable default copy ctor and copy assignement operator.
+ * A base class to disable the default copy ctor and copy assignment operator.
*/
class noncopyable
{
@@ -523,13 +570,13 @@ template<typename T, typename U> struct scalar_product_traits
namespace numext {
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
using internal::device::numeric_limits;
#else
using std::numeric_limits;
@@ -538,11 +585,36 @@ using std::numeric_limits;
// Integer division with rounding up.
// T is assumed to be an integer type with a>=0, and b>0
template<typename T>
+EIGEN_DEVICE_FUNC
T div_ceil(const T &a, const T &b)
{
return (a+b-1) / b;
}
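// For example, div_ceil(7,3) == (7+3-1)/3 == 3.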
+// The aim of the following functions is to bypass -Wfloat-equal warnings
+// when we really want a strict equality comparison on floating points.
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const X& x,const Y& y) { return x == y; }
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
+
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool equal_strict(const double& x,const double& y) { return std::equal_to<double>()(x,y); }
+#endif
+
+template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const X& x,const Y& y) { return x != y; }
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
+
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
+#endif
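+
+// Usage sketch: numext::equal_strict(x, Scalar(0)) instead of x == Scalar(0)
+// documents that the exact floating-point comparison is intentional while
+// keeping -Wfloat-equal quiet at the call site.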
+
} // end namespace numext
} // end namespace Eigen
diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h
index 86b60f52f..e9353a623 100644
--- a/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -8,7 +8,7 @@
#pragma warning pop
#elif defined __clang__
#pragma clang diagnostic pop
- #elif defined __GNUC__ && __GNUC__>=6
+ #elif defined __GNUC__
#pragma GCC diagnostic pop
#endif
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index cb1678900..500e47792 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -24,6 +24,7 @@
*
*/
+#ifndef EIGEN_STATIC_ASSERT
#ifndef EIGEN_NO_STATIC_ASSERT
#if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600))
@@ -101,7 +102,8 @@
THIS_TYPE_IS_NOT_SUPPORTED=1,
STORAGE_KIND_MUST_MATCH=1,
STORAGE_INDEX_MUST_MATCH=1,
- CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1
+ CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
+ SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1
};
};
@@ -131,7 +133,7 @@
#define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG);
#endif // EIGEN_NO_STATIC_ASSERT
-
+#endif // EIGEN_STATIC_ASSERT
// static assertion failing if the type \a TYPE is not a vector type
#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
diff --git a/Eigen/src/Core/util/SymbolicIndex.h b/Eigen/src/Core/util/SymbolicIndex.h
index bb6349eb9..b9fe733b7 100644
--- a/Eigen/src/Core/util/SymbolicIndex.h
+++ b/Eigen/src/Core/util/SymbolicIndex.h
@@ -35,7 +35,7 @@ namespace Eigen {
* std::cout << expr98.eval(x=6) << "\n";
* \endcode
*
- * It is currently only used internally to define and minipulate the placeholders::last and placeholders::end symbols in Eigen::seq and Eigen::seqN.
+ * It is currently only used internally to define and manipulate the placeholders::last and placeholders::end symbols in Eigen::seq and Eigen::seqN.
*
*/
namespace Symbolic {
@@ -187,17 +187,10 @@ public:
template<typename T>
struct is_symbolic {
- // BaseExpr has no conversion ctor, so we only have to check whether T can be staticaly cast to its base class BaseExpr<T>.
+ // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr<T>.
enum { value = internal::is_convertible<T,BaseExpr<T> >::value };
};
-// Specialization for functions, because is_convertible fails in this case.
-// Useful in c++98/11 mode when testing is_symbolic<decltype(fix<N>)>
-template<typename T>
-struct is_symbolic<T (*)()> {
- enum { value = false };
-};
-
/** Represents the actual value of a symbol identified by its tag
*
* It is the return type of SymbolValue::operator=, and most of the time this is the only way it is used.
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 10328be0d..e3231c712 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -39,14 +39,22 @@ template<typename T> struct is_valid_index_type
{
enum { value =
#if EIGEN_HAS_TYPE_TRAITS
- internal::is_integral<T>::value || std::is_enum<T>::value
+ internal::is_integral<T>::value || std::is_enum<T>::value
+#elif EIGEN_COMP_MSVC
+ internal::is_integral<T>::value || __is_enum(T)
#else
- // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
- internal::is_convertible<T,Index>::value
+ // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
+ internal::is_convertible<T,Index>::value
#endif
};
};
+// true unless both types are valid index types
+template<typename RowIndices, typename ColIndices>
+struct valid_indexed_view_overload {
+ enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
+};
+
// promote_scalar_arg is a helper used in operations between an expression and a scalar, like:
// expression * scalar
// Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression.
@@ -452,7 +460,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
{
enum {
ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
- CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory?
+ CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluates itself into a temporary?
// Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
// This situation is already taken care of by the EvalBeforeNestingBit flag, which is turned ON
// for all evaluators creating a temporary. This flag is then propagated by the parent evaluators.
@@ -668,17 +676,32 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> >
template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
{ enum { ret = true }; };
+
+template<typename T> struct is_identity
+{ enum { value = false }; };
+
+template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
+{ enum { value = true }; };
+
+
template<typename S1, typename S2> struct glue_shapes;
template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; };
template<typename T1, typename T2>
-bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+struct possibly_same_dense {
+ enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
+};
+
+template<typename T1, typename T2>
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
{
return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
}
template<typename T1, typename T2>
-bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+EIGEN_DEVICE_FUNC
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
{
return false;
}
diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index dc5fae06a..081e918f1 100644
--- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -214,7 +214,7 @@ template<typename _MatrixType> class ComplexEigenSolver
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h
index 7f38919f7..b8b3490c6 100644
--- a/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -212,7 +212,7 @@ template<typename _MatrixType> class ComplexSchur
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h
index f205b185d..997bebe7b 100644
--- a/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/Eigen/src/Eigenvalues/EigenSolver.h
@@ -277,7 +277,7 @@ template<typename _MatrixType> class EigenSolver
template<typename InputType>
EigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
- /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */
+ /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
ComputationInfo info() const
{
eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index 36a91dffc..87d789b3f 100644
--- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -311,7 +311,6 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
// Aliases:
Map<VectorType> v(reinterpret_cast<Scalar*>(m_tmp.data()), size);
ComplexVectorType &cv = m_tmp;
- const MatrixType &mZ = m_realQZ.matrixZ();
const MatrixType &mS = m_realQZ.matrixS();
const MatrixType &mT = m_realQZ.matrixT();
@@ -351,7 +350,7 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
}
}
}
- m_eivec.col(i).real().noalias() = mZ.transpose() * v;
+ m_eivec.col(i).real().noalias() = m_realQZ.matrixZ().transpose() * v;
m_eivec.col(i).real().normalize();
m_eivec.col(i).imag().setConstant(0);
}
@@ -400,7 +399,7 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
/ (alpha*mT.coeffRef(j,j) - static_cast<Scalar>(beta*mS.coeffRef(j,j)));
}
}
- m_eivec.col(i+1).noalias() = (mZ.transpose() * cv);
+ m_eivec.col(i+1).noalias() = (m_realQZ.matrixZ().transpose() * cv);
m_eivec.col(i+1).normalize();
m_eivec.col(i) = m_eivec.col(i+1).conjugate();
}
diff --git a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
index 5f6bb8289..d0f9091be 100644
--- a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
@@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT
*
* \returns Reference to \c *this
*
- * Accoring to \p options, this function computes eigenvalues and (if requested)
+ * According to \p options, this function computes eigenvalues and (if requested)
* the eigenvectors of one of the following three generalized eigenproblems:
* - \c Ax_lBx: \f$ Ax = \lambda B x \f$
* - \c ABx_lx: \f$ ABx = \lambda x \f$
diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index f647f69b0..d947dac4e 100644
--- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -315,7 +315,7 @@ void HessenbergDecomposition<MatrixType>::_compute(MatrixType& matA, CoeffVector
// A = A H'
matA.rightCols(remainingSize)
- .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0));
+ .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0));
}
}
diff --git a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
index dbbd4806a..66e5a3dbb 100644
--- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -66,7 +66,6 @@ template<typename Derived>
inline typename MatrixBase<Derived>::EigenvaluesReturnType
MatrixBase<Derived>::eigenvalues() const
{
- typedef typename internal::traits<Derived>::Scalar Scalar;
return internal::eigenvalues_selector<Derived, NumTraits<Scalar>::IsComplex>::run(derived());
}
@@ -88,7 +87,6 @@ template<typename MatrixType, unsigned int UpLo>
EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
{
- typedef typename SelfAdjointView<MatrixType, UpLo>::PlainObject PlainObject;
PlainObject thisAsMatrix(*this);
return SelfAdjointEigenSolver<PlainObject>(thisAsMatrix, false).eigenvalues();
}
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index b3a910dd9..e2b37f40e 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -161,7 +161,7 @@ namespace Eigen {
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index f5c86041d..aca8a8279 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -190,7 +190,7 @@ template<typename _MatrixType> class RealSchur
RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU);
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
@@ -270,8 +270,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
// Step 1. Reduce to Hessenberg form
m_hess.compute(matrix.derived()/scale);
- // Step 2. Reduce to real Schur form
- computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
+ // Step 2. Reduce to real Schur form
+ // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
+ // to be able to pass our working-space buffer for the Householder to Dense evaluation.
+ m_workspaceVector.resize(matrix.cols());
+ if(computeU)
+ m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
+ computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);
m_matT *= scale;
@@ -284,13 +289,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
using std::abs;
m_matT = matrixH;
- if(computeU)
+ m_workspaceVector.resize(m_matT.cols());
+ if(computeU && !internal::is_same_dense(m_matU,matrixQ))
m_matU = matrixQ;
Index maxIters = m_maxIters;
if (maxIters == -1)
maxIters = m_maxIterationsPerRow * matrixH.rows();
- m_workspaceVector.resize(m_matT.cols());
Scalar* workspace = &m_workspaceVector.coeffRef(0);
// The matrix m_matT is divided in three parts.
@@ -303,7 +308,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
Scalar exshift(0); // sum of exceptional shifts
Scalar norm = computeNormOfT();
- if(norm!=0)
+ if(norm!=Scalar(0))
{
while (iu >= 0)
{
@@ -327,7 +332,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
else // No convergence yet
{
// The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG )
- Vector3s firstHouseholderVector(0,0,0), shiftInfo;
+ Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo;
computeShift(iu, iter, exshift, shiftInfo);
iter = iter + 1;
totalIter = totalIter + 1;
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 9ddd553f2..2d1a82c57 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver;
namespace internal {
template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
+
template<typename MatrixType, typename DiagType, typename SubDiagType>
+EIGEN_DEVICE_FUNC
ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
}
@@ -337,7 +339,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
EIGEN_DEVICE_FUNC
ComputationInfo info() const
@@ -354,7 +356,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
static const int m_maxIterations = 30;
protected:
- static void check_template_parameters()
+ static EIGEN_DEVICE_FUNC
+ void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
@@ -403,7 +406,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
const InputType &matrix(a_matrix.derived());
- using std::abs;
+ EIGEN_USING_STD_MATH(abs);
eigen_assert(matrix.cols() == matrix.rows());
eigen_assert((options&~(EigVecMask|GenEigMask))==0
&& (options&EigVecMask)!=EigVecMask
@@ -479,9 +482,10 @@ namespace internal {
* \returns \c Success or \c NoConvergence
*/
template<typename MatrixType, typename DiagType, typename SubDiagType>
+EIGEN_DEVICE_FUNC
ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
{
- using std::abs;
+ EIGEN_USING_STD_MATH(abs);
ComputationInfo info;
typedef typename MatrixType::Scalar Scalar;
@@ -535,7 +539,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
diag.segment(i,n-i).minCoeff(&k);
if (k > 0)
{
- std::swap(diag[i], diag[k+i]);
+ numext::swap(diag[i], diag[k+i]);
if(computeEigenvectors)
eivec.col(i).swap(eivec.col(k+i));
}
@@ -605,7 +609,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
EIGEN_DEVICE_FUNC
static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
{
- using std::abs;
+ EIGEN_USING_STD_MATH(abs);
+ EIGEN_USING_STD_MATH(sqrt);
Index i0;
// Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
mat.diagonal().cwiseAbs().maxCoeff(&i0);
@@ -616,8 +621,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
VectorType c0, c1;
n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
- if(n0>n1) res = c0/std::sqrt(n0);
- else res = c1/std::sqrt(n1);
+ if(n0>n1) res = c0/sqrt(n0);
+ else res = c1/sqrt(n1);
return true;
}
@@ -719,7 +724,7 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
EIGEN_DEVICE_FUNC
static inline void computeRoots(const MatrixType& m, VectorType& roots)
{
- using std::sqrt;
+ EIGEN_USING_STD_MATH(sqrt);
const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
roots(0) = t1 - t0;
@@ -807,7 +812,7 @@ template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
EIGEN_DEVICE_FUNC
static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
{
- using std::abs;
+ EIGEN_USING_STD_MATH(abs);
RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
RealScalar e = subdiag[end-1];
// Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
index 3891cf883..b0c947dc0 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
@@ -37,7 +37,7 @@ namespace Eigen {
/** \internal Specialization for the data types supported by LAPACKe */
-#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \
+#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW ) \
template<> template<typename InputType> inline \
SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, int options) \
@@ -47,7 +47,7 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
&& (options&EigVecMask)!=EigVecMask \
&& "invalid option parameter"); \
bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \
- lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), lda, matrix_order, info; \
+ lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), lda, info; \
m_eivalues.resize(n,1); \
m_subdiag.resize(n-1); \
m_eivec = matrix; \
@@ -63,27 +63,24 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
} \
\
lda = internal::convert_index<lapack_int>(m_eivec.outerStride()); \
- matrix_order=LAPACKE_COLROW; \
char jobz, uplo='L'/*, range='A'*/; \
jobz = computeEigenvectors ? 'V' : 'N'; \
\
- info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \
+ info = LAPACKE_##LAPACKE_NAME( LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \
m_info = (info==0) ? Success : NoConvergence; \
m_isInitialized = true; \
m_eigenvectorsOk = computeEigenvectors; \
return *this; \
}
+#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME ) \
+ EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor ) \
+ EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor )
-EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, ColMajor, LAPACK_COL_MAJOR)
-
-EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev)
+EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev)
} // end namespace Eigen
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index 1d102c17b..c5c1acf46 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -25,6 +25,7 @@ struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
};
template<typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
}
@@ -344,6 +345,7 @@ namespace internal {
* \sa Tridiagonalization::packedMatrix()
*/
template<typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC
void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
{
using numext::conj;
@@ -424,6 +426,7 @@ struct tridiagonalization_inplace_selector;
* \sa class Tridiagonalization
*/
template<typename MatrixType, typename DiagonalType, typename SubDiagonalType>
+EIGEN_DEVICE_FUNC
void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
{
eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1);
@@ -439,7 +442,8 @@ struct tridiagonalization_inplace_selector
typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
template<typename DiagonalType, typename SubDiagonalType>
- static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+ static EIGEN_DEVICE_FUNC
+ void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
{
CoeffVectorType hCoeffs(mat.cols()-1);
tridiagonalization_inplace(mat,hCoeffs);
@@ -508,7 +512,8 @@ struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
typedef typename MatrixType::Scalar Scalar;
template<typename DiagonalType, typename SubDiagonalType>
- static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
+ static EIGEN_DEVICE_FUNC
+ void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
{
diag(0,0) = numext::real(mat(0,0));
if(extractQ)
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index c3fd8c3e0..faea62f17 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -276,6 +276,27 @@ public:
EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
{ m_coeffs = other.coeffs().template cast<Scalar>(); }
+#if EIGEN_HAS_RVALUE_REFERENCES
+ // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
+ /** Default move constructor */
+ EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
+ : m_coeffs(std::move(other.coeffs()))
+ {}
+
+ /** Default move assignment operator */
+ EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
+ {
+ m_coeffs = std::move(other.coeffs());
+ return *this;
+ }
+
+ // And now, because we declared a move constructor, the implicit copy constructor is suppressed. We still want one.
+ /** Default copy constructor */
+ EIGEN_DEVICE_FUNC Quaternion(const Quaternion& other)
+ : m_coeffs(other.coeffs())
+ {}
+#endif
+
EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
template<typename Derived1, typename Derived2>
@@ -628,7 +649,7 @@ EIGEN_DEVICE_FUNC Quaternion<Scalar,Options> Quaternion<Scalar,Options>::UnitRan
const Scalar u1 = internal::random<Scalar>(0, 1),
u2 = internal::random<Scalar>(0, 2*EIGEN_PI),
u3 = internal::random<Scalar>(0, 2*EIGEN_PI);
- const Scalar a = sqrt(1 - u1),
+ const Scalar a = sqrt(Scalar(1) - u1),
b = sqrt(u1);
return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
}
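Two independent fixes in this file: the explicit move constructor and move assignment restore move semantics that the user-declared copy constructor would otherwise suppress, and sqrt(Scalar(1) - u1) keeps the subtraction in the quaternion's scalar type, which matters for custom scalar types that do not mix with an int literal. A small sketch of the move path (illustrative):

    #include <Eigen/Geometry>
    #include <utility>
    int main()
    {
      Eigen::Quaterniond a(1.0, 0.0, 0.0, 0.0);  // w, x, y, z
      Eigen::Quaterniond b(std::move(a));        // move constructor
      Eigen::Quaterniond c;
      c = std::move(b);                          // move assignment
      return c.w() == 1.0 ? 0 : 1;
    }

For a fixed-size quaternion the move degenerates to a copy; the value of declaring it lies in the EIGEN_NOEXCEPT_IF specification, which generic code can query to prefer moves.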
diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h
index 8d9acf252..df650fda6 100755
--- a/Eigen/src/Geometry/Scaling.h
+++ b/Eigen/src/Geometry/Scaling.h
@@ -128,7 +128,7 @@ public:
/** Concatenates a linear transformation matrix and a uniform scaling
* \relates UniformScaling
*/
-// NOTE this operator is defiend in MatrixBase and not as a friend function
+// NOTE this operator is defined in MatrixBase and not as a friend function
// of UniformScaling to fix an internal crash of Intel's ICC
template<typename Derived,typename Scalar>
EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product)
diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index f68cab583..d4346aa1c 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -26,17 +26,17 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
{
Quaternion<float> res;
- const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
- __m128 a = _a.coeffs().template packet<AAlignment>(0);
- __m128 b = _b.coeffs().template packet<BAlignment>(0);
- __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
- __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
+ const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+ Packet4f a = _a.coeffs().template packet<AAlignment>(0);
+ Packet4f b = _b.coeffs().template packet<BAlignment>(0);
+ Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+ Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
pstoret<float,Packet4f,ResAlignment>(
&res.x(),
- _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
- _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
+ padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)),
+ pmul(vec4f_swizzle1(a,2,0,1,0),
vec4f_swizzle1(b,1,2,0,0))),
- _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
+ pxor(mask,padd(s1,s2))));
return res;
}
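The kernel is unchanged numerically; raw SSE intrinsics are merely replaced by Eigen's generic packet primitives (pmul, padd, psub, pxor) so the same source can be retargeted to other instruction sets. For reference, a plain scalar sketch of the Hamilton product it computes, in Eigen's (x, y, z, w) coefficient order:

    // Scalar reference of the product the packet kernel above implements.
    template<typename T>
    void quat_product_ref(const T a[4], const T b[4], T r[4])
    {
      r[0] = a[3]*b[0] + a[0]*b[3] + a[1]*b[2] - a[2]*b[1]; // x
      r[1] = a[3]*b[1] - a[0]*b[2] + a[1]*b[3] + a[2]*b[0]; // y
      r[2] = a[3]*b[2] + a[0]*b[1] - a[1]*b[0] + a[2]*b[3]; // z
      r[3] = a[3]*b[3] - a[0]*b[0] - a[1]*b[1] - a[2]*b[2]; // w
    }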
diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h
index 01a7ed188..39ce1c2a0 100644
--- a/Eigen/src/Householder/BlockHouseholder.h
+++ b/Eigen/src/Householder/BlockHouseholder.h
@@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c
triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
* vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
- // FIXME add .noalias() once the triangular product can work inplace
- triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+ // FIXME use the following line with .noalias() once the triangular product can work inplace
+ // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+ for(Index j=nbVecs-1; j>i; --j)
+ {
+ typename TriangularFactorType::Scalar z = triFactor(i,j);
+ triFactor(i,j) = z * triFactor(j,j);
+ if(nbVecs-j-1>0)
+ triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1);
+ }
}
triFactor(i,i) = hCoeffs(i);
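The replacement loop performs the row-times-upper-triangular product in place: scanning j from the last column down to i+1 guarantees each entry of row i is overwritten only after every term that needs its original value has been consumed. The same idea on plain arrays (a sketch, not the Eigen code):

    // In-place x <- x * U with U upper-triangular, n x n, column-major.
    void inplace_row_times_upper(double* x, const double* U, int n)
    {
      for (int j = n - 1; j >= 0; --j) {  // right-to-left: x[0..j] still original
        double s = 0.0;
        for (int i = 0; i <= j; ++i) s += x[i] * U[i + j*n];
        x[j] = s;
      }
    }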
diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 80de2c305..5bc037f00 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -39,6 +39,7 @@ template<int n> struct decrement_size
* MatrixBase::applyHouseholderOnTheRight()
*/
template<typename Derived>
+EIGEN_DEVICE_FUNC
void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
{
VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1);
@@ -62,6 +63,7 @@ void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
*/
template<typename Derived>
template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
void MatrixBase<Derived>::makeHouseholder(
EssentialPart& essential,
Scalar& tau,
@@ -103,13 +105,14 @@ void MatrixBase<Derived>::makeHouseholder(
* \param essential the essential part of the vector \c v
* \param tau the scaling factor of the Householder transformation
* \param workspace a pointer to working space with at least
- * this->cols() * essential.size() entries
+ * this->cols() entries
*
* \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(),
* MatrixBase::applyHouseholderOnTheRight()
*/
template<typename Derived>
template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
void MatrixBase<Derived>::applyHouseholderOnTheLeft(
const EssentialPart& essential,
const Scalar& tau,
@@ -140,13 +143,14 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
* \param essential the essential part of the vector \c v
* \param tau the scaling factor of the Householder transformation
* \param workspace a pointer to working space with at least
- * this->cols() * essential.size() entries
+ * this->rows() entries
*
* \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(),
* MatrixBase::applyHouseholderOnTheLeft()
*/
template<typename Derived>
template<typename EssentialPart>
+EIGEN_DEVICE_FUNC
void MatrixBase<Derived>::applyHouseholderOnTheRight(
const EssentialPart& essential,
const Scalar& tau,
@@ -160,10 +164,10 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
{
Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
- tmp.noalias() = right * essential.conjugate();
+ tmp.noalias() = right * essential;
tmp += this->col(0);
this->col(0) -= tau * tmp;
- right.noalias() -= tau * tmp * essential.transpose();
+ right.noalias() -= tau * tmp * essential.adjoint();
}
}
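With this change applyHouseholderOnTheRight applies H = I - tau*v*v^* exactly, where v = (1, essential^T)^T; the previous essential.conjugate()/essential.transpose() pair effectively applied the transposed reflector, which differs for complex scalars. A scalar sketch of the right-application on a single row (illustrative names, not Eigen API):

    #include <complex>
    #include <vector>
    typedef std::complex<double> C;
    // m <- m * (I - tau*v*v^H) with v = [1; essential], m of size essential.size()+1.
    void apply_householder_right(std::vector<C>& m, const std::vector<C>& essential, C tau)
    {
      C tmp = m[0];                                        // tmp = m * v
      for (std::size_t k = 0; k < essential.size(); ++k) tmp += m[k+1] * essential[k];
      m[0] -= tau * tmp;                                   // m -= tau * tmp * v^H
      for (std::size_t k = 0; k < essential.size(); ++k)
        m[k+1] -= tau * tmp * std::conj(essential[k]);
    }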
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index 3ce0a693d..e62befcb6 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -87,7 +87,7 @@ struct hseq_side_dependent_impl
{
typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
- static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
+ static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
{
Index start = k+1+h.m_shift;
return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1);
@@ -140,6 +140,22 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
Side
> ConjugateReturnType;
+ typedef HouseholderSequence<
+ VectorsType,
+ typename internal::conditional<NumTraits<Scalar>::IsComplex,
+ typename internal::remove_all<typename CoeffsType::ConjugateReturnType>::type,
+ CoeffsType>::type,
+ Side
+ > AdjointReturnType;
+
+ typedef HouseholderSequence<
+ typename internal::conditional<NumTraits<Scalar>::IsComplex,
+ typename internal::remove_all<typename VectorsType::ConjugateReturnType>::type,
+ VectorsType>::type,
+ CoeffsType,
+ Side
+ > TransposeReturnType;
+
/** \brief Constructor.
* \param[in] v %Matrix containing the essential parts of the Householder vectors
* \param[in] h Vector containing the Householder coefficients
@@ -157,17 +173,19 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
*
* \sa setLength(), setShift()
*/
+ EIGEN_DEVICE_FUNC
HouseholderSequence(const VectorsType& v, const CoeffsType& h)
- : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()),
+ : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()),
m_shift(0)
{
}
/** \brief Copy constructor. */
+ EIGEN_DEVICE_FUNC
HouseholderSequence(const HouseholderSequence& other)
: m_vectors(other.m_vectors),
m_coeffs(other.m_coeffs),
- m_trans(other.m_trans),
+ m_reverse(other.m_reverse),
m_length(other.m_length),
m_shift(other.m_shift)
{
@@ -177,12 +195,14 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
* \returns Number of rows
* \details This equals the dimension of the space that the transformation acts on.
*/
+ EIGEN_DEVICE_FUNC
Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
/** \brief Number of columns of transformation viewed as a matrix.
* \returns Number of columns
* \details This equals the dimension of the space that the transformation acts on.
*/
+ EIGEN_DEVICE_FUNC
Index cols() const { return rows(); }
/** \brief Essential part of a Householder vector.
@@ -199,6 +219,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
*
* \sa setShift(), shift()
*/
+ EIGEN_DEVICE_FUNC
const EssentialVectorType essentialVector(Index k) const
{
eigen_assert(k >= 0 && k < m_length);
@@ -206,31 +227,39 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
}
/** \brief %Transpose of the Householder sequence. */
- HouseholderSequence transpose() const
+ TransposeReturnType transpose() const
{
- return HouseholderSequence(*this).setTrans(!m_trans);
+ return TransposeReturnType(m_vectors.conjugate(), m_coeffs)
+ .setReverseFlag(!m_reverse)
+ .setLength(m_length)
+ .setShift(m_shift);
}
/** \brief Complex conjugate of the Householder sequence. */
ConjugateReturnType conjugate() const
{
return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate())
- .setTrans(m_trans)
+ .setReverseFlag(m_reverse)
.setLength(m_length)
.setShift(m_shift);
}
/** \brief Adjoint (conjugate transpose) of the Householder sequence. */
- ConjugateReturnType adjoint() const
+ AdjointReturnType adjoint() const
{
- return conjugate().setTrans(!m_trans);
+ return AdjointReturnType(m_vectors, m_coeffs.conjugate())
+ .setReverseFlag(!m_reverse)
+ .setLength(m_length)
+ .setShift(m_shift);
}
/** \brief Inverse of the Householder sequence (equals the adjoint). */
- ConjugateReturnType inverse() const { return adjoint(); }
+ AdjointReturnType inverse() const { return adjoint(); }
/** \internal */
- template<typename DestType> inline void evalTo(DestType& dst) const
+ template<typename DestType>
+ inline EIGEN_DEVICE_FUNC
+ void evalTo(DestType& dst) const
{
Matrix<Scalar, DestType::RowsAtCompileTime, 1,
AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows());
@@ -239,6 +268,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
/** \internal */
template<typename Dest, typename Workspace>
+ EIGEN_DEVICE_FUNC
void evalTo(Dest& dst, Workspace& workspace) const
{
workspace.resize(rows());
@@ -251,7 +281,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
for(Index k = vecs-1; k >= 0; --k)
{
Index cornerSize = rows() - k - m_shift;
- if(m_trans)
+ if(m_reverse)
dst.bottomRightCorner(cornerSize, cornerSize)
.applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
else
@@ -265,18 +295,26 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
for(Index k = 0; k<cols()-vecs ; ++k)
dst.col(k).tail(rows()-k-1).setZero();
}
+ else if(m_length>BlockSize)
+ {
+ dst.setIdentity(rows(), rows());
+ applyThisOnTheLeft(dst,workspace,true);
+ }
else
{
dst.setIdentity(rows(), rows());
for(Index k = vecs-1; k >= 0; --k)
{
Index cornerSize = rows() - k - m_shift;
- if(m_trans)
+ if(m_reverse)
dst.bottomRightCorner(cornerSize, cornerSize)
- .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+ .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
else
dst.bottomRightCorner(cornerSize, cornerSize)
- .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
+ .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
}
}
}
@@ -295,31 +333,34 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
workspace.resize(dst.rows());
for(Index k = 0; k < m_length; ++k)
{
- Index actual_k = m_trans ? m_length-k-1 : k;
+ Index actual_k = m_reverse ? m_length-k-1 : k;
dst.rightCols(rows()-m_shift-actual_k)
.applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
}
}
/** \internal */
- template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+ template<typename Dest> inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const
{
Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace;
- applyThisOnTheLeft(dst, workspace);
+ applyThisOnTheLeft(dst, workspace, inputIsIdentity);
}
/** \internal */
template<typename Dest, typename Workspace>
- inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
+ inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const
{
- const Index BlockSize = 48;
+ if(inputIsIdentity && m_reverse)
+ inputIsIdentity = false;
// if the entries are large enough, then apply the reflectors by block
if(m_length>=BlockSize && dst.cols()>1)
{
- for(Index i = 0; i < m_length; i+=BlockSize)
+ // Make sure we have at least 2 useful blocks, otherwise it is pointless:
+ Index blockSize = m_length<Index(2*BlockSize) ? (m_length+1)/2 : Index(BlockSize);
+ for(Index i = 0; i < m_length; i+=blockSize)
{
- Index end = m_trans ? (std::min)(m_length,i+BlockSize) : m_length-i;
- Index k = m_trans ? i : (std::max)(Index(0),end-BlockSize);
+ Index end = m_reverse ? (std::min)(m_length,i+blockSize) : m_length-i;
+ Index k = m_reverse ? i : (std::max)(Index(0),end-blockSize);
Index bs = end-k;
Index start = k + m_shift;
@@ -329,8 +370,15 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
Side==OnTheRight ? bs : m_vectors.rows()-start,
Side==OnTheRight ? m_vectors.cols()-start : bs);
typename internal::conditional<Side==OnTheRight, Transpose<SubVectorsType>, SubVectorsType&>::type sub_vecs(sub_vecs1);
- Block<Dest,Dynamic,Dynamic> sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols());
- apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans);
+
+ Index dstStart = dst.rows()-rows()+m_shift+k;
+ Index dstRows = rows()-m_shift-k;
+ Block<Dest,Dynamic,Dynamic> sub_dst(dst,
+ dstStart,
+ inputIsIdentity ? dstStart : 0,
+ dstRows,
+ inputIsIdentity ? dstRows : dst.cols());
+ apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse);
}
}
else
@@ -338,8 +386,9 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
workspace.resize(dst.cols());
for(Index k = 0; k < m_length; ++k)
{
- Index actual_k = m_trans ? k : m_length-k-1;
- dst.bottomRows(rows()-m_shift-actual_k)
+ Index actual_k = m_reverse ? k : m_length-k-1;
+ Index dstStart = rows()-m_shift-actual_k;
+ dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols())
.applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
}
}
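The rebalancing is easy to check with numbers: for m_length = 50 and BlockSize = 48, the old loop processed one block of 48 reflectors followed by a stray block of 2, whereas blockSize = (50+1)/2 = 25 yields two equal blocks. The inputIsIdentity flag complements this by letting each block update touch only the bottom-right corner of dst that is actually affected when dst starts out as the identity.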
@@ -357,7 +406,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
{
typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type
res(other.template cast<typename internal::matrix_type_times_scalar_type<Scalar,OtherDerived>::ResultScalar>());
- applyThisOnTheLeft(res);
+ applyThisOnTheLeft(res, internal::is_identity<OtherDerived>::value && res.rows()==res.cols());
return res;
}
@@ -372,6 +421,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
*
* \sa length()
*/
+ EIGEN_DEVICE_FUNC
HouseholderSequence& setLength(Index length)
{
m_length = length;
@@ -389,13 +439,17 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
*
* \sa shift()
*/
+ EIGEN_DEVICE_FUNC
HouseholderSequence& setShift(Index shift)
{
m_shift = shift;
return *this;
}
+ EIGEN_DEVICE_FUNC
Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */
+
+ EIGEN_DEVICE_FUNC
Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */
/* Necessary for .adjoint() and .conjugate() */
@@ -403,27 +457,30 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
protected:
- /** \brief Sets the transpose flag.
- * \param [in] trans New value of the transpose flag.
+ /** \internal
+ * \brief Sets the reverse flag.
+ * \param [in] reverse New value of the reverse flag.
*
- * By default, the transpose flag is not set. If the transpose flag is set, then this object represents
- * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+ * By default, the reverse flag is not set. If the reverse flag is set, then this object represents
+ * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+ * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$.
*
- * \sa trans()
+ * \sa reverseFlag(), transpose(), adjoint()
*/
- HouseholderSequence& setTrans(bool trans)
+ HouseholderSequence& setReverseFlag(bool reverse)
{
- m_trans = trans;
+ m_reverse = reverse;
return *this;
}
- bool trans() const { return m_trans; } /**< \brief Returns the transpose flag. */
+ bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. */
typename VectorsType::Nested m_vectors;
typename CoeffsType::Nested m_coeffs;
- bool m_trans;
+ bool m_reverse;
Index m_length;
Index m_shift;
+ enum { BlockSize = 48 };
};
/** \brief Computes the product of a matrix with a Householder sequence.
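transpose() and adjoint() now return distinct expression types instead of copies of *this with a flipped flag. This follows from H_k = I - tau_k v_k v_k^*: transposing conjugates the vectors and keeps the coefficients, taking the adjoint keeps the vectors and conjugates the coefficients, and both reverse the application order. A sketch of the resulting distinction for complex scalars (illustrative):

    #include <Eigen/QR>
    void apply_qt_and_qstar(const Eigen::HouseholderQR<Eigen::MatrixXcd>& qr,
                            Eigen::MatrixXcd& M1, Eigen::MatrixXcd& M2)
    {
      M1.applyOnTheLeft(qr.householderQ().transpose()); // Q^T: conjugated vectors
      M2.applyOnTheLeft(qr.householderQ().adjoint());   // Q^*: conjugated coefficients
    }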
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 338e6f10a..43bd8e8f6 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -136,7 +136,7 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageInd
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
@@ -230,7 +230,7 @@ void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
SparseMatrix<Scalar,ColMajor, StorageIndex> mat2 = amat.transpose();
// FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
- // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered...
+ // on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred...
SparseMatrix<Scalar,ColMajor, StorageIndex> AtA = mat2 + mat1;
AMDOrdering<StorageIndex> ordering;
ordering(AtA,m_P);
diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 7c2326eb7..bfeee71cd 100644
--- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -275,7 +275,7 @@ public:
const Preconditioner& preconditioner() const { return m_preconditioner; }
/** \returns the max number of iterations.
- * It is either the value setted by setMaxIterations or, by default,
+ * It is either the value set by setMaxIterations or, by default,
* twice the number of columns of the matrix.
*/
Index maxIterations() const
diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 0ace45177..79e1e4819 100644
--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -108,7 +108,7 @@ struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, interna
}
};
-} // end namepsace internal
+} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index af1228cb8..176ff9d07 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -90,11 +90,12 @@ template<typename Scalar> class JacobiRotation
* \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
*/
template<typename Scalar>
+EIGEN_DEVICE_FUNC
bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z)
{
using std::sqrt;
using std::abs;
- typedef typename NumTraits<Scalar>::Real RealScalar;
+
RealScalar deno = RealScalar(2)*abs(y);
if(deno < (std::numeric_limits<RealScalar>::min)())
{
@@ -134,6 +135,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
*/
template<typename Scalar>
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q)
{
return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
@@ -156,6 +158,7 @@ inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Ind
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
*/
template<typename Scalar>
+EIGEN_DEVICE_FUNC
void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* z)
{
makeGivens(p, q, z, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
@@ -164,6 +167,7 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
// specialization for complexes
template<typename Scalar>
+EIGEN_DEVICE_FUNC
void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type)
{
using std::sqrt;
@@ -223,6 +227,7 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
// specialization for reals
template<typename Scalar>
+EIGEN_DEVICE_FUNC
void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type)
{
using std::sqrt;
@@ -286,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>&
*/
template<typename Derived>
template<typename OtherScalar>
+EIGEN_DEVICE_FUNC
inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j)
{
RowXpr x(this->row(p));
@@ -301,6 +307,7 @@ inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRo
*/
template<typename Derived>
template<typename OtherScalar>
+EIGEN_DEVICE_FUNC
inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j)
{
ColXpr x(this->col(p));
@@ -314,7 +321,8 @@ template<typename Scalar, typename OtherScalar,
int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
struct apply_rotation_in_the_plane_selector
{
- static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
+ static EIGEN_DEVICE_FUNC
+ inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
{
for(Index i=0; i<size; ++i)
{
@@ -441,6 +449,7 @@ struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime
};
template<typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC
void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
{
typedef typename VectorX::Scalar Scalar;
diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h
index a9e8633d9..d2633a935 100644
--- a/Eigen/src/KLUSupport/KLUSupport.h
+++ b/Eigen/src/KLUSupport/KLUSupport.h
@@ -106,7 +106,7 @@ class KLU : public SparseSolverBase<KLU<_MatrixType> >
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h
index d6a3c1e5a..6af63a6e7 100644
--- a/Eigen/src/LU/Determinant.h
+++ b/Eigen/src/LU/Determinant.h
@@ -15,6 +15,7 @@ namespace Eigen {
namespace internal {
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline const typename Derived::Scalar bruteforce_det3_helper
(const MatrixBase<Derived>& matrix, int a, int b, int c)
{
@@ -23,6 +24,7 @@ inline const typename Derived::Scalar bruteforce_det3_helper
}
template<typename Derived>
+EIGEN_DEVICE_FUNC
const typename Derived::Scalar bruteforce_det4_helper
(const MatrixBase<Derived>& matrix, int j, int k, int m, int n)
{
@@ -44,7 +46,8 @@ template<typename Derived,
template<typename Derived> struct determinant_impl<Derived, 1>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return m.coeff(0,0);
}
@@ -52,7 +55,8 @@ template<typename Derived> struct determinant_impl<Derived, 1>
template<typename Derived> struct determinant_impl<Derived, 2>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
}
@@ -60,7 +64,8 @@ template<typename Derived> struct determinant_impl<Derived, 2>
template<typename Derived> struct determinant_impl<Derived, 3>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return bruteforce_det3_helper(m,0,1,2)
- bruteforce_det3_helper(m,1,0,2)
@@ -70,7 +75,8 @@ template<typename Derived> struct determinant_impl<Derived, 3>
template<typename Derived> struct determinant_impl<Derived, 4>
{
- static typename traits<Derived>::Scalar run(const Derived& m)
+ static EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
// trick by Martin Costabel to compute 4x4 det with only 30 muls
return bruteforce_det4_helper(m,0,1,2,3)
@@ -89,6 +95,7 @@ template<typename Derived> struct determinant_impl<Derived, 4>
* \returns the determinant of this matrix
*/
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
{
eigen_assert(rows() == cols());
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index ec61086d5..50d1bb41b 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -48,7 +48,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
* The data of the LU decomposition can be directly accessed through the methods matrixLU(),
* permutationP(), permutationQ().
*
- * As an exemple, here is how the original matrix can be retrieved:
+ * As an example, here is how the original matrix can be retrieved:
* \include class_FullPivLU.cpp
* Output: \verbinclude class_FullPivLU.out
*
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index 018f99b58..1bab00c01 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -290,6 +290,7 @@ template<typename DstXprType, typename XprType>
struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
{
typedef Inverse<XprType> SrcXprType;
+ EIGEN_DEVICE_FUNC
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
{
Index dstRows = src.rows();
@@ -332,6 +333,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
* \sa computeInverseAndDetWithCheck()
*/
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
{
EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
@@ -404,7 +406,7 @@ inline void MatrixBase<Derived>::computeInverseWithCheck(
const RealScalar& absDeterminantThreshold
) const
{
- RealScalar determinant;
+ Scalar determinant;
// i'd love to put some static assertions there, but SFINAE means that they have no effect...
eigen_assert(rows() == cols());
computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold);
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index d43961887..bfcd2c95b 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -420,8 +420,8 @@ struct partial_lu_impl
* \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
*
* \note This very low level interface using pointers, etc. is to:
- * 1 - reduce the number of instanciations to the strict minimum
- * 2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > >
+ * 1 - reduce the number of instantiations to the strict minimum
+ * 2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > >
*/
static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
{
diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index da85b4d6e..67fcad3f7 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -1493,7 +1493,7 @@ static inline void order_children
c = Col [c].shared1.parent ;
/* continue until we hit an ordered column. There are */
- /* guarranteed not to be anymore unordered columns */
+ /* guaranteed not to be any more unordered columns */
/* above an ordered column */
} while (Col [c].shared2.order == COLAMD_EMPTY) ;
@@ -1638,7 +1638,7 @@ static void detect_super_cols
COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ;
COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ;
/* row indices will be in the same order for both supercols, */
- /* no gather scatter nessasary */
+ /* no gather scatter necessary */
if (*cp1++ != *cp2++)
{
break ;
@@ -1688,7 +1688,7 @@ static void detect_super_cols
/*
Defragments and compacts columns and rows in the workspace A. Used when
- all avaliable memory has been used while performing row merging. Returns
+ all available memory has been used while performing row merging. Returns
the index of the first free position in A, after garbage collection. The
time taken by this routine is linear in the size of the array A, which is
itself linear in the number of nonzeros in the input matrix.
diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h
index 160d8a523..37426877a 100644
--- a/Eigen/src/PaStiXSupport/PaStiXSupport.h
+++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h
@@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase<Derived>
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if PaStiX reports a problem
* \c InvalidInput if the input matrix is invalid
*
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index 091c3970e..fb2ba04b4 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -140,7 +140,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 5270eaca2..1faa3442e 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -402,7 +402,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
*/
RealScalar maxPivot() const { return m_maxpivot; }
- /** \brief Reports whether the QR factorization was succesful.
+ /** \brief Reports whether the QR factorization was successful.
*
* \note This function always returns \c Success. It is provided for compatibility
* with other factorization routines.
@@ -595,11 +595,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
typename RhsType::PlainObject c(rhs);
- // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
- c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs)
- .setLength(nonzero_pivots)
- .transpose()
- );
+ c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() );
m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
.template triangularView<Upper>()
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 13b61fcdb..03017a375 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -353,7 +353,7 @@ class CompleteOrthogonalDecomposition {
inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); }
/** \brief Reports whether the complete orthogonal decomposition was
- * succesful.
+ * successful.
*
* \note This function always returns \c Success. It is provided for
* compatibility
@@ -452,7 +452,7 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
// Apply Z(k) to the first k rows of X_k
m_cpqr.m_qr.topRightCorner(k, cols - rank + 1)
.applyHouseholderOnTheRight(
- m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k),
+ m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k),
&m_temp(0));
}
if (k != rank - 1) {
@@ -500,11 +500,8 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
}
// Compute c = Q^* * rhs
- // Note that the matrix Q = H_0^* H_1^*... so its inverse is
- // Q^* = (H_0 H_1 ...)^T
typename RhsType::PlainObject c(rhs);
- c.applyOnTheLeft(
- householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose());
+ c.applyOnTheLeft(matrixQ().setLength(rank).adjoint());
// Solve T z = c(1:rank, :)
dst.topRows(rank) = matrixT()
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 762b21c36..33cb9c8ff 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -291,7 +291,7 @@ template<typename MatrixQR, typename HCoeffs,
bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
struct householder_qr_inplace_blocked
{
- // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
+ // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
typename MatrixQR::Scalar* tempData = 0)
{
@@ -353,11 +353,7 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
typename RhsType::PlainObject c(rhs);
- // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
- c.applyOnTheLeft(householderSequence(
- m_qr.leftCols(rank),
- m_hCoeffs.head(rank)).transpose()
- );
+ c.applyOnTheLeft(householderQ().setLength(rank).adjoint() );
m_qr.topLeftCorner(rank, rank)
.template triangularView<Upper>()
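The same simplification appears in ColPivHouseholderQR and CompleteOrthogonalDecomposition above: since Q is available as householderQ(), its inverse application is just adjoint(), with no need to rebuild a sequence by hand, and using the adjoint rather than the transpose is what makes the complex case correct. Spelled out with public API only (a sketch assuming a square, full-rank A):

    #include <Eigen/QR>
    Eigen::VectorXcd qr_solve_sketch(const Eigen::MatrixXcd& A, const Eigen::VectorXcd& b)
    {
      Eigen::HouseholderQR<Eigen::MatrixXcd> qr(A);
      Eigen::VectorXcd c = b;
      c.applyOnTheLeft(qr.householderQ().adjoint());      // c = Q^* b
      return qr.matrixQR().topLeftCorner(A.cols(), A.cols())
               .triangularView<Eigen::Upper>().solve(c);  // solve R x = c
    }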
diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index 953d57c9d..1a5c5254e 100644
--- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -220,7 +220,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the sparse QR cannot be computed
*/
ComputationInfo info() const
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 06865a331..11df14918 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -62,7 +62,7 @@ struct traits<BDCSVD<_MatrixType> >
* recommended and can be several orders of magnitude faster.
*
* \warning this algorithm is unlikely to provide accurate results when compiled with unsafe math optimizations.
- * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless
+ * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless
* you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
* significantly degrade the accuracy.
*
@@ -1299,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
#endif
}//end deflation
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_GPUCC)
/** \svd_module
*
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index 11ac847e1..997defc47 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h
@@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat,
.makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]);
// apply householder transform to remaining part of mat on the left
mat.bottomRightCorner(remainingRows-1, remainingCols)
- .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData);
+ .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData);
}
}
@@ -202,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
{
SubColumnType y_k( Y.col(k).tail(remainingCols) );
- // let's use the begining of column k of Y as a temporary vector
+ // let's use the beginning of column k of Y as a temporary vector
SubColumnType tmp( Y.col(k).head(k) );
y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck
tmp.noalias() = V_k1.adjoint() * v_k;
@@ -231,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
{
SubColumnType x_k ( X.col(k).tail(remainingRows-1) );
- // let's use the begining of column k of X as a temporary vectors
+ // let's use the beginning of column k of X as temporary vectors
// note that tmp0 and tmp1 overlap
SubColumnType tmp0 ( X.col(k).head(k) ),
tmp1 ( X.col(k).head(k+1) );
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h
index 2907f6529..b9ca94bc3 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -101,7 +101,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 84a1bf2bd..0aa92f8bc 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -5,7 +5,7 @@
/*
-NOTE: thes functions vave been adapted from the LDL library:
+NOTE: these functions have been adapted from the LDL library:
LDL Copyright (c) 2005 by Timothy A. Davis. All Rights Reserved.
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 511e92b2f..5ed7c437b 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -326,46 +326,6 @@ private:
//----------
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
- * is col-major (resp. row-major).
- */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer)
-{ return InnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
- * is col-major (resp. row-major). Read-only.
- */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer) const
-{ return ConstInnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
- * is col-major (resp. row-major).
- */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorsReturnType
-SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
-{
- return Block<Derived,Dynamic,Dynamic,true>(derived(),
- IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
- IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
- * is col-major (resp. row-major). Read-only.
- */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorsReturnType
-SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
-{
- return Block<const Derived,Dynamic,Dynamic,true>(derived(),
- IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
- IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
/** Generic implementation of sparse Block expression.
* Real-only.
*/
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 0547db596..f005a18a1 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -88,10 +88,11 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, A
typedef typename internal::remove_all<SparseLhsType>::type Lhs;
typedef typename internal::remove_all<DenseRhsType>::type Rhs;
typedef typename internal::remove_all<DenseResType>::type Res;
- typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+ typedef evaluator<Lhs> LhsEval;
+ typedef typename LhsEval::InnerIterator LhsInnerIterator;
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
{
- evaluator<Lhs> lhsEval(lhs);
+ LhsEval lhsEval(lhs);
for(Index c=0; c<rhs.cols(); ++c)
{
for(Index j=0; j<lhs.outerSize(); ++j)
@@ -111,17 +112,38 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
typedef typename internal::remove_all<SparseLhsType>::type Lhs;
typedef typename internal::remove_all<DenseRhsType>::type Rhs;
typedef typename internal::remove_all<DenseResType>::type Res;
- typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+ typedef evaluator<Lhs> LhsEval;
+ typedef typename LhsEval::InnerIterator LhsInnerIterator;
static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
{
- evaluator<Lhs> lhsEval(lhs);
- for(Index j=0; j<lhs.outerSize(); ++j)
+ Index n = lhs.rows();
+ LhsEval lhsEval(lhs);
+
+#ifdef EIGEN_HAS_OPENMP
+ Eigen::initParallel();
+ Index threads = Eigen::nbThreads();
+ // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+ // It basically represents the minimal amount of work needed for parallelization to be worth it.
+ if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
{
- typename Res::RowXpr res_j(res.row(j));
- for(LhsInnerIterator it(lhsEval,j); it ;++it)
- res_j += (alpha*it.value()) * rhs.row(it.index());
+ #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
+ for(Index i=0; i<n; ++i)
+ processRow(lhsEval,rhs,res,alpha,i);
+ }
+ else
+#endif
+ {
+ for(Index i=0; i<n; ++i)
+ processRow(lhsEval, rhs, res, alpha, i);
}
}
+
+ static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha, Index i)
+ {
+ typename Res::RowXpr res_i(res.row(i));
+ for(LhsInnerIterator it(lhsEval,i); it ;++it)
+ res_i += (alpha*it.value()) * rhs.row(it.index());
+ }
};
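The row-major sparse-times-dense kernel now distributes whole result rows across OpenMP threads once the estimated work, nonZerosEstimate() times the number of right-hand-side columns, clears the 20000 threshold. A minimal sketch of code reaching the parallel path (requires compiling with OpenMP enabled):

    #include <Eigen/Sparse>
    #include <Eigen/Dense>
    int main()
    {
      Eigen::setNbThreads(4);                              // optional thread cap
      Eigen::SparseMatrix<double, Eigen::RowMajor> A(1000, 1000);
      A.setIdentity();                                     // 1000 stored nonzeros
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(1000, 64);
      Eigen::MatrixXd C = A * B;  // estimated work 1000*64 > 20000: parallel loop
      return C.isApprox(B) ? 0 : 1;
    }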
template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 323c2323b..8f77194b6 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -21,7 +21,7 @@ namespace Eigen {
* This class implements a more versatile variant of the common \em compressed row/column storage format.
* Each column's (resp. row) non zeros are stored as a pair of value with associated row (resp. column) index.
* All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra
- * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
+ * space in between the nonzeros of two successive columns (resp. rows) such that insertion of a new non-zero
* can be done with limited memory reallocation and copies.
*
* A call to the function makeCompressed() turns the matrix into the standard \em compressed format
@@ -503,7 +503,7 @@ class SparseMatrix
}
}
- /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */
+ /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
{
prune(default_prunning_func(reference,epsilon));
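A usage sketch of the overload documented above; reference sets the scale against which "much smaller" is measured:

    #include <Eigen/Sparse>
    void drop_tiny(Eigen::SparseMatrix<double>& A)
    {
      // Remove stored entries much smaller than 1.0 under tolerance 1e-12,
      // i.e. roughly those with |a_ij| <= 1e-12.
      A.prune(1.0, 1e-12);
    }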
@@ -986,7 +986,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
*
* \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define
* an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
- * be explicitely stored into a std::vector for instance.
+ * be explicitly stored into a std::vector for instance.
*/
template<typename Scalar, int _Options, typename _StorageIndex>
template<typename InputIterators>
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index c6b548f11..229449f02 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -87,6 +87,11 @@ template<typename Derived> class SparseMatrixBase
* we are dealing with a column-vector (if there is only one column) or with
* a row-vector (if there is only one row). */
+ NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
+ /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+ * and 2 for matrices.
+ */
+
Flags = internal::traits<Derived>::Flags,
/**< This stores expression \ref flags flags which may or may not be inherited by new expressions
* constructed from this one. See the \ref flags "list of flags".
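With the new enum in place, sparse expressions answer the same compile-time rank query as dense matrices and Tensor. A sketch of the intent:

    #include <Eigen/Sparse>
    static_assert(Eigen::SparseMatrix<double>::NumDimensions == 2,
                  "a sparse matrix is 2-dimensional");
    static_assert(Eigen::SparseVector<double>::NumDimensions == 1,
                  "a sparse vector is 1-dimensional");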
@@ -350,18 +355,6 @@ template<typename Derived> class SparseMatrixBase
const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); }
- // inner-vector
- typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> InnerVectorReturnType;
- typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
- InnerVectorReturnType innerVector(Index outer);
- const ConstInnerVectorReturnType innerVector(Index outer) const;
-
- // set of inner-vectors
- typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
- typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
- InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize);
- const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const;
-
DenseMatrixType toDense() const
{
return DenseMatrixType(derived());
diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h
index 4cbf68781..c495a7398 100644
--- a/Eigen/src/SparseCore/SparseProduct.h
+++ b/Eigen/src/SparseCore/SparseProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
* The automatic pruning of the small values can be achieved by calling the pruned() function
* in which case a totally different product algorithm is employed:
* \code
- * C = (A*B).pruned(); // supress numerical zeros (exact)
+ * C = (A*B).pruned(); // suppress numerical zeros (exact)
* C = (A*B).pruned(ref);
* C = (A*B).pruned(ref,epsilon);
* \endcode
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 5ab64f1a8..65611b3d4 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -311,7 +311,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons
while (i && i.index()<j) ++i;
if(i && i.index()==j)
{
- res(j,k) += alpha * i.value() * rhs(j,k);
+ res.coeffRef(j,k) += alpha * i.value() * rhs.coeff(j,k);
++i;
}
}
@@ -324,14 +324,14 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons
{
LhsScalar lhs_ij = i.value();
if(!LhsIsRowMajor) lhs_ij = numext::conj(lhs_ij);
- res_j += lhs_ij * rhs(i.index(),k);
+ res_j += lhs_ij * rhs.coeff(i.index(),k);
res(i.index(),k) += numext::conj(lhs_ij) * rhs_j;
}
- res(j,k) += alpha * res_j;
+ res.coeffRef(j,k) += alpha * res_j;
// handle diagonal coeff
if (ProcessFirstHalf && i && (i.index()==j))
- res(j,k) += alpha * i.value() * rhs(j,k);
+ res.coeffRef(j,k) += alpha * i.value() * rhs.coeff(j,k);
}
}
}
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index 19b0fbc9d..05779be68 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -281,7 +281,7 @@ class SparseVector
}
/** Swaps the values of \c *this and \a other.
- * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only.
+ * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only.
* \sa SparseMatrixBase::swap()
*/
inline void swap(SparseVector& other)
diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h
index f883ab383..91b6369ac 100644
--- a/Eigen/src/SparseLU/SparseLU.h
+++ b/Eigen/src/SparseLU/SparseLU.h
@@ -193,7 +193,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
* \c InvalidInput if the input matrix is invalid
*
@@ -499,11 +499,8 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
eigen_assert((matrix.rows() == matrix.cols()) && "Only for square matrices");
- typedef typename IndexVector::Scalar StorageIndex;
-
m_isInitialized = true;
-
// Apply the column permutation computed in analyzepattern()
// m_mat = matrix * m_perm_c.inverse();
m_mat = matrix;
@@ -706,8 +703,8 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator
typedef typename MappedSupernodalType::Scalar Scalar;
explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
{ }
- Index rows() { return m_mapL.rows(); }
- Index cols() { return m_mapL.cols(); }
+ Index rows() const { return m_mapL.rows(); }
+ Index cols() const { return m_mapL.cols(); }
template<typename Dest>
void solveInPlace( MatrixBase<Dest> &X) const
{
@@ -723,8 +720,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
: m_mapL(mapL),m_mapU(mapU)
{ }
- Index rows() { return m_mapL.rows(); }
- Index cols() { return m_mapL.cols(); }
+ Index rows() const { return m_mapL.rows(); }
+ Index cols() const { return m_mapL.cols(); }
template<typename Dest> void solveInPlace(MatrixBase<Dest> &X) const
{
@@ -747,8 +744,9 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
}
else
{
+ // FIXME: the following lines should use Block expressions and not Map!
Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
- Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+ Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
U = A.template triangularView<Upper>().solve(U);
}
diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h
index 4dc42e87b..349bfd585 100644
--- a/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w)
/**
- * Expand the existing storage to accomodate more fill-ins
+ * Expand the existing storage to accommodate more fill-ins
* \param vec Valid pointer to the vector to allocate or expand
* \param[in,out] length At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector
* \param[in] nbElts Current number of elements in the factors
diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index 721e1883b..8583b1b69 100644
--- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -75,12 +75,12 @@ class MappedSuperNodalMatrix
/**
* Number of rows
*/
- Index rows() { return m_row; }
+ Index rows() const { return m_row; }
/**
* Number of columns
*/
- Index cols() { return m_col; }
+ Index cols() const { return m_col; }
/**
* Return the array of nonzero values packed by column
diff --git a/Eigen/src/SparseLU/SparseLU_column_dfs.h b/Eigen/src/SparseLU/SparseLU_column_dfs.h
index c98b30e32..5a2c941b4 100644
--- a/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -151,7 +151,7 @@ Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index j
StorageIndex ito = glu.xlsub(fsupc+1);
glu.xlsub(jcolm1) = ito;
StorageIndex istop = ito + jptr - jm1ptr;
- xprune(jcolm1) = istop; // intialize xprune(jcol-1)
+ xprune(jcolm1) = istop; // initialize xprune(jcol-1)
glu.xlsub(jcol) = istop;
for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
@@ -166,7 +166,7 @@ Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index j
// Tidy up the pointers before exit
glu.xsup(nsuper+1) = jcolp1;
glu.supno(jcolp1) = nsuper;
- xprune(jcol) = StorageIndex(nextl); // Intialize upper bound for pruning
+ xprune(jcol) = StorageIndex(nextl); // Initialize upper bound for pruning
glu.xlsub(jcolp1) = StorageIndex(nextl);
return 0;
diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index 95ba7413f..e37c2fe0d 100644
--- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\
pstore(C0+i+(I)*PacketSize, c0);
- // agressive vectorization and peeling
+ // aggressive vectorization and peeling
for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
{
EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index 822cf32c3..f052001c8 100644
--- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -38,7 +38,7 @@ namespace internal {
* \brief Performs numeric block updates (sup-panel) in topological order.
*
* Before entering this routine, the original nonzeros in the panel
- * were already copied i nto the spa[m,w]
+ * were already copied into the spa[m,w]
*
* \param m number of rows in the matrix
* \param w Panel size
diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index f7111fe2e..1a28389e8 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -52,7 +52,7 @@ namespace internal {
* rank-revealing permutations. Use colsPermutation() to get it.
*
* Q is the orthogonal matrix represented as products of Householder reflectors.
- * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+ * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint.
* You can then apply it to a vector.
*
* R is the sparse triangular or trapezoidal matrix. The latter occurs when A is rank-deficient.
@@ -65,6 +65,7 @@ namespace internal {
* \implsparsesolverconcept
*
* \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
+ * \warning For complex matrices, matrixQ().transpose() will actually return the adjoint matrix.
*
*/
template<typename _MatrixType, typename _OrderingType>
@@ -196,9 +197,9 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
Index rank = this->rank();
- // Compute Q^T * b;
+ // Compute Q^* * b;
typename Dest::PlainObject y, b;
- y = this->matrixQ().transpose() * B;
+ y = this->matrixQ().adjoint() * B;
b = y;
// Solve with the triangular matrix R
@@ -327,7 +328,7 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
m_isEtreeOk = true;
- m_R.resize(diagSize, n);
+ m_R.resize(m, n);
m_Q.resize(m, diagSize);
// Allocate space for nonzero elements : rough estimation
@@ -604,7 +605,7 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
// Get the references
SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) :
m_qr(qr),m_other(other),m_transpose(transpose) {}
- inline Index rows() const { return m_transpose ? m_qr.rows() : m_qr.cols(); }
+ inline Index rows() const { return m_qr.matrixQ().rows(); }
inline Index cols() const { return m_other.cols(); }
// Assign to a vector
@@ -632,16 +633,20 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
}
else
{
- eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
+ eigen_assert(m_qr.matrixQ().cols() == m_other.rows() && "Non conforming object sizes");
+
+ res.conservativeResize(rows(), cols());
+
// Compute res = Q * other column by column
for(Index j = 0; j < res.cols(); j++)
{
- for (Index k = diagSize-1; k >=0; k--)
+ Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1;
+ for (Index k = start_k; k >=0; k--)
{
Scalar tau = Scalar(0);
tau = m_qr.m_Q.col(k).dot(res.col(j));
if(tau==Scalar(0)) continue;
- tau = tau * m_qr.m_hcoeffs(k);
+ tau = tau * numext::conj(m_qr.m_hcoeffs(k));
res.col(j) -= tau * m_qr.m_Q.col(k);
}
}
@@ -650,7 +655,7 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
const SparseQRType& m_qr;
const Derived& m_other;
- bool m_transpose;
+ bool m_transpose; // TODO this actually means adjoint
};
template<typename SparseQRType>
@@ -668,13 +673,14 @@ struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<Sp
{
return SparseQR_QProduct<SparseQRType,Derived>(m_qr,other.derived(),false);
}
+ // To use for operations with the adjoint of Q
SparseQRMatrixQTransposeReturnType<SparseQRType> adjoint() const
{
return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
}
inline Index rows() const { return m_qr.rows(); }
- inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); }
- // To use for operations with the transpose of Q
+ inline Index cols() const { return m_qr.rows(); }
+ // To use for operations with the transpose of Q. FIXME: this is the same as adjoint at the moment
SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const
{
return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
@@ -682,6 +688,7 @@ struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<Sp
const SparseQRType& m_qr;
};
+// TODO this actually represents the adjoint of Q
template<typename SparseQRType>
struct SparseQRMatrixQTransposeReturnType
{
@@ -712,7 +719,7 @@ struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal:
typedef typename DstXprType::StorageIndex StorageIndex;
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &/*func*/)
{
- typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows());
+ typename DstXprType::PlainObject idMat(src.rows(), src.cols());
idMat.setIdentity();
// Sort the sparse householder reflectors if needed
const_cast<SparseQRType *>(&src.m_qr)->_sort_matrix_Q();
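Taken together, these changes make Q act as a full m-by-m operator and apply the conjugated Householder coefficients, so Q^* is computed correctly for complex scalars. A least-squares sketch, assuming A has been compressed with makeCompressed():
\code
#include <Eigen/SparseQR>

typedef Eigen::SparseMatrix<std::complex<double> > SpMat;
SpMat A;                                             // assumed compressed
Eigen::VectorXcd b;
Eigen::SparseQR<SpMat, Eigen::COLAMDOrdering<int> > qr(A);
Eigen::VectorXcd y = qr.matrixQ().adjoint() * b;     // Q^* b, not a plain transpose
\endcode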
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 50a69f306..4bb95eb8b 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -352,7 +352,7 @@ class SuperLUBase : public SparseSolverBase<Derived>
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 9568cc1d5..ba10a9318 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -10,6 +10,16 @@
#ifndef EIGEN_UMFPACKSUPPORT_H
#define EIGEN_UMFPACKSUPPORT_H
+// For compatibility with very old versions of UMFPACK:
+// probably not needed anymore, but it is harmless.
+#ifndef SuiteSparse_long
+#ifdef UF_long
+#define SuiteSparse_long UF_long
+#else
+#error neither SuiteSparse_long nor UF_long is defined
+#endif
+#endif
+
namespace Eigen {
/* TODO extract L, extract U, compute det, etc... */
@@ -17,42 +27,85 @@ namespace Eigen {
// generic double/complex<double> wrapper functions:
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], double)
+ // Defaults
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int)
{ umfpack_di_defaults(control); }
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>)
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, int)
{ umfpack_zi_defaults(control); }
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double)
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
+{ umfpack_dl_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_defaults(control); }
+
+// Report info
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int)
{ umfpack_di_report_info(control, info);}
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>)
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, int)
{ umfpack_zi_report_info(control, info);}
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double)
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long)
+{ umfpack_dl_report_info(control, info);}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_info(control, info);}
+
+// Report status
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int)
{ umfpack_di_report_status(control, status);}
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>)
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, int)
{ umfpack_zi_report_status(control, status);}
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], double)
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long)
+{ umfpack_dl_report_status(control, status);}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_status(control, status);}
+
+// Report control
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int)
{ umfpack_di_report_control(control);}
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>)
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, int)
{ umfpack_zi_report_control(control);}
-inline void umfpack_free_numeric(void **Numeric, double)
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
+{ umfpack_dl_report_control(control);}
+
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_report_control(control);}
+
+// Free numeric
+inline void umfpack_free_numeric(void **Numeric, double, int)
{ umfpack_di_free_numeric(Numeric); *Numeric = 0; }
-inline void umfpack_free_numeric(void **Numeric, std::complex<double>)
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, int)
{ umfpack_zi_free_numeric(Numeric); *Numeric = 0; }
-inline void umfpack_free_symbolic(void **Symbolic, double)
+inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long)
+{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; }
+
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; }
+
+// Free symbolic
+inline void umfpack_free_symbolic(void **Symbolic, double, int)
{ umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; }
-inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>)
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, int)
{ umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; }
+inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long)
+{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; }
+
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, SuiteSparse_long)
+{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; }
+
+// Symbolic
inline int umfpack_symbolic(int n_row,int n_col,
const int Ap[], const int Ai[], const double Ax[], void **Symbolic,
const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
@@ -66,7 +119,21 @@ inline int umfpack_symbolic(int n_row,int n_col,
{
return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
}
+inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
+ const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic,
+ const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+ return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info);
+}
+inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
+ const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[], void **Symbolic,
+ const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
+{
+ return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
+}
+
+// Numeric
inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[],
void *Symbolic, void **Numeric,
const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
@@ -80,7 +147,21 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex<d
{
return umfpack_zi_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
}
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+ void *Symbolic, void **Numeric,
+ const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+ return umfpack_dl_numeric(Ap,Ai,Ax,Symbolic,Numeric,Control,Info);
+}
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
+ void *Symbolic, void **Numeric,
+ const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
+{
+ return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
+}
+
+// Solve
inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[],
double X[], const double B[], void *Numeric,
const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
@@ -95,6 +176,21 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co
return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
}
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+ double X[], const double B[], void *Numeric,
+ const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+ return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info);
+}
+
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
+ std::complex<double> X[], const std::complex<double> B[], void *Numeric,
+ const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
+{
+ return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
+}
+
+// Get Lunz
inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double)
{
return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
@@ -105,6 +201,19 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_
return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
}
+inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
+ SuiteSparse_long *nz_udiag, void *Numeric, double)
+{
+ return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
+ SuiteSparse_long *nz_udiag, void *Numeric, std::complex<double>)
+{
+ return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+}
+
+// Get Numeric
inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[],
int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric)
{
@@ -120,18 +229,45 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex<double> Lx[], in
return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
}
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[],
+ SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
+{
+ return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric);
+}
-inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex<double> Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex<double> Ux[],
+ SuiteSparse_long P[], SuiteSparse_long Q[], std::complex<double> Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
+{
+ double& lx0_real = numext::real_ref(Lx[0]);
+ double& ux0_real = numext::real_ref(Ux[0]);
+ double& dx0_real = numext::real_ref(Dx[0]);
+ return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
+ Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
+}
+
+// Get Determinant
+inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
{
return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info);
}
-inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
+inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
{
double& mx_real = numext::real_ref(*Mx);
return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
}
+inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
+{
+ return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info);
+}
+
+inline SuiteSparse_long umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
+{
+ double& mx_real = numext::real_ref(*Mx);
+ return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
+}
+
/** \ingroup UmfPackSupport_Module
* \brief A sparse LU factorization and solver based on UmfPack
@@ -164,7 +300,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
typedef SparseMatrix<Scalar> LUMatrixType;
- typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
+ typedef SparseMatrix<Scalar,ColMajor,StorageIndex> UmfpackMatrixType;
typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
enum {
ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -192,8 +328,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
~UmfPackLU()
{
- if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
- if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
+ if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex());
+ if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex());
}
inline Index rows() const { return mp_matrix.rows(); }
@@ -201,7 +337,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful,
+ * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
*/
ComputationInfo info() const
@@ -241,8 +377,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
template<typename InputMatrixType>
void compute(const InputMatrixType& matrix)
{
- if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
- if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
+ if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
+ if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
grab(matrix.derived());
analyzePattern_impl();
factorize_impl();
@@ -257,8 +393,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
template<typename InputMatrixType>
void analyzePattern(const InputMatrixType& matrix)
{
- if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
- if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
+ if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
+ if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
grab(matrix.derived());
@@ -309,7 +445,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
{
eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
if(m_numeric)
- umfpack_free_numeric(&m_numeric,Scalar());
+ umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
grab(matrix.derived());
@@ -322,7 +458,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
*/
void printUmfpackControl()
{
- umfpack_report_control(m_control.data(), Scalar());
+ umfpack_report_control(m_control.data(), Scalar(),StorageIndex());
}
/** Prints statistics collected by UmfPack.
@@ -332,7 +468,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
void printUmfpackInfo()
{
eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
- umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar());
+ umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex());
}
/** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization).
@@ -341,7 +477,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
*/
void printUmfpackStatus() {
eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
- umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar());
+ umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex());
}
/** \internal */
@@ -362,13 +498,13 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
m_symbolic = 0;
m_extractedDataAreDirty = true;
- umfpack_defaults(m_control.data(), Scalar());
+ umfpack_defaults(m_control.data(), Scalar(),StorageIndex());
}
void analyzePattern_impl()
{
- m_fact_errorCode = umfpack_symbolic(internal::convert_index<int>(mp_matrix.rows()),
- internal::convert_index<int>(mp_matrix.cols()),
+ m_fact_errorCode = umfpack_symbolic(internal::convert_index<StorageIndex>(mp_matrix.rows()),
+ internal::convert_index<StorageIndex>(mp_matrix.cols()),
mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
&m_symbolic, m_control.data(), m_umfpackInfo.data());
@@ -408,7 +544,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
// cached data to reduce reallocation, etc.
mutable LUMatrixType m_l;
- int m_fact_errorCode;
+ StorageIndex m_fact_errorCode;
UmfpackControl m_control;
mutable UmfpackInfo m_umfpackInfo;
@@ -438,7 +574,7 @@ void UmfPackLU<MatrixType>::extractData() const
if (m_extractedDataAreDirty)
{
// get size of the data
- int lnz, unz, rows, cols, nz_udiag;
+ StorageIndex lnz, unz, rows, cols, nz_udiag;
umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
// allocate data
@@ -464,7 +600,7 @@ template<typename MatrixType>
typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() const
{
Scalar det;
- umfpack_get_determinant(&det, 0, m_numeric, 0);
+ umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex());
return det;
}
@@ -477,7 +613,7 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
- int errorCode;
+ StorageIndex errorCode;
Scalar* x_ptr = 0;
Matrix<Scalar,Dynamic,1> x_tmp;
if(x.innerStride()!=1)
@@ -489,9 +625,10 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
{
if(x.innerStride()==1)
x_ptr = &x.col(j).coeffRef(0);
- errorCode = umfpack_solve(UMFPACK_A,
- mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
- x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data());
+ errorCode = umfpack_solve(UMFPACK_A,
+ mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+ x_ptr, &b.const_cast_derived().col(j).coeffRef(0),
+ m_numeric, m_control.data(), m_umfpackInfo.data());
if(x.innerStride()!=1)
x.col(j) = x_tmp;
if (errorCode!=0)
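With the matrix StorageIndex forwarded throughout, the wrappers above dispatch to UMFPACK's *_dl_* / *_zl_* routines for long-index matrices. A sketch, under the assumption that SuiteSparse_long coincides with long on the target platform:
\code
#include <Eigen/UmfPackSupport>

typedef Eigen::SparseMatrix<double, Eigen::ColMajor, long> SpMatL;
SpMatL A;                                  // assumed square, filled and compressed
Eigen::VectorXd b;
Eigen::UmfPackLU<SpMatL> lu(A);            // selects the umfpack_dl_* overloads
Eigen::VectorXd x = lu.solve(b);
\endcode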
diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h
index 5caf14469..67fdebc6f 100644
--- a/Eigen/src/plugins/BlockMethods.h
+++ b/Eigen/src/plugins/BlockMethods.h
@@ -40,6 +40,14 @@ typedef const VectorBlock<const Derived> ConstSegmentReturnType;
template<int Size> struct FixedSegmentReturnType { typedef VectorBlock<Derived, Size> Type; };
template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBlock<const Derived, Size> Type; };
+/// \internal inner-vector
+typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> InnerVectorReturnType;
+typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
+
+/// \internal set of inner-vectors
+typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
+typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
+
#endif // not EIGEN_PARSED_BY_DOXYGEN
/// \returns an expression of a block in \c *this with either dynamic or fixed sizes.
@@ -1036,7 +1044,7 @@ inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow
/// \a NRows is \a Dynamic, and the same for the number of columns.
///
/// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
///
/// \note The usage of this overload is discouraged from %Eigen 3.4; better use the generic
/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence:
@@ -1053,6 +1061,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
/// \sa block(Index,Index,NRowsType,NColsType), class Block
///
template<int NRows, int NCols>
+EIGEN_DEVICE_FUNC
inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
Index blockRows, Index blockCols)
{
@@ -1354,3 +1363,39 @@ inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n);
}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+InnerVectorReturnType innerVector(Index outer)
+{ return InnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+const ConstInnerVectorReturnType innerVector(Index outer) const
+{ return ConstInnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+InnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize)
+{
+ return Block<Derived,Dynamic,Dynamic,true>(derived(),
+ IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+ IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+const ConstInnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize) const
+{
+ return Block<const Derived,Dynamic,Dynamic,true>(derived(),
+ IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+ IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
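These helpers mirror what sparse expressions already offer: for a column-major object an inner vector is a column, for a row-major one it is a row. A short usage sketch:
\code
#include <Eigen/Dense>

Eigen::MatrixXd M = Eigen::MatrixXd::Random(4, 6);   // column-major by default
Eigen::VectorXd v  = M.innerVector(2);               // column 2
Eigen::MatrixXd vs = M.innerVectors(1, 3);           // columns 1, 2 and 3
\endcode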
diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h
index a7ec63adf..d435aa597 100644
--- a/Eigen/src/plugins/IndexedViewMethods.h
+++ b/Eigen/src/plugins/IndexedViewMethods.h
@@ -53,11 +53,6 @@ ivcSize(const Indices& indices) const {
return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,SizeAtCompileTime>(derived().size()),Specialized);
}
-template<typename RowIndices, typename ColIndices>
-struct valid_indexed_view_overload {
- enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
-};
-
public:
#endif
@@ -72,7 +67,7 @@ struct EIGEN_INDEXED_VIEW_METHOD_TYPE {
// This is the generic version
template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<valid_indexed_view_overload<RowIndices,ColIndices>::value
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
&& internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsIndexedView,
typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type >::type
operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
@@ -84,7 +79,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND
// The following overload returns a Block<> object
template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<valid_indexed_view_overload<RowIndices,ColIndices>::value
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
&& internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsBlock,
typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType>::type
operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
@@ -102,7 +97,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND
// The following overload returns a Scalar
template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<valid_indexed_view_overload<RowIndices,ColIndices>::value
+typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
&& internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsScalar,
CoeffReturnType >::type
operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
@@ -112,7 +107,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND
#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
-// The folowing three overloads are needed to handle raw Index[N] arrays.
+// The following three overloads are needed to handle raw Index[N] arrays.
template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndices>
IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
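These overloads implement the generalized operator() taking row and column index collections (sequences, containers, or raw Index arrays). A sketch of the call sites they enable:
\code
#include <Eigen/Dense>
#include <vector>

using Eigen::seq; using Eigen::last;
Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 6);
Eigen::MatrixXd B = A(seq(1, 3), seq(0, last, 2));   // arithmetic sequences
std::vector<int> ri = {0, 2, 4};
Eigen::MatrixXd C = A(ri, seq(0, 2));                // indices from a container
\endcode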
diff --git a/README.md b/README.md
index 4654a81c3..99c9e2933 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
For more information go to http://eigen.tuxfamily.org/.
+
+For ***pull requests***, please only use the official repository at https://bitbucket.org/eigen/eigen.
+
+For ***bug reports*** and ***feature requests*** go to http://eigen.tuxfamily.org/bz.
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp
index d563a1d2d..6bc4aca3d 100644
--- a/bench/analyze-blocking-sizes.cpp
+++ b/bench/analyze-blocking-sizes.cpp
@@ -825,7 +825,7 @@ int main(int argc, char* argv[])
}
for (int i = 1; i < argc; i++) {
bool arg_handled = false;
- // Step 1. Try to match action invokation names.
+ // Step 1. Try to match action invocation names.
for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
if (!strcmp(argv[i], (*it)->invokation_name())) {
if (!action) {
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index 8528c5587..688d99c4a 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -129,7 +129,7 @@ void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci)
template<typename A, typename B, typename C>
EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c)
{
- c.noalias() += a * b;
+ c.noalias() += a * b;
}
int main(int argc, char ** argv)
diff --git a/bench/btl/README b/bench/btl/README
index f3f5fb36f..ebed88960 100644
--- a/bench/btl/README
+++ b/bench/btl/README
@@ -36,7 +36,7 @@ For instance:
You can also select a given set of actions by defining the environment variable BTL_CONFIG this way:
BTL_CONFIG="-a action1{:action2}*" ctest -V
-An exemple:
+An example:
BTL_CONFIG="-a axpy:vector_matrix:trisolve:ata" ctest -V -R eigen2
Finally, if bench results already exist (the bench*.dat files), they are merged by keeping the best result for each matrix size. If you want to overwrite the previous ones, simply add the "--overwrite" option:
diff --git a/bench/btl/generic_bench/bench.hh b/bench/btl/generic_bench/bench.hh
index 7b7b951b5..0732940d5 100644
--- a/bench/btl/generic_bench/bench.hh
+++ b/bench/btl/generic_bench/bench.hh
@@ -159,7 +159,7 @@ BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point ){
// bench<Mixed_Perf_Analyzer,Action>(size_min,size_max,nb_point);
- // Only for small problem size. Otherwize it will be too long
+ // Only for small problem size. Otherwise it will be too long
// bench<X86_Perf_Analyzer,Action>(size_min,size_max,nb_point);
// bench<STL_Perf_Analyzer,Action>(size_min,size_max,nb_point);
diff --git a/bench/btl/generic_bench/utils/size_log.hh b/bench/btl/generic_bench/utils/size_log.hh
index 13a3da7a8..68945e7cc 100644
--- a/bench/btl/generic_bench/utils/size_log.hh
+++ b/bench/btl/generic_bench/utils/size_log.hh
@@ -23,7 +23,7 @@
#include "math.h"
// The Vector class must satisfy the following part of STL vector concept :
// resize() method
-// [] operator for seting element
+// [] operator for setting element
// the vector elements are int compatible.
template<class Vector>
void size_log(const int nb_point, const int size_min, const int size_max, Vector & X)
diff --git a/bench/btl/generic_bench/utils/xy_file.hh b/bench/btl/generic_bench/utils/xy_file.hh
index 4571bed8f..0492faf09 100644
--- a/bench/btl/generic_bench/utils/xy_file.hh
+++ b/bench/btl/generic_bench/utils/xy_file.hh
@@ -55,7 +55,7 @@ bool read_xy_file(const std::string & filename, std::vector<int> & tab_sizes,
// The Vector class must satisfy the following part of STL vector concept :
// resize() method
-// [] operator for seting element
+// [] operator for setting element
// the vector elements must have the << operator defined
using namespace std;
diff --git a/bench/btl/libs/ublas/ublas_interface.hh b/bench/btl/libs/ublas/ublas_interface.hh
index 95cad5195..f59b7cf2f 100644
--- a/bench/btl/libs/ublas/ublas_interface.hh
+++ b/bench/btl/libs/ublas/ublas_interface.hh
@@ -100,7 +100,7 @@ public :
Y+=coef*X;
}
- // alias free assignements
+ // alias free assignments
static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
X.assign(prod(A,B));
diff --git a/bench/eig33.cpp b/bench/eig33.cpp
index 47947a9be..f003d8a53 100644
--- a/bench/eig33.cpp
+++ b/bench/eig33.cpp
@@ -101,7 +101,7 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals)
computeRoots(scaledMat,evals);
// compute the eigenvectors
- // **here we assume 3 differents eigenvalues**
+ // **here we assume 3 different eigenvalues**
// "optimized version" which appears to be slower with gcc!
// Vector base;
diff --git a/bench/spbench/spbenchsolver.cpp b/bench/spbench/spbenchsolver.cpp
index 4acd0039c..2a7351124 100644
--- a/bench/spbench/spbenchsolver.cpp
+++ b/bench/spbench/spbenchsolver.cpp
@@ -54,7 +54,7 @@ int main(int argc, char ** args)
statbuf.close();
}
else
- std::cerr << "Unable to open the provided file for writting... \n";
+ std::cerr << "Unable to open the provided file for writing... \n";
}
// Get the maximum number of iterations and the tolerance
diff --git a/blas/f2c/ctbmv.c b/blas/f2c/ctbmv.c
index 790fd581f..a6e0dae80 100644
--- a/blas/f2c/ctbmv.c
+++ b/blas/f2c/ctbmv.c
@@ -147,7 +147,7 @@
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
-/* tranformed vector x. */
+/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
diff --git a/blas/f2c/dtbmv.c b/blas/f2c/dtbmv.c
index fdf73ebb5..aa67d19da 100644
--- a/blas/f2c/dtbmv.c
+++ b/blas/f2c/dtbmv.c
@@ -143,7 +143,7 @@
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
-/* tranformed vector x. */
+/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
diff --git a/blas/f2c/stbmv.c b/blas/f2c/stbmv.c
index fcf9ce336..b5a68b545 100644
--- a/blas/f2c/stbmv.c
+++ b/blas/f2c/stbmv.c
@@ -143,7 +143,7 @@
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
-/* tranformed vector x. */
+/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
diff --git a/blas/f2c/ztbmv.c b/blas/f2c/ztbmv.c
index 4cdcd7f88..3bf0beb01 100644
--- a/blas/f2c/ztbmv.c
+++ b/blas/f2c/ztbmv.c
@@ -147,7 +147,7 @@
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
-/* tranformed vector x. */
+/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
diff --git a/blas/level1_impl.h b/blas/level1_impl.h
index f857bfa20..6e7f8c976 100644
--- a/blas/level1_impl.h
+++ b/blas/level1_impl.h
@@ -33,7 +33,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
- // be carefull, *incx==0 is allowed !!
+ // be careful, *incx==0 is allowed !!
if(*incx==1 && *incy==1)
make_vector(y,*n) = make_vector(x,*n);
else
diff --git a/blas/testing/cblat1.f b/blas/testing/cblat1.f
index 8ca67fb19..73015f5a9 100644
--- a/blas/testing/cblat1.f
+++ b/blas/testing/cblat1.f
@@ -619,7 +619,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/blas/testing/dblat1.f b/blas/testing/dblat1.f
index 30691f9bf..03d9f1345 100644
--- a/blas/testing/dblat1.f
+++ b/blas/testing/dblat1.f
@@ -990,7 +990,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/blas/testing/sblat1.f b/blas/testing/sblat1.f
index 6657c2693..4d43d9b48 100644
--- a/blas/testing/sblat1.f
+++ b/blas/testing/sblat1.f
@@ -946,7 +946,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/blas/testing/zblat1.f b/blas/testing/zblat1.f
index d30112c63..c00b67dc8 100644
--- a/blas/testing/zblat1.f
+++ b/blas/testing/zblat1.f
@@ -619,7 +619,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
-* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*
diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake
index afc24b5e9..ba88228a0 100644
--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -11,16 +11,18 @@ add_custom_target(buildtests)
add_custom_target(check COMMAND "ctest")
add_dependencies(check buildtests)
-# check whether /bin/bash exists
-find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH)
+# check whether /bin/bash exists (disabled as not used anymore)
+# find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH)
# This call activates testing and generates the DartConfiguration.tcl
include(CTest)
set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests")
+set(EIGEN_DASHBOARD_BUILD_TARGET "buildtests" CACHE STRING "Target to be built in dashboard mode, default is buildtests")
+set(EIGEN_CTEST_ERROR_EXCEPTION "" CACHE STRING "Regular expression for build error messages to be filtered out")
# Overwrite default DartConfiguration.tcl such that ctest can build our unit tests.
-# Recall that our unit tests are not in the "all" target, so we have to explicitely ask ctest to build our custom 'buildtests' target.
+# Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target.
# At this stage, we can also add custom flags to the build tool through the user defined EIGEN_TEST_BUILD_FLAGS variable.
file(READ "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" EIGEN_DART_CONFIG_FILE)
# try to grab the default flags
@@ -28,7 +30,7 @@ string(REGEX MATCH "MakeCommand:.*-- (.*)\nDefaultCTestConfigurationType" EIGEN_
if(NOT CMAKE_MATCH_1)
string(REGEX MATCH "MakeCommand:.*[^c]make (.*)\nDefaultCTestConfigurationType" EIGEN_DUMMY ${EIGEN_DART_CONFIG_FILE})
endif()
-string(REGEX REPLACE "MakeCommand:.*DefaultCTestConfigurationType" "MakeCommand: ${CMAKE_COMMAND} --build . --target buildtests --config \"\${CTEST_CONFIGURATION_TYPE}\" -- ${CMAKE_MATCH_1} ${EIGEN_TEST_BUILD_FLAGS}\nDefaultCTestConfigurationType"
+string(REGEX REPLACE "MakeCommand:.*DefaultCTestConfigurationType" "MakeCommand: ${CMAKE_COMMAND} --build . --target ${EIGEN_DASHBOARD_BUILD_TARGET} --config \"\${CTEST_CONFIGURATION_TYPE}\" -- ${CMAKE_MATCH_1} ${EIGEN_TEST_BUILD_FLAGS}\nDefaultCTestConfigurationType"
EIGEN_DART_CONFIG_FILE2 ${EIGEN_DART_CONFIG_FILE})
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" ${EIGEN_DART_CONFIG_FILE2})
@@ -39,7 +41,7 @@ ei_init_testing()
# configure Eigen related testing options
option(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions using exceptions" OFF)
-option(EIGEN_DEBUG_ASSERTS "Enable advanced debuging of assertions" OFF)
+option(EIGEN_DEBUG_ASSERTS "Enable advanced debugging of assertions" OFF)
if(CMAKE_COMPILER_IS_GNUCXX)
option(EIGEN_COVERAGE_TESTING "Enable/disable gcov" OFF)
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 4a34ddef5..35deed509 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -19,19 +19,28 @@ macro(ei_add_test_internal testname testname_with_suffix)
endif()
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
- if(EIGEN_TEST_CUDA_CLANG)
+ if(EIGEN_TEST_HIP)
+ hip_reset_flags()
+ hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}")
+ elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
- if(CUDA_64_BIT_DEVICE_CODE)
+
+ if(CUDA_64_BIT_DEVICE_CODE AND (EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64"))
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
else()
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib")
endif()
+
if (${ARGC} GREATER 2)
add_executable(${targetname} ${filename})
else()
add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
endif()
- target_link_libraries(${targetname} "cudart_static" "cuda" "dl" "rt" "pthread")
+ set(CUDA_CLANG_LINK_LIBRARIES "cudart_static" "cuda" "dl" "pthread")
+ if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ set(CUDA_CLANG_LINK_LIBRARIES ${CUDA_CLANG_LINK_LIBRARIES} "rt")
+ endif()
+ target_link_libraries(${targetname} ${CUDA_CLANG_LINK_LIBRARIES})
else()
if (${ARGC} GREATER 2)
cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
@@ -59,8 +68,6 @@ macro(ei_add_test_internal testname testname_with_suffix)
ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")
- ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}")
-
if(MSVC)
ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
endif()
@@ -99,7 +106,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
add_test(${testname_with_suffix} "${targetname}")
- # Specify target and test labels accoirding to EIGEN_CURRENT_SUBPROJECT
+ # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT
get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT)
if ((current_subproject) AND (NOT (current_subproject STREQUAL "")))
set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}")
@@ -161,8 +168,6 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix)
ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}")
- ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}")
-
if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS)
ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj")
endif()
@@ -247,7 +252,7 @@ endmacro(ei_add_test_internal_sycl)
#
# If EIGEN_SPLIT_LARGE_TESTS is ON, the test is split into multiple executables
# test_<testname>_<N>
-# where N runs from 1 to the greatest occurence found in the source file. Each of these
+# where N runs from 1 to the greatest occurrence found in the source file. Each of these
# executables is built passing -DEIGEN_TEST_PART_N. This allows splitting large tests
# into smaller executables.
#
@@ -267,26 +272,28 @@ macro(ei_add_test testname)
endif()
file(READ "${filename}" test_source)
- set(parts 0)
string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
- occurences "${test_source}")
- string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}")
+ occurrences "${test_source}")
+ string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}")
list(REMOVE_DUPLICATES suffixes)
- if(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
+ set(explicit_suffixes "")
+ if( (NOT EIGEN_SPLIT_LARGE_TESTS) AND suffixes)
+ # Check whether we have EIGEN_TEST_PART_* statements, in which case we likely must enforce splitting.
+ # For instance, indexed_view activates a different C++ version for each part.
+ string(REGEX MATCHALL "EIGEN_TEST_PART_[0-9]+" occurrences "${test_source}")
+ string(REGEX REPLACE "EIGEN_TEST_PART_" "" explicit_suffixes "${occurrences}")
+ list(REMOVE_DUPLICATES explicit_suffixes)
+ endif()
+ if( (EIGEN_SPLIT_LARGE_TESTS AND suffixes) OR explicit_suffixes)
add_custom_target(${testname})
foreach(suffix ${suffixes})
ei_add_test_internal(${testname} ${testname}_${suffix}
"${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}")
add_dependencies(${testname} ${testname}_${suffix})
endforeach(suffix)
- else(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
- set(symbols_to_enable_all_parts "")
- foreach(suffix ${suffixes})
- set(symbols_to_enable_all_parts
- "${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1")
- endforeach(suffix)
- ei_add_test_internal(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}")
- endif(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
+ else()
+ ei_add_test_internal(${testname} ${testname} "${ARGV1} -DEIGEN_TEST_PART_ALL=1" "${ARGV2}")
+ endif()
endmacro(ei_add_test)
macro(ei_add_test_sycl testname)
@@ -303,8 +310,8 @@ macro(ei_add_test_sycl testname)
file(READ "${filename}" test_source)
set(parts 0)
string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+"
- occurences "${test_source}")
- string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}")
+ occurrences "${test_source}")
+ string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}")
list(REMOVE_DUPLICATES suffixes)
if(EIGEN_SPLIT_LARGE_TESTS AND suffixes)
add_custom_target(${testname})
@@ -449,6 +456,12 @@ macro(ei_testing_print_summary)
message(STATUS "VSX: Using architecture defaults")
endif()
+ if(EIGEN_TEST_MSA)
+ message(STATUS "MIPS MSA: ON")
+ else()
+ message(STATUS "MIPS MSA: Using architecture defaults")
+ endif()
+
if(EIGEN_TEST_NEON)
message(STATUS "ARM NEON: ON")
else()
@@ -491,6 +504,11 @@ macro(ei_testing_print_summary)
else()
message(STATUS "CUDA: OFF")
endif()
+ if(EIGEN_TEST_HIP)
+ message(STATUS "HIP: ON (using hipcc)")
+ else()
+ message(STATUS "HIP: OFF")
+ endif()
endif() # vectorization / alignment options
@@ -647,6 +665,8 @@ macro(ei_get_cxxflags VAR)
set(${VAR} SSE3)
elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV)
set(${VAR} SSE2)
+ elseif(EIGEN_TEST_MSA)
+ set(${VAR} MSA)
endif()
if(EIGEN_TEST_OPENMP)
@@ -679,6 +699,10 @@ macro(ei_set_build_string)
set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${LOCAL_COMPILER_FLAGS})
endif()
+ if(EIGEN_TEST_EXTERNAL_BLAS)
+ set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-external_blas)
+ endif()
+
ei_is_64bit_env(IS_64BIT_ENV)
if(NOT IS_64BIT_ENV)
set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-32bit)
diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake
index e61dedc46..29f2a5007 100644
--- a/cmake/FindComputeCpp.cmake
+++ b/cmake/FindComputeCpp.cmake
@@ -243,7 +243,7 @@ endfunction()
#######################
#
# Adds a SYCL compilation custom command associated with an existing
-# target and sets a dependancy on that new command.
+# target and sets a dependency on that new command.
#
# targetName : Name of the target to add a SYCL to.
# binaryDir : Intermediate directory to output the integration header.
diff --git a/cmake/FindEigen3.cmake b/cmake/FindEigen3.cmake
index 657440ba5..4d9bc1a52 100644
--- a/cmake/FindEigen3.cmake
+++ b/cmake/FindEigen3.cmake
@@ -15,7 +15,7 @@
# Eigen3::Eigen - The header-only Eigen library
#
# This module reads hints about search locations from
-# the following enviroment variables:
+# the following environment variables:
#
# EIGEN3_ROOT
# EIGEN3_ROOT_DIR
@@ -68,6 +68,7 @@ if (EIGEN3_INCLUDE_DIR)
# in cache already
_eigen3_check_version()
set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
+ set(Eigen3_FOUND ${EIGEN3_VERSION_OK})
else (EIGEN3_INCLUDE_DIR)
diff --git a/cmake/language_support.cmake b/cmake/language_support.cmake
index 2f14f30b8..ddba50945 100644
--- a/cmake/language_support.cmake
+++ b/cmake/language_support.cmake
@@ -26,7 +26,7 @@ function(workaround_9220 language language_works)
cmake_minimum_required(VERSION 2.8.0)
set (CMAKE_Fortran_FLAGS \"${CMAKE_Fortran_FLAGS}\")
set (CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS}\")
- enable_language(${language} OPTIONAL)
+ enable_language(${language})
")
file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/language_tests/${language})
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language})
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 07aa43739..35ef5807c 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -14,7 +14,7 @@
; * - Eigen::Matrix<*,-1,+,*,*,*>
; * - Eigen::Matrix<*,+,+,*,*,*>
; *
-; * Matrices are displayed properly independantly of the memory
+; * Matrices are displayed properly independently of the memory
; * alignment (RowMajor vs. ColMajor).
; *
; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 2109978fe..49b9fba39 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -1764,7 +1764,7 @@ UML_LOOK = YES
# the class node. If there are many fields or methods and many nodes the
# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS
# threshold limits the number of items for each type to make the size more
-# managable. Set this to 0 for no limit. Note that the threshold may be
+# manageable. Set this to 0 for no limit. Note that the threshold may be
# exceeded by 50% before the limit is enforced.
UML_LIMIT_NUM_FIELDS = 10
diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox
index 152dda47d..6b4e49214 100644
--- a/doc/FunctionsTakingEigenTypes.dox
+++ b/doc/FunctionsTakingEigenTypes.dox
@@ -79,7 +79,7 @@ These examples are just intended to give the reader a first impression of how fu
\section TopicUsingRefClass How to write a generic, but non-templated function?
-In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated function and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This exactly the purpose of the Ref class. Here is a simple example:
+In all the previous examples, the functions had to be template functions. This approach allows writing very generic code, but it is often desirable to write non-templated functions and still keep some level of genericity to avoid needless copies of the arguments. The typical example is to write functions accepting both a MatrixXf and a block of a MatrixXf. This is exactly the purpose of the Ref class. Here is a simple example:
<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
@@ -133,7 +133,7 @@ In this special case, the example is fine and will be working because both param
\section TopicPlainFunctionsFailing In which cases do functions taking a plain Matrix or Array argument fail?
-Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const paramter which allows us to store the result. A first naive implementation might look as follows.
+Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const parameter which allows us to store the result. A first naive implementation might look as follows.
\code
// Note: This code is flawed!
void cov(const MatrixXf& x, const MatrixXf& y, MatrixXf& C)
@@ -176,7 +176,7 @@ The implementation above does now not only work with temporary expressions but i
\section TopicResizingInGenericImplementations How to resize matrices in generic implementations?
-One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the follwing code to work
+One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the following code to work
\code
MatrixXf x = MatrixXf::Random(100,3);
MatrixXf y = MatrixXf::Random(100,3);
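To make the preceding discussion concrete, here is a hedged sketch of a Ref-based cov (illustrative only; a Ref cannot be resized, so the output must be pre-sized, which is exactly what motivates the templated alternative discussed in this section):
\code
void cov(const Ref<const MatrixXf> x, const Ref<const MatrixXf> y, Ref<MatrixXf> C)
{
  const float n = static_cast<float>(x.rows());
  const RowVectorXf x_mean = x.colwise().sum() / n;
  const RowVectorXf y_mean = y.colwise().sum() / n;
  // C must already be cols(x) x cols(y)
  C = (x.rowwise() - x_mean).transpose() * (y.rowwise() - y_mean) / n;
}
\endcode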
diff --git a/doc/LeastSquares.dox b/doc/LeastSquares.dox
index e2191a22f..24dfe4b4f 100644
--- a/doc/LeastSquares.dox
+++ b/doc/LeastSquares.dox
@@ -16,7 +16,7 @@ equations is the fastest but least accurate, and the QR decomposition is in betw
\section LeastSquaresSVD Using the SVD decomposition
-The \link JacobiSVD::solve() solve() \endlink method in the JacobiSVD class can be directly used to
+The \link BDCSVD::solve() solve() \endlink method in the BDCSVD class can be directly used to
solve linear least-squares systems. It is not enough to compute only the singular values (the default for
this class); you also need the singular vectors but the thin SVD decomposition suffices for
computing least squares solutions:
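For instance (a sketch mirroring the updated TutorialLinAlgSVDSolve example later in this patch):
\code
MatrixXf A = MatrixXf::Random(3, 2);
VectorXf b = VectorXf::Random(3);
VectorXf x = A.bdcSvd(ComputeThinU | ComputeThinV).solve(b);  // least-squares solution
\endcode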
diff --git a/doc/Pitfalls.dox b/doc/Pitfalls.dox
index cf42effef..3f395053d 100644
--- a/doc/Pitfalls.dox
+++ b/doc/Pitfalls.dox
@@ -2,10 +2,16 @@ namespace Eigen {
/** \page TopicPitfalls Common pitfalls
+
\section TopicPitfalls_template_keyword Compilation error with template methods
See this \link TopicTemplateKeyword page \endlink.
+\section TopicPitfalls_aliasing Aliasing
+
+Don't miss this \link TopicAliasing page \endlink on aliasing,
+especially if you got wrong results in statements where the destination appears on the right hand side of the expression.
+
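A minimal sketch of the kind of statement this warning targets (illustrative, not taken from the aliasing page):
\code
MatrixXf m(2,2);
m << 1, 2, 3, 4;
// m = m.transpose();     // aliasing: asserts in debug builds, garbage results otherwise
m = m.transpose().eval(); // fine: evaluates the right-hand side into a temporary first
m.transposeInPlace();     // fine: dedicated in-place member
\endcode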
\section TopicPitfalls_auto_keyword C++11 and the auto keyword
In short: do not use the auto keyword with Eigen's expressions, unless you are 100% sure about what you are doing. In particular, do not use the auto keyword as a replacement for a Matrix<> type. Here is an example:
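A hedged sketch of the pitfall (variable names are illustrative):
\code
MatrixXd A = MatrixXd::Random(3,3), B = MatrixXd::Random(3,3);
auto C = A * B;              // C is a lazy product *expression*, not a MatrixXd
MatrixXd D = (A * B).eval(); // forces evaluation; D is a plain matrix
A.setZero();                 // evaluating C now would silently use the modified A
\endcode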
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index b6d08c700..b49f7d3cf 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -51,7 +51,7 @@ are doing.
\section TopicPreprocessorDirectivesCppVersion C++ standard features
-By default, %Eigen strive to automatically detect and enable langage features at compile-time based on
+By default, %Eigen strives to automatically detect and enable language features at compile-time based on
the information provided by the compiler.
- \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER.
diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox
index 59d7d05e4..18c90a2a9 100644
--- a/doc/QuickReference.dox
+++ b/doc/QuickReference.dox
@@ -68,7 +68,7 @@ Array<float,4,1> <=> Array4f
Conversion between the matrix and array worlds:
\code
-Array44f a1, a1;
+Array44f a1, a2;
Matrix4f m1, m2;
m1 = a1 * a2; // coeffwise product, implicit conversion from array to matrix.
a1 = m1 * m2; // matrix product, implicit conversion from matrix to array.
diff --git a/doc/QuickStartGuide.dox b/doc/QuickStartGuide.dox
index ea32c3b3d..23bb2981b 100644
--- a/doc/QuickStartGuide.dox
+++ b/doc/QuickStartGuide.dox
@@ -68,7 +68,7 @@ The output is as follows:
The second example starts by declaring a 3-by-3 matrix \c m which is initialized using the \link DenseBase::Random(Index,Index) Random() \endlink method with random values between -1 and 1. The next line applies a linear mapping such that the values are between 10 and 110. The function call \link DenseBase::Constant(Index,Index,const Scalar&) MatrixXd::Constant\endlink(3,3,1.2) returns a 3-by-3 matrix expression having all coefficients equal to 1.2. The rest is standard arithmetic.
-The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left unitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows:
+The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left uninitialized. The second-to-last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows:
\f[
v =
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index a25622e80..81a73eec2 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -80,7 +80,7 @@ sm1.setZero();
\section SparseBasicInfos Matrix properties
-Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some informations from the matrix.
+Beyond the basic functions rows() and cols(), there are some useful functions available to easily get information from the matrix.
<table class="manual">
<tr>
<td> \code
diff --git a/doc/TemplateKeyword.dox b/doc/TemplateKeyword.dox
index b84cfdae9..fbf2c7081 100644
--- a/doc/TemplateKeyword.dox
+++ b/doc/TemplateKeyword.dox
@@ -76,7 +76,7 @@ point where the template is defined, without knowing the actual value of the tem
and \c Derived2 in the example). That means that the compiler cannot know that <tt>dst.triangularView</tt> is
a member template and that the following &lt; symbol is part of the delimiter for the template
parameter. Another possibility would be that <tt>dst.triangularView</tt> is a member variable with the &lt;
-symbol refering to the <tt>operator&lt;()</tt> function. In fact, the compiler should choose the second
+symbol referring to the <tt>operator&lt;()</tt> function. In fact, the compiler should choose the second
possibility, according to the standard. If <tt>dst.triangularView</tt> is a member template (as in our case),
the programmer should specify this explicitly with the \c template keyword and write <tt>dst.template
triangularView</tt>.
diff --git a/doc/TopicLazyEvaluation.dox b/doc/TopicLazyEvaluation.dox
index 101ef8c72..b7820e3e6 100644
--- a/doc/TopicLazyEvaluation.dox
+++ b/doc/TopicLazyEvaluation.dox
@@ -58,7 +58,7 @@ the product <tt>matrix3 * matrix4</tt> gets evaluated immediately into a tempora
\code matrix1 = matrix2 * (matrix3 + matrix4); \endcode
-Here, provided the matrices have at least 2 rows and 2 columns, each coefficienct of the expression <tt>matrix3 + matrix4</tt> is going to be used several times in the matrix product. Instead of computing the sum everytime, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates <tt>matrix3 + matrix4</tt> into a temporary variable before evaluating the product.
+Here, provided the matrices have at least 2 rows and 2 columns, each coefficient of the expression <tt>matrix3 + matrix4</tt> is going to be used several times in the matrix product. Instead of computing the sum every time, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates <tt>matrix3 + matrix4</tt> into a temporary variable before evaluating the product.
*/
diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox
index 491470627..0965da872 100644
--- a/doc/TopicLinearAlgebraDecompositions.dox
+++ b/doc/TopicLinearAlgebraDecompositions.dox
@@ -4,7 +4,7 @@ namespace Eigen {
This page presents a catalogue of the dense matrix decompositions offered by Eigen.
For an introduction on linear solvers and decompositions, check this \link TutorialLinearAlgebra page \endlink.
-To get an overview of the true relative speed of the different decomposition, check this \link DenseDecompositionBenchmark benchmark \endlink.
+To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
\section TopicLinAlgBigTable Catalogue of decompositions offered by Eigen
@@ -114,6 +114,18 @@ To get an overview of the true relative speed of the different decomposition, ch
<tr><th class="inter" colspan="9">\n Singular values and eigenvalues decompositions</th></tr>
<tr>
+ <td>BDCSVD (divide \& conquer)</td>
+ <td>-</td>
+ <td>One of the fastest SVD algorithms</td>
+ <td>Excellent</td>
+ <td>Yes</td>
+ <td>Singular values/vectors, least squares</td>
+ <td>Yes (and does least squares)</td>
+ <td>Excellent</td>
+ <td>Blocked bidiagonalization</td>
+ </tr>
+
+ <tr>
<td>JacobiSVD (two-sided)</td>
<td>-</td>
<td>Slow (but fast for small matrices)</td>
@@ -248,7 +260,7 @@ To get an overview of the true relative speed of the different decomposition, ch
<dt><b>Blocking</b></dt>
<dd>Means the algorithm can work per block, hence guaranteeing a good scaling of the performance for large matrices.</dd>
<dt><b>Implicit Multi Threading (MT)</b></dt>
- <dd>Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product rountines.</dd>
+ <dd>Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algorithm itself is not parallelized, but that it relies on parallelized matrix-matrix product routines.</dd>
<dt><b>Explicit Multi Threading (MT)</b></dt>
<dd>Means the algorithm is explicitly parallelized to take advantage of multicore processors via OpenMP.</dd>
<dt><b>Meta-unroller</b></dt>
diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox
index 47c9b261f..bc394f484 100644
--- a/doc/TopicMultithreading.dox
+++ b/doc/TopicMultithreading.dox
@@ -47,7 +47,7 @@ int main(int argc, char** argv)
\warning note that all functions generating random matrices are \b neither re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to Eigen::initParallel(). This is because these functions are based on std::rand which is not re-entrant. For a thread-safe random generator, we recommend the use of boost::random or the C++11 random facilities.
-In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallization as detailed in the previous section.
+If your application is parallelized with OpenMP, you might want to disable Eigen's own parallelization as detailed in the previous section.
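For instance (a short sketch using the public API):
\code
Eigen::initParallel();  // call once if the application itself is multi-threaded
Eigen::setNbThreads(1); // turn off Eigen's own OpenMP parallelization
\endcode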
*/
diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox
index cb92ceeae..a72724143 100644
--- a/doc/TutorialLinearAlgebra.dox
+++ b/doc/TutorialLinearAlgebra.dox
@@ -73,7 +73,7 @@ depending on your matrix and the trade-off you want to make:
<td>ColPivHouseholderQR</td>
<td>colPivHouseholderQr()</td>
<td>None</td>
- <td>++</td>
+ <td>+</td>
<td>-</td>
<td>+++</td>
</tr>
@@ -86,6 +86,14 @@ depending on your matrix and the trade-off you want to make:
<td>+++</td>
</tr>
<tr class="alt">
+ <td>CompleteOrthogonalDecomposition</td>
+ <td>completeOrthogonalDecomposition()</td>
+ <td>None</td>
+ <td>+</td>
+ <td>-</td>
+ <td>+++</td>
+ </tr>
+ <tr class="alt">
<td>LLT</td>
<td>llt()</td>
<td>Positive definite</td>
@@ -102,14 +110,23 @@ depending on your matrix and the trade-off you want to make:
<td>++</td>
</tr>
<tr class="alt">
+ <td>BDCSVD</td>
+ <td>bdcSvd()</td>
+ <td>None</td>
+ <td>-</td>
+ <td>-</td>
+ <td>+++</td>
+ </tr>
+ <tr class="alt">
<td>JacobiSVD</td>
<td>jacobiSvd()</td>
<td>None</td>
- <td>- -</td>
+ <td>-</td>
<td>- - -</td>
<td>+++</td>
</tr>
</table>
+To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
All of these decompositions offer a solve() method that works as in the above example.
@@ -183,8 +200,11 @@ Here is an example:
\section TutorialLinAlgLeastsquares Least squares solving
-The most accurate method to do least squares solving is with a SVD decomposition. Eigen provides one
-as the JacobiSVD class, and its solve() is doing least-squares solving.
+The most accurate method to do least squares solving is with an SVD decomposition.
+Eigen provides two implementations.
+The recommended one is the BDCSVD class, which scales well for large problems
+and automatically falls back to the JacobiSVD class for smaller problems.
+For both classes, the solve() method does least-squares solving.
Here is an example:
<table class="example">
diff --git a/doc/TutorialMapClass.dox b/doc/TutorialMapClass.dox
index f8fb0fd2f..caa2539d8 100644
--- a/doc/TutorialMapClass.dox
+++ b/doc/TutorialMapClass.dox
@@ -29,9 +29,9 @@ Map<const Vector4i> mi(pi);
\endcode
where \c pi is an \c int \c *. In this case the size does not have to be passed to the constructor, because it is already specified by the Matrix/Array type.
-Note that Map does not have a default constructor; you \em must pass a pointer to intialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew).
+Note that Map does not have a default constructor; you \em must pass a pointer to initialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew).
-Map is flexible enough to accomodate a variety of different data representations. There are two other (optional) template parameters:
+Map is flexible enough to accommodate a variety of different data representations. There are two other (optional) template parameters:
\code
Map<typename MatrixType,
int MapOptions,
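A minimal usage sketch of Map over pre-existing memory (illustrative values):
\code
int data[6] = {1, 2, 3, 4, 5, 6};
Map<VectorXi> v(data, 6);       // dynamic-size view: pointer plus size
Map<Matrix<int,2,3> > m(data);  // fixed-size view: the type carries the size
v(0) = 9;                       // writes through to data[0]
\endcode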
diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox
index 352907408..350ea1139 100644
--- a/doc/TutorialSparse.dox
+++ b/doc/TutorialSparse.dox
@@ -57,7 +57,7 @@ The \c "_" indicates available free space to quickly insert new elements.
Assuming no reallocation is needed, the insertion of a random element is therefore in O(nnz_j) where nnz_j is the number of nonzeros of the respective inner vector.
On the other hand, inserting elements with increasing inner indices in a given inner vector is much more efficient since this only requires increasing the respective \c InnerNNZs entry, which is an O(1) operation.
-The case where no empty space is available is a special case, and is refered as the \em compressed mode.
+The case where no empty space is available is a special case, and is referred to as the \em compressed mode.
It corresponds to the widely used Compressed Column (or Row) Storage schemes (CCS or CRS).
Any SparseMatrix can be turned to this form by calling the SparseMatrix::makeCompressed() function.
In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we have the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j].
@@ -212,7 +212,7 @@ See the SparseMatrix::setFromTriplets() function and class Triplet for more deta
In some cases, however, slightly higher performance and lower memory consumption can be reached by directly inserting the non-zeros into the destination matrix.
-A typical scenario of this approach is illustrated bellow:
+A typical scenario of this approach is illustrated below:
\code
1: SparseMatrix<double> mat(rows,cols); // default is column major
2: mat.reserve(VectorXi::Constant(cols,6));
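The typical continuation, sketched from the SparseMatrix API (the iteration over non-zeros is hypothetical):
\code
3: for each i,j such that v_ij != 0
4:   mat.insert(i,j) = v_ij;  // cheapest when inner indices arrive in increasing order
5: mat.makeCompressed();      // optional: switch to the compressed mode
\endcode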
diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox
index 0f7022973..8676faa1b 100644
--- a/doc/UnalignedArrayAssert.dox
+++ b/doc/UnalignedArrayAssert.dox
@@ -117,8 +117,8 @@ It doesn't disable 16-byte alignment, because that would mean that vectorized an
\section checkmycode How can I check my code is safe regarding alignment issues?
-Unfortunately, there is no possibility in C++ to detect any of the aformentioned shortcoming at compile time (though static analysers are becoming more and more powerful and could detect some of them).
-Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the begining of this page.
+Unfortunately, there is no way in C++ to detect any of the aforementioned shortcomings at compile time (though static analysers are becoming more and more powerful and could detect some of them).
+Even at runtime, all we can do is catch an invalid unaligned allocation and trigger the explicit assertion mentioned at the beginning of this page.
Therefore, if your program runs fine on a given system with some given compilation flags, then this does not guarantee that your code is safe. For instance, on most 64-bit systems buffers are aligned on 16-byte boundaries and so, if you do not enable the AVX instruction set, then your code will run fine. On the other hand, the same code may assert when moving to a more exotic platform, or when enabling AVX instructions that require 32-byte alignment by default.
The situation is not hopeless though. Assuming your code is well covered by unit tests, you can check its alignment safety by linking it to a custom malloc library returning 8-byte-aligned buffers only. This way all alignment shortcomings should pop up. To this end, you must also compile your program with \link TopicPreprocessorDirectivesPerformance EIGEN_MALLOC_ALREADY_ALIGNED=0 \endlink.
diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox
index 9bcdf0bfc..36beb2ddd 100644
--- a/doc/UsingNVCC.dox
+++ b/doc/UsingNVCC.dox
@@ -5,7 +5,7 @@ namespace Eigen {
Starting from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays of fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc, most of Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords, making them callable from both host and device code.
This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen's header.
-This might be usefull to disable some warnings when a .cu file makes use of Eigen on the host side only.
+This might be useful to disable some warnings when a .cu file makes use of Eigen on the host side only.
However, in both cases, host's SIMD vectorization has to be disabled in .cu files.
It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files.
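Concretely, at the top of such a .cu file (sketch):
\code
#define EIGEN_NO_CUDA  // host-side-only use of Eigen in this .cu file
#include <Eigen/Dense>
\endcode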
diff --git a/doc/eigen_navtree_hacks.js b/doc/eigen_navtree_hacks.js
index bd7e02b38..39c59f73c 100644
--- a/doc/eigen_navtree_hacks.js
+++ b/doc/eigen_navtree_hacks.js
@@ -65,6 +65,10 @@ function getNode(o, po)
function resizeHeight()
{
var toc = $("#nav-toc");
+ var header = $("#header");
+ var content = $("#doc-content");
+ var navtree = $("#nav-path");
+ var sidenav = $("#side-nav");
var tocHeight = toc.height(); // <- we added this line
var headerHeight = header.height();
var footerHeight = footer.height();
diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css
index 6ce2b839b..427b128ba 100644
--- a/doc/eigendoxy.css
+++ b/doc/eigendoxy.css
@@ -93,7 +93,7 @@ table th.inter {
border-color: #cccccc;
}
-/** class for exemple / output tables **/
+/** class for example / output tables **/
table.example {
}
@@ -219,3 +219,8 @@ h3.version {
td.width20em p.endtd {
width: 20em;
}
+
+/* needed for huge screens */
+.ui-resizable-e {
+ background-repeat: repeat-y;
+}
\ No newline at end of file
diff --git a/doc/eigendoxy_footer.html.in b/doc/eigendoxy_footer.html.in
index 878244a19..9ac0596cb 100644
--- a/doc/eigendoxy_footer.html.in
+++ b/doc/eigendoxy_footer.html.in
@@ -5,14 +5,14 @@
$navpath
<li class="footer">$generatedby
<a href="http://www.doxygen.org/index.html">
- <img class="footer" src="$relpath$doxygen.png" alt="doxygen"/></a> $doxygenversion </li>
+ <img class="footer" src="$relpath^doxygen.png" alt="doxygen"/></a> $doxygenversion </li>
</ul>
</div>
<!--END GENERATE_TREEVIEW-->
<!--BEGIN !GENERATE_TREEVIEW-->
<hr class="footer"/><address class="footer"><small>
$generatedby &#160;<a href="http://www.doxygen.org/index.html">
-<img class="footer" src="$relpath$doxygen.png" alt="doxygen"/>
+<img class="footer" src="$relpath^doxygen.png" alt="doxygen"/>
</a> $doxygenversion
</small></address>
<!--END !GENERATE_TREEVIEW-->
diff --git a/doc/eigendoxy_header.html.in b/doc/eigendoxy_header.html.in
index 0f3859f40..bb149f8f0 100644
--- a/doc/eigendoxy_header.html.in
+++ b/doc/eigendoxy_header.html.in
@@ -4,25 +4,23 @@
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen $doxygenversion"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
<!--BEGIN PROJECT_NAME--><title>$projectname: $title</title><!--END PROJECT_NAME-->
<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
-<link href="$relpath$tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="$relpath$jquery.js"></script>
-<script type="text/javascript" src="$relpath$dynsections.js"></script>
+<link href="$relpath^tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="$relpath^jquery.js"></script>
+<script type="text/javascript" src="$relpath^dynsections.js"></script>
$treeview
$search
$mathjax
-<link href="$relpath$$stylesheet" rel="stylesheet" type="text/css" />
+<link href="$relpath^$stylesheet" rel="stylesheet" type="text/css" />
<link href="$relpath$eigendoxy.css" rel="stylesheet" type="text/css">
<!-- $extrastylesheet -->
<script type="text/javascript" src="$relpath$eigen_navtree_hacks.js"></script>
-<!-- <script type="text/javascript"> -->
-<!-- </script> -->
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<!-- <a name="top"></a> -->
<!--BEGIN TITLEAREA-->
<div id="titlearea">
@@ -30,10 +28,10 @@ $mathjax
<tbody>
<tr style="height: 56px;">
<!--BEGIN PROJECT_LOGO-->
- <td id="projectlogo"><img alt="Logo" src="$relpath$$projectlogo"/></td>
+ <td id="projectlogo"><img alt="Logo" src="$relpath^$projectlogo"/></td>
<!--END PROJECT_LOGO-->
<!--BEGIN PROJECT_NAME-->
- <td style="padding-left: 0.5em;">
+ <td id="projectalign" style="padding-left: 0.5em;">
<div id="projectname"><a href="http://eigen.tuxfamily.org">$projectname</a>
<!--BEGIN PROJECT_NUMBER-->&#160;<span id="projectnumber">$projectnumber</span><!--END PROJECT_NUMBER-->
</div>
@@ -42,7 +40,7 @@ $mathjax
<!--END PROJECT_NAME-->
<!--BEGIN !PROJECT_NAME-->
<!--BEGIN PROJECT_BRIEF-->
- <td style="padding-left: 0.5em;">
+ <td id="projectalign" style="padding-left: 0.5em;">
<div id="projectbrief">$projectbrief</div>
</td>
<!--END PROJECT_BRIEF-->
diff --git a/doc/examples/Cwise_lgamma.cpp b/doc/examples/Cwise_lgamma.cpp
index f1c4f503e..6bfaccbce 100644
--- a/doc/examples/Cwise_lgamma.cpp
+++ b/doc/examples/Cwise_lgamma.cpp
@@ -6,4 +6,4 @@ int main()
{
Array4d v(0.5,10,0,-1);
std::cout << v.lgamma() << std::endl;
-}
\ No newline at end of file
+}
diff --git a/doc/examples/TutorialLinAlgSVDSolve.cpp b/doc/examples/TutorialLinAlgSVDSolve.cpp
index 9fbc031de..f109f04e5 100644
--- a/doc/examples/TutorialLinAlgSVDSolve.cpp
+++ b/doc/examples/TutorialLinAlgSVDSolve.cpp
@@ -11,5 +11,5 @@ int main()
VectorXf b = VectorXf::Random(3);
cout << "Here is the right hand side b:\n" << b << endl;
cout << "The least-squares solution is:\n"
- << A.jacobiSvd(ComputeThinU | ComputeThinV).solve(b) << endl;
+ << A.bdcSvd(ComputeThinU | ComputeThinV).solve(b) << endl;
}
diff --git a/doc/examples/Tutorial_simple_example_dynamic_size.cpp b/doc/examples/Tutorial_simple_example_dynamic_size.cpp
index 0f0280e0e..defcb1ee4 100644
--- a/doc/examples/Tutorial_simple_example_dynamic_size.cpp
+++ b/doc/examples/Tutorial_simple_example_dynamic_size.cpp
@@ -10,7 +10,7 @@ int main()
MatrixXi m(size,size+1); // a (size)x(size+1)-matrix of int's
for (int j=0; j<m.cols(); ++j) // loop over columns
for (int i=0; i<m.rows(); ++i) // loop over rows
- m(i,j) = i+j*m.rows(); // to access matrix coefficients,
+ m(i,j) = i+j*size; // to access matrix coefficients,
// use operator()(int,int)
std::cout << m << "\n\n";
}
diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp
index 6a205aea3..74699381c 100644
--- a/doc/examples/matrixfree_cg.cpp
+++ b/doc/examples/matrixfree_cg.cpp
@@ -67,6 +67,7 @@ namespace internal {
// This method should implement "dst += alpha * lhs * rhs" inplace,
// however, for iterative solvers, alpha is always equal to 1, so let's not bother about it.
assert(alpha==Scalar(1) && "scaling is not implemented");
+ EIGEN_ONLY_USED_FOR_DEBUG(alpha);
// Here we could simply call dst.noalias() += lhs.my_matrix() * rhs,
// but let's do something fancier (and less efficient):
diff --git a/doc/snippets/MatrixBase_cwiseEqual.cpp b/doc/snippets/MatrixBase_cwiseEqual.cpp
index eb3656f4c..469af642c 100644
--- a/doc/snippets/MatrixBase_cwiseEqual.cpp
+++ b/doc/snippets/MatrixBase_cwiseEqual.cpp
@@ -3,5 +3,5 @@ m << 1, 0,
1, 1;
cout << "Comparing m with identity matrix:" << endl;
cout << m.cwiseEqual(MatrixXi::Identity(2,2)) << endl;
-int count = m.cwiseEqual(MatrixXi::Identity(2,2)).count();
+Index count = m.cwiseEqual(MatrixXi::Identity(2,2)).count();
cout << "Number of coefficients that are equal: " << count << endl;
diff --git a/doc/snippets/MatrixBase_cwiseNotEqual.cpp b/doc/snippets/MatrixBase_cwiseNotEqual.cpp
index 6a2e4fb6c..7f0a105d6 100644
--- a/doc/snippets/MatrixBase_cwiseNotEqual.cpp
+++ b/doc/snippets/MatrixBase_cwiseNotEqual.cpp
@@ -3,5 +3,5 @@ m << 1, 0,
1, 1;
cout << "Comparing m with identity matrix:" << endl;
cout << m.cwiseNotEqual(MatrixXi::Identity(2,2)) << endl;
-int count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count();
+Index count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count();
cout << "Number of coefficients that are not equal: " << count << endl;
diff --git a/doc/special_examples/Tutorial_sparse_example.cpp b/doc/special_examples/Tutorial_sparse_example.cpp
index 830e196ea..8850db052 100644
--- a/doc/special_examples/Tutorial_sparse_example.cpp
+++ b/doc/special_examples/Tutorial_sparse_example.cpp
@@ -1,5 +1,6 @@
#include <Eigen/Sparse>
#include <vector>
+#include <iostream>
typedef Eigen::SparseMatrix<double> SpMat; // declares a column-major sparse matrix type of double
typedef Eigen::Triplet<double> T;
@@ -9,10 +10,13 @@ void saveAsBitmap(const Eigen::VectorXd& x, int n, const char* filename);
int main(int argc, char** argv)
{
- assert(argc==2);
+ if(argc!=2) {
+ std::cerr << "Error: expected one and only one argument.\n";
+ return -1;
+ }
int n = 300; // size of the image
- int m = n*n; // number of unknows (=number of pixels)
+ int m = n*n; // number of unknowns (=number of pixels)
// Assembly:
std::vector<T> coefficients; // list of non-zeros coefficients
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index 9883d4c72..522ba8a2b 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -35,7 +35,7 @@ set(EigenLapack_SRCS ${EigenLapack_SRCS}
second_NONE.f dsecnd_NONE.f
)
-option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enbale the Lapack unit tests")
+option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests")
if(EIGEN_ENABLE_LAPACK_TESTS)
@@ -49,7 +49,7 @@ if(EIGEN_ENABLE_LAPACK_TESTS)
INACTIVITY_TIMEOUT 15
TIMEOUT 240
STATUS download_status
- EXPECTED_MD5 5758ce55afcf79da98de8b9de1615ad5
+ EXPECTED_MD5 ab5742640617e3221a873aba44bbdc93
SHOW_PROGRESS)
message(STATUS ${download_status})
@@ -59,7 +59,7 @@ if(EIGEN_ENABLE_LAPACK_TESTS)
message(STATUS "Setup lapack reference and lapack unit tests")
execute_process(COMMAND tar xzf "lapack_addons_3.4.1.tgz" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
else()
- message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests wont be enabled")
+ message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests won't be enabled")
set(EIGEN_ENABLE_LAPACK_TESTS false)
endif()
diff --git a/scripts/eigen_gen_split_test_help.cmake b/scripts/eigen_gen_split_test_help.cmake
new file mode 100644
index 000000000..e43f5aabe
--- /dev/null
+++ b/scripts/eigen_gen_split_test_help.cmake
@@ -0,0 +1,11 @@
+#!cmake -P
+file(WRITE split_test_helper.h "")
+foreach(i RANGE 1 999)
+ file(APPEND split_test_helper.h
+ "#if defined(EIGEN_TEST_PART_${i}) || defined(EIGEN_TEST_PART_ALL)\n"
+ "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
+ "#else\n"
+ "#define CALL_SUBTEST_${i}(FUNC)\n"
+ "#endif\n\n"
+ )
+endforeach()
\ No newline at end of file
diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h
new file mode 100644
index 000000000..2b6544a6a
--- /dev/null
+++ b/test/AnnoyingScalar.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_H
+#define EIGEN_TEST_ANNOYING_SCALAR_H
+
+#include <ostream>
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+struct my_exception
+{
+ my_exception() {}
+ ~my_exception() {}
+};
+#endif
+
+// An AnnoyingScalar is a pseudo-scalar type that:
+// - can randomly throw an exception in operator+
+// - randomly allocates its value on the heap or stores it inside itself, making it neither trivially copyable, nor movable, nor relocatable.
+
+class AnnoyingScalar
+{
+ public:
+ AnnoyingScalar() { init(); *v = 0; }
+ AnnoyingScalar(long double _v) { init(); *v = _v; }
+ AnnoyingScalar(double _v) { init(); *v = _v; }
+ AnnoyingScalar(float _v) { init(); *v = _v; }
+ AnnoyingScalar(int _v) { init(); *v = _v; }
+ AnnoyingScalar(long _v) { init(); *v = _v; }
+ #if EIGEN_HAS_CXX11
+ AnnoyingScalar(long long _v) { init(); *v = _v; }
+ #endif
+ AnnoyingScalar(const AnnoyingScalar& other) { init(); *v = *(other.v); }
+ ~AnnoyingScalar() {
+ if(v!=&data)
+ delete v;
+ instances--;
+ }
+
+ void init() {
+ if(internal::random<bool>())
+ v = new float;
+ else
+ v = &data;
+ instances++;
+ }
+
+ AnnoyingScalar operator+(const AnnoyingScalar& other) const
+ {
+ #ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+ countdown--;
+ if(countdown<=0 && !dont_throw)
+ throw my_exception();
+ #endif
+ return AnnoyingScalar(*v+*other.v);
+ }
+
+ AnnoyingScalar operator-() const
+ { return AnnoyingScalar(-*v); }
+
+ AnnoyingScalar operator-(const AnnoyingScalar& other) const
+ { return AnnoyingScalar(*v-*other.v); }
+
+ AnnoyingScalar operator*(const AnnoyingScalar& other) const
+ { return AnnoyingScalar((*v)*(*other.v)); }
+
+ AnnoyingScalar operator/(const AnnoyingScalar& other) const
+ { return AnnoyingScalar((*v)/(*other.v)); }
+
+ AnnoyingScalar& operator+=(const AnnoyingScalar& other) { *v += *other.v; return *this; }
+ AnnoyingScalar& operator-=(const AnnoyingScalar& other) { *v -= *other.v; return *this; }
+ AnnoyingScalar& operator*=(const AnnoyingScalar& other) { *v *= *other.v; return *this; }
+ AnnoyingScalar& operator/=(const AnnoyingScalar& other) { *v /= *other.v; return *this; }
+ AnnoyingScalar& operator= (const AnnoyingScalar& other) { *v = *other.v; return *this; }
+
+ bool operator==(const AnnoyingScalar& other) const { return *v == *other.v; }
+ bool operator!=(const AnnoyingScalar& other) const { return *v != *other.v; }
+ bool operator<=(const AnnoyingScalar& other) const { return *v <= *other.v; }
+ bool operator< (const AnnoyingScalar& other) const { return *v < *other.v; }
+ bool operator>=(const AnnoyingScalar& other) const { return *v >= *other.v; }
+ bool operator> (const AnnoyingScalar& other) const { return *v > *other.v; }
+
+ float* v;
+ float data;
+ static int instances;
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+ static int countdown;
+ static bool dont_throw;
+#endif
+};
+
+AnnoyingScalar real(const AnnoyingScalar &x) { return x; }
+AnnoyingScalar imag(const AnnoyingScalar & ) { return 0; }
+AnnoyingScalar conj(const AnnoyingScalar &x) { return x; }
+AnnoyingScalar sqrt(const AnnoyingScalar &x) { return std::sqrt(*x.v); }
+AnnoyingScalar abs (const AnnoyingScalar &x) { return std::abs(*x.v); }
+AnnoyingScalar cos (const AnnoyingScalar &x) { return std::cos(*x.v); }
+AnnoyingScalar sin (const AnnoyingScalar &x) { return std::sin(*x.v); }
+AnnoyingScalar acos(const AnnoyingScalar &x) { return std::acos(*x.v); }
+AnnoyingScalar atan2(const AnnoyingScalar &y,const AnnoyingScalar &x) { return std::atan2(*y.v,*x.v); }
+
+std::ostream& operator<<(std::ostream& stream,const AnnoyingScalar& x) {
+ stream << (*(x.v));
+ return stream;
+}
+
+int AnnoyingScalar::instances = 0;
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+int AnnoyingScalar::countdown = 0;
+bool AnnoyingScalar::dont_throw = false;
+#endif
+
+namespace Eigen {
+template<>
+struct NumTraits<AnnoyingScalar> : NumTraits<float>
+{
+ enum {
+ RequireInitialization = true
+ };
+ typedef AnnoyingScalar Real;
+ typedef AnnoyingScalar Nested;
+ typedef AnnoyingScalar Literal;
+ typedef AnnoyingScalar NonInteger;
+};
+
+template<> inline AnnoyingScalar test_precision<AnnoyingScalar>() { return test_precision<float>(); }
+
+namespace internal {
+ template<> double cast(const AnnoyingScalar& x) { return double(*x.v); }
+ template<> float cast(const AnnoyingScalar& x) { return *x.v; }
+}
+
+}
+
+AnnoyingScalar get_test_precision(const AnnoyingScalar&)
+{ return Eigen::test_precision<AnnoyingScalar>(); }
+
+AnnoyingScalar test_relative_error(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return test_relative_error(*a.v, *b.v); }
+
+inline bool test_isApprox(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return internal::isApprox(*a.v, *b.v, test_precision<float>()); }
+
+inline bool test_isMuchSmallerThan(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return test_isMuchSmallerThan(*a.v, *b.v); }
+
+#endif // EIGEN_TEST_ANNOYING_SCALAR_H
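A short usage sketch in the spirit of the tests below (compare the noncopyable check added to conservative_resize.cpp later in this patch):

  {
    AnnoyingScalar::dont_throw = true;                     // opt out of the random throws
    Eigen::Matrix<AnnoyingScalar,Eigen::Dynamic,1> v(10);
    v.setOnes();
  }
  VERIFY(AnnoyingScalar::instances == 0);                  // every scalar was destroyed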
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8bd086ce3..b4730cff0 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,16 +1,7 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
- file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
- foreach(i RANGE 1 999)
- file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
- "#ifdef EIGEN_TEST_PART_${i}\n"
- "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
- "#else\n"
- "#define CALL_SUBTEST_${i}(FUNC)\n"
- "#endif\n\n"
- )
- endforeach()
+# The file split_test_helper.h used to be generated on the first run;
+# it is now included in test/, so remove any previously generated copy.
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
endif()
# check if we have a Fortran compiler
@@ -30,6 +21,15 @@ if(NOT EIGEN_Fortran_COMPILER_WORKS)
find_package(LAPACK QUIET)
endif()
+# TODO do the same for EXTERNAL_LAPACK
+option(EIGEN_TEST_EXTERNAL_BLAS "Use external BLAS library for testsuite" OFF)
+if(EIGEN_TEST_EXTERNAL_BLAS)
+ find_package(BLAS REQUIRED)
+ message(STATUS "BLAS_COMPILER_FLAGS: ${BLAS_COMPILER_FLAGS}")
+ add_definitions("-DEIGEN_USE_BLAS") # is adding ${BLAS_COMPILER_FLAGS} necessary?
+ list(APPEND EXTERNAL_LIBS "${BLAS_LIBRARIES}")
+endif(EIGEN_TEST_EXTERNAL_BLAS)
+
# configure blas/lapack (use Eigen's ones)
set(EIGEN_BLAS_LIBRARIES eigen_blas)
set(EIGEN_LAPACK_LIBRARIES eigen_lapack)
@@ -91,7 +91,7 @@ else()
endif()
-find_package(PASTIX QUIET COMPONENTS METIS SCOTCH)
+find_package(PASTIX QUIET COMPONENTS METIS SEQ)
# check that the PASTIX found is a version without MPI
find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS
NAMES pastix_nompi.h
@@ -283,6 +283,7 @@ ei_add_test(mpl2only)
ei_add_test(inplace_decomposition)
ei_add_test(half_float)
ei_add_test(array_of_string)
+ei_add_test(num_dimensions)
add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
@@ -309,7 +310,6 @@ if(UMFPACK_FOUND)
endif()
if(KLU_FOUND OR SuiteSparse_FOUND)
- message("ADDING KLU TEST")
ei_add_test(klu_support "" "${KLU_ALL_LIBS}")
endif()
@@ -387,10 +387,9 @@ if(CUDA_FOUND)
if(EIGEN_TEST_CUDA_CLANG)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
endif()
- cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR})
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
- ei_add_test(cuda_basic)
+ ei_add_test(gpu_basic)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
@@ -399,6 +398,47 @@ endif(CUDA_FOUND)
endif(EIGEN_TEST_CUDA)
+# HIP unit tests
+option(EIGEN_TEST_HIP "Add HIP support." OFF)
+if (EIGEN_TEST_HIP)
+
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+ if (EXISTS ${HIP_PATH})
+
+ list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+ if (${HIP_PLATFORM} STREQUAL "hcc")
+
+ include_directories(${HIP_PATH}/include)
+
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+ ei_add_test(gpu_basic)
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+ elseif (${HIP_PLATFORM} STREQUAL "nvcc")
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
+
+ endif(HIP_FOUND)
+
+ else ()
+
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+ endif()
+
+endif(EIGEN_TEST_HIP)
+
+
+
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index bdea51c10..4e1e4b5e8 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -70,7 +70,6 @@ template<typename MatrixType> void adjoint(const MatrixType& m)
Transpose.h Conjugate.h Dot.h
*/
using std::abs;
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
@@ -146,7 +145,35 @@ template<typename MatrixType> void adjoint(const MatrixType& m)
VERIFY_IS_APPROX(rv1.template cast<Scalar>().dot(v1), rv1.dot(v1));
}
-void test_adjoint()
+template<int>
+void adjoint_extra()
+{
+ MatrixXcf a(10,10), b(10,10);
+ VERIFY_RAISES_ASSERT(a = a.transpose());
+ VERIFY_RAISES_ASSERT(a = a.transpose() + b);
+ VERIFY_RAISES_ASSERT(a = b + a.transpose());
+ VERIFY_RAISES_ASSERT(a = a.conjugate().transpose());
+ VERIFY_RAISES_ASSERT(a = a.adjoint());
+ VERIFY_RAISES_ASSERT(a = a.adjoint() + b);
+ VERIFY_RAISES_ASSERT(a = b + a.adjoint());
+
+ // no assertion should be triggered for these cases:
+ a.transpose() = a.transpose();
+ a.transpose() += a.transpose();
+ a.transpose() += a.transpose() + b;
+ a.transpose() = a.adjoint();
+ a.transpose() += a.adjoint();
+ a.transpose() += a.adjoint() + b;
+
+ // regression tests for check_for_aliasing
+ MatrixXd c(10,10);
+ c = 1.0 * MatrixXd::Ones(10,10) + c;
+ c = MatrixXd::Ones(10,10) * 1.0 + c;
+ c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
+ c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
+}
+
+EIGEN_DECLARE_TEST(adjoint)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( adjoint(Matrix<float, 1, 1>()) );
@@ -169,32 +196,6 @@ void test_adjoint()
// test a large static matrix only once
CALL_SUBTEST_7( adjoint(Matrix<float, 100, 100>()) );
-#ifdef EIGEN_TEST_PART_13
- {
- MatrixXcf a(10,10), b(10,10);
- VERIFY_RAISES_ASSERT(a = a.transpose());
- VERIFY_RAISES_ASSERT(a = a.transpose() + b);
- VERIFY_RAISES_ASSERT(a = b + a.transpose());
- VERIFY_RAISES_ASSERT(a = a.conjugate().transpose());
- VERIFY_RAISES_ASSERT(a = a.adjoint());
- VERIFY_RAISES_ASSERT(a = a.adjoint() + b);
- VERIFY_RAISES_ASSERT(a = b + a.adjoint());
-
- // no assertion should be triggered for these cases:
- a.transpose() = a.transpose();
- a.transpose() += a.transpose();
- a.transpose() += a.transpose() + b;
- a.transpose() = a.adjoint();
- a.transpose() += a.adjoint();
- a.transpose() += a.adjoint() + b;
-
- // regression tests for check_for_aliasing
- MatrixXd c(10,10);
- c = 1.0 * MatrixXd::Ones(10,10) + c;
- c = MatrixXd::Ones(10,10) * 1.0 + c;
- c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
- c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
- }
-#endif
+ CALL_SUBTEST_13( adjoint_extra<0>() );
}
diff --git a/test/array.cpp b/test/array.cpp
index f7f3ba780..c01653668 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -11,7 +11,6 @@
template<typename ArrayType> void array(const ArrayType& m)
{
- typedef typename ArrayType::Index Index;
typedef typename ArrayType::Scalar Scalar;
typedef typename ArrayType::RealScalar RealScalar;
typedef Array<Scalar, ArrayType::RowsAtCompileTime, 1> ColVectorType;
@@ -130,7 +129,6 @@ template<typename ArrayType> void array(const ArrayType& m)
template<typename ArrayType> void comparisons(const ArrayType& m)
{
using std::abs;
- typedef typename ArrayType::Index Index;
typedef typename ArrayType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -197,7 +195,7 @@ template<typename ArrayType> void comparisons(const ArrayType& m)
RealScalar a = m1.abs().mean();
VERIFY( (m1<-a || m1>a).count() == (m1.abs()>a).count());
- typedef Array<typename ArrayType::Index, Dynamic, 1> ArrayOfIndices;
+ typedef Array<Index, Dynamic, 1> ArrayOfIndices;
// TODO allows colwise/rowwise for array
VERIFY_IS_APPROX(((m1.abs()+1)>RealScalar(0.1)).colwise().count(), ArrayOfIndices::Constant(cols,rows).transpose());
@@ -208,7 +206,6 @@ template<typename ArrayType> void array_real(const ArrayType& m)
{
using std::abs;
using std::sqrt;
- typedef typename ArrayType::Index Index;
typedef typename ArrayType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -322,7 +319,6 @@ template<typename ArrayType> void array_real(const ArrayType& m)
template<typename ArrayType> void array_complex(const ArrayType& m)
{
- typedef typename ArrayType::Index Index;
typedef typename ArrayType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -427,7 +423,6 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
template<typename ArrayType> void min_max(const ArrayType& m)
{
- typedef typename ArrayType::Index Index;
typedef typename ArrayType::Scalar Scalar;
Index rows = m.rows();
@@ -454,7 +449,7 @@ template<typename ArrayType> void min_max(const ArrayType& m)
}
-void test_array()
+EIGEN_DECLARE_TEST(array)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( array(Array<float, 1, 1>()) );
diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
index b8721391f..6b03abb10 100644
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -11,7 +11,6 @@
template<typename MatrixType> void array_for_matrix(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVectorType;
typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
@@ -83,7 +82,6 @@ template<typename MatrixType> void array_for_matrix(const MatrixType& m)
template<typename MatrixType> void comparisons(const MatrixType& m)
{
using std::abs;
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -140,7 +138,7 @@ template<typename MatrixType> void comparisons(const MatrixType& m)
RealScalar a = m1.cwiseAbs().mean();
VERIFY( ((m1.array()<-a).matrix() || (m1.array()>a).matrix()).count() == (m1.cwiseAbs().array()>a).count());
- typedef Matrix<typename MatrixType::Index, Dynamic, 1> VectorOfIndices;
+ typedef Matrix<Index, Dynamic, 1> VectorOfIndices;
// TODO allows colwise/rowwise for array
VERIFY_IS_APPROX(((m1.array().abs()+1)>RealScalar(0.1)).matrix().colwise().count(), VectorOfIndices::Constant(cols,rows).transpose());
@@ -172,7 +170,6 @@ template<typename VectorType> void lpNorm(const VectorType& v)
template<typename MatrixType> void cwise_min_max(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index rows = m.rows();
@@ -211,7 +208,6 @@ template<typename MatrixType> void cwise_min_max(const MatrixType& m)
template<typename MatrixTraits> void resize(const MatrixTraits& t)
{
- typedef typename MatrixTraits::Index Index;
typedef typename MatrixTraits::Scalar Scalar;
typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
typedef Array<Scalar,Dynamic,Dynamic> Array2DType;
@@ -260,7 +256,7 @@ void regrrssion_bug_1410()
VERIFY((internal::traits<MatrixWrapper<Array4i> >::Flags&LvalueBit)==LvalueBit);
}
-void test_array_for_matrix()
+EIGEN_DECLARE_TEST(array_for_matrix)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( array_for_matrix(Matrix<float, 1, 1>()) );
diff --git a/test/array_of_string.cpp b/test/array_of_string.cpp
index e23b7c59e..23e51529b 100644
--- a/test/array_of_string.cpp
+++ b/test/array_of_string.cpp
@@ -9,7 +9,7 @@
#include "main.h"
-void test_array_of_string()
+EIGEN_DECLARE_TEST(array_of_string)
{
typedef Array<std::string,1,Dynamic> ArrayXs;
ArrayXs a1(3), a2(3), a3(3), a3ref(3);
diff --git a/test/array_replicate.cpp b/test/array_replicate.cpp
index 779c8fc2f..057c3c77b 100644
--- a/test/array_replicate.cpp
+++ b/test/array_replicate.cpp
@@ -14,7 +14,6 @@ template<typename MatrixType> void replicate(const MatrixType& m)
/* this test covers the following files:
Replicate.cpp
*/
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
typedef Matrix<Scalar, Dynamic, Dynamic> MatrixX;
@@ -69,7 +68,7 @@ template<typename MatrixType> void replicate(const MatrixType& m)
VERIFY_IS_APPROX(vx1, v1.colwise().replicate(f2));
}
-void test_array_replicate()
+EIGEN_DECLARE_TEST(array_replicate)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( replicate(Matrix<float, 1, 1>()) );
diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp
index c9d9f90c3..e23159def 100644
--- a/test/array_reverse.cpp
+++ b/test/array_reverse.cpp
@@ -15,7 +15,6 @@ using namespace std;
template<typename MatrixType> void reverse(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
@@ -124,7 +123,16 @@ template<typename MatrixType> void reverse(const MatrixType& m)
VERIFY_IS_APPROX(x, m1(r, cols - 1 - c));
}
-void test_array_reverse()
+template<int>
+void array_reverse_extra()
+{
+ Vector4f x; x << 1, 2, 3, 4;
+ Vector4f y; y << 4, 3, 2, 1;
+ VERIFY(x.reverse()[1] == 3);
+ VERIFY(x.reverse() == y);
+}
+
+EIGEN_DECLARE_TEST(array_reverse)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( reverse(Matrix<float, 1, 1>()) );
@@ -137,10 +145,5 @@ void test_array_reverse()
CALL_SUBTEST_8( reverse(Matrix<float, 100, 100>()) );
CALL_SUBTEST_9( reverse(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
-#ifdef EIGEN_TEST_PART_3
- Vector4f x; x << 1, 2, 3, 4;
- Vector4f y; y << 4, 3, 2, 1;
- VERIFY(x.reverse()[1] == 3);
- VERIFY(x.reverse() == y);
-#endif
+ CALL_SUBTEST_3( array_reverse_extra<0>() );
}
diff --git a/test/bandmatrix.cpp b/test/bandmatrix.cpp
index f8c38f7c3..66a1b0db4 100644
--- a/test/bandmatrix.cpp
+++ b/test/bandmatrix.cpp
@@ -59,7 +59,7 @@ template<typename MatrixType> void bandmatrix(const MatrixType& _m)
using Eigen::internal::BandMatrix;
-void test_bandmatrix()
+EIGEN_DECLARE_TEST(bandmatrix)
{
for(int i = 0; i < 10*g_repeat ; i++) {
Index rows = internal::random<Index>(1,10);
diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp
index c346ce6cb..85af603d8 100644
--- a/test/basicstuff.cpp
+++ b/test/basicstuff.cpp
@@ -13,7 +13,6 @@
template<typename MatrixType> void basicStuff(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
@@ -124,22 +123,22 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
// check automatic transposition
sm2.setZero();
- for(typename MatrixType::Index i=0;i<rows;++i)
+ for(Index i=0;i<rows;++i)
sm2.col(i) = sm1.row(i);
VERIFY_IS_APPROX(sm2,sm1.transpose());
sm2.setZero();
- for(typename MatrixType::Index i=0;i<rows;++i)
+ for(Index i=0;i<rows;++i)
sm2.col(i).noalias() = sm1.row(i);
VERIFY_IS_APPROX(sm2,sm1.transpose());
sm2.setZero();
- for(typename MatrixType::Index i=0;i<rows;++i)
+ for(Index i=0;i<rows;++i)
sm2.col(i).noalias() += sm1.row(i);
VERIFY_IS_APPROX(sm2,sm1.transpose());
sm2.setZero();
- for(typename MatrixType::Index i=0;i<rows;++i)
+ for(Index i=0;i<rows;++i)
sm2.col(i).noalias() -= sm1.row(i);
VERIFY_IS_APPROX(sm2,-sm1.transpose());
@@ -160,7 +159,6 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
template<typename MatrixType> void basicStuffComplex(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef Matrix<RealScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> RealMatrixType;
@@ -196,7 +194,7 @@ template<typename MatrixType> void basicStuffComplex(const MatrixType& m)
VERIFY(!static_cast<const MatrixType&>(cm).imag().isZero());
}
-#ifdef EIGEN_TEST_PART_2
+template<int>
void casting()
{
Matrix4f m = Matrix4f::Random(), m2;
@@ -205,7 +203,6 @@ void casting()
m2 = m.cast<float>(); // check the specialization when NewType == Type
VERIFY(m.isApprox(m2));
}
-#endif
template <typename Scalar>
void fixedSizeMatrixConstruction()
@@ -270,7 +267,7 @@ void fixedSizeMatrixConstruction()
}
}
-void test_basicstuff()
+EIGEN_DECLARE_TEST(basicstuff)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( basicStuff(Matrix<float, 1, 1>()) );
@@ -292,5 +289,5 @@ void test_basicstuff()
CALL_SUBTEST_1(fixedSizeMatrixConstruction<long int>());
CALL_SUBTEST_1(fixedSizeMatrixConstruction<std::ptrdiff_t>());
- CALL_SUBTEST_2(casting());
+ CALL_SUBTEST_2(casting<0>());
}
diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp
index 6c7b09696..3065ff015 100644
--- a/test/bdcsvd.cpp
+++ b/test/bdcsvd.cpp
@@ -62,7 +62,7 @@ void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computa
if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
}
-void test_bdcsvd()
+EIGEN_DECLARE_TEST(bdcsvd)
{
CALL_SUBTEST_3(( svd_verify_assert<BDCSVD<Matrix3f> >(Matrix3f()) ));
CALL_SUBTEST_4(( svd_verify_assert<BDCSVD<Matrix4d> >(Matrix4d()) ));
@@ -104,7 +104,7 @@ void test_bdcsvd()
CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
// Check that preallocation avoids subsequent mallocs
- // Disbaled because not supported by BDCSVD
+ // Disabled because not supported by BDCSVD
// CALL_SUBTEST_9( svd_preallocate<void>() );
CALL_SUBTEST_2( svd_underoverflow<void>() );
diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp
index 4cc0dd31c..89d6a45ef 100644
--- a/test/bicgstab.cpp
+++ b/test/bicgstab.cpp
@@ -26,7 +26,7 @@ template<typename T, typename I> void test_bicgstab_T()
//CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_ssor) );
}
-void test_bicgstab()
+EIGEN_DECLARE_TEST(bicgstab)
{
CALL_SUBTEST_1((test_bicgstab_T<double,int>()) );
CALL_SUBTEST_2((test_bicgstab_T<std::complex<double>, int>()));
diff --git a/test/block.cpp b/test/block.cpp
index 8c4dd87be..ca13539a9 100644
--- a/test/block.cpp
+++ b/test/block.cpp
@@ -39,7 +39,6 @@ is_same_block(const T1& a, const T2& b)
template<typename MatrixType> void block(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
@@ -162,9 +161,18 @@ template<typename MatrixType> void block(const MatrixType& m)
// expressions without direct access
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+ VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+ VERIFY_IS_APPROX( ((m1+m2).template block<Dynamic,1>(r1,c1,r2-r1+1,1)) , ((m1+m2).eval().col(c1).eval().segment(r1,r2-r1+1)) );
+ VERIFY_IS_APPROX( ((m1+m2).template block<1,Dynamic>(r1,c1,1,c2-c1+1)) , ((m1+m2).eval().row(r1).eval().segment(c1,c2-c1+1)) );
+ VERIFY_IS_APPROX( ((m1+m2).transpose().template block<1,Dynamic>(c1,r1,1,r2-r1+1)) , ((m1+m2).eval().col(c1).eval().segment(r1,r2-r1+1)).transpose() );
+ VERIFY_IS_APPROX( (m1+m2).row(r1).eval(), (m1+m2).eval().row(r1) );
+ VERIFY_IS_APPROX( (m1+m2).adjoint().col(r1).eval(), (m1+m2).adjoint().eval().col(r1) );
+ VERIFY_IS_APPROX( (m1+m2).adjoint().row(c1).eval(), (m1+m2).adjoint().eval().row(c1) );
+ VERIFY_IS_APPROX( (m1*1).row(r1).segment(c1,c2-c1+1).eval(), m1.row(r1).eval().segment(c1,c2-c1+1).eval() );
+ VERIFY_IS_APPROX( m1.col(c1).reverse().segment(r1,r2-r1+1).eval(),m1.col(c1).reverse().eval().segment(r1,r2-r1+1).eval() );
VERIFY_IS_APPROX( (m1*1).topRows(r1), m1.topRows(r1) );
VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) );
@@ -218,7 +226,6 @@ template<typename MatrixType> void block(const MatrixType& m)
template<typename MatrixType>
void compare_using_data_and_stride(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
Index rows = m.rows();
Index cols = m.cols();
Index size = m.size();
@@ -252,7 +259,6 @@ void compare_using_data_and_stride(const MatrixType& m)
template<typename MatrixType>
void data_and_stride(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
Index rows = m.rows();
Index cols = m.cols();
@@ -270,7 +276,7 @@ void data_and_stride(const MatrixType& m)
compare_using_data_and_stride(m1.col(c1).transpose());
}
-void test_block()
+EIGEN_DECLARE_TEST(block)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( block(Matrix<float, 1, 1>()) );
diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp
index bafbd3b96..1588eb86d 100644
--- a/test/boostmultiprec.cpp
+++ b/test/boostmultiprec.cpp
@@ -145,7 +145,7 @@ namespace Eigen {
}
-void test_boostmultiprec()
+EIGEN_DECLARE_TEST(boostmultiprec)
{
typedef Matrix<Real,Dynamic,Dynamic> Mat;
typedef Matrix<std::complex<Real>,Dynamic,Dynamic> MatC;
diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index b4b6bda7d..b871351e0 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -57,7 +57,6 @@ template<typename MatrixType,template <typename,int> class CholType> void test_c
template<typename MatrixType> void cholesky(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
/* this test covers the following files:
LLT.h LDLT.h
*/
@@ -289,8 +288,6 @@ template<typename MatrixType> void cholesky_cplx(const MatrixType& m)
// test mixing real/scalar types
- typedef typename MatrixType::Index Index;
-
Index rows = m.rows();
Index cols = m.cols();
@@ -491,7 +488,7 @@ template<typename MatrixType> void cholesky_verify_assert()
VERIFY_RAISES_ASSERT(ldlt.solveInPlace(&tmp))
}
-void test_cholesky()
+EIGEN_DECLARE_TEST(cholesky)
{
int s = 0;
for(int i = 0; i < g_repeat; i++) {
diff --git a/test/cholmod_support.cpp b/test/cholmod_support.cpp
index 931207334..89b9cf41e 100644
--- a/test/cholmod_support.cpp
+++ b/test/cholmod_support.cpp
@@ -55,7 +55,7 @@ template<typename T, int flags, typename IdxType> void test_cholmod_T()
test_cholmod_ST<SparseMatrix<T, flags, IdxType> >();
}
-void test_cholmod_support()
+EIGEN_DECLARE_TEST(cholmod_support)
{
CALL_SUBTEST_11( (test_cholmod_T<double , ColMajor, int >()) );
CALL_SUBTEST_12( (test_cholmod_T<double , ColMajor, long>()) );
diff --git a/test/commainitializer.cpp b/test/commainitializer.cpp
index 9844adbd2..3cb94da62 100644
--- a/test/commainitializer.cpp
+++ b/test/commainitializer.cpp
@@ -65,7 +65,7 @@ struct test_block_recursion<-1>
static void run() { }
};
-void test_commainitializer()
+EIGEN_DECLARE_TEST(commainitializer)
{
Matrix3d m3;
Matrix4d m4;
diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp
index 9622fd86d..47a4ca707 100644
--- a/test/conjugate_gradient.cpp
+++ b/test/conjugate_gradient.cpp
@@ -26,7 +26,7 @@ template<typename T, typename I> void test_conjugate_gradient_T()
CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I) );
}
-void test_conjugate_gradient()
+EIGEN_DECLARE_TEST(conjugate_gradient)
{
CALL_SUBTEST_1(( test_conjugate_gradient_T<double,int>() ));
CALL_SUBTEST_2(( test_conjugate_gradient_T<std::complex<double>, int>() ));
diff --git a/test/conservative_resize.cpp b/test/conservative_resize.cpp
index 498421b4c..5dc500068 100644
--- a/test/conservative_resize.cpp
+++ b/test/conservative_resize.cpp
@@ -10,6 +10,7 @@
#include "main.h"
#include <Eigen/Core>
+#include "AnnoyingScalar.h"
using namespace Eigen;
@@ -17,7 +18,6 @@ template <typename Scalar, int Storage>
void run_matrix_tests()
{
typedef Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Storage> MatrixType;
- typedef typename MatrixType::Index Index;
MatrixType m, n;
@@ -110,7 +110,31 @@ void run_vector_tests()
}
}
-void test_conservative_resize()
+// Basic memory leak check with a non-copyable scalar type
+template<int> void noncopyable()
+{
+ typedef Eigen::Matrix<AnnoyingScalar,Dynamic,1> VectorType;
+ typedef Eigen::Matrix<AnnoyingScalar,Dynamic,Dynamic> MatrixType;
+
+ {
+ AnnoyingScalar::dont_throw = true;
+ int n = 50;
+ VectorType v0(n), v1(n);
+ MatrixType m0(n,n), m1(n,n), m2(n,n);
+ v0.setOnes(); v1.setOnes();
+ m0.setOnes(); m1.setOnes(); m2.setOnes();
+ VERIFY(m0==m1);
+ m0.conservativeResize(2*n,2*n);
+ VERIFY(m0.topLeftCorner(n,n) == m1);
+
+ VERIFY(v0.head(n) == v1);
+ v0.conservativeResize(2*n);
+ VERIFY(v0.head(n) == v1);
+ }
+ VERIFY(AnnoyingScalar::instances==0 && "global memory leak detected in noncopyable");
+}
+
+EIGEN_DECLARE_TEST(conservative_resize)
{
for(int i=0; i<g_repeat; ++i)
{
@@ -123,12 +147,16 @@ void test_conservative_resize()
CALL_SUBTEST_4((run_matrix_tests<std::complex<float>, Eigen::RowMajor>()));
CALL_SUBTEST_4((run_matrix_tests<std::complex<float>, Eigen::ColMajor>()));
CALL_SUBTEST_5((run_matrix_tests<std::complex<double>, Eigen::RowMajor>()));
- CALL_SUBTEST_6((run_matrix_tests<std::complex<double>, Eigen::ColMajor>()));
+ CALL_SUBTEST_5((run_matrix_tests<std::complex<double>, Eigen::ColMajor>()));
CALL_SUBTEST_1((run_vector_tests<int>()));
CALL_SUBTEST_2((run_vector_tests<float>()));
CALL_SUBTEST_3((run_vector_tests<double>()));
CALL_SUBTEST_4((run_vector_tests<std::complex<float> >()));
CALL_SUBTEST_5((run_vector_tests<std::complex<double> >()));
+
+ AnnoyingScalar::dont_throw = true;
+ CALL_SUBTEST_6(( run_vector_tests<AnnoyingScalar>() ));
+ CALL_SUBTEST_6(( noncopyable<0>() ));
}
}
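
The new noncopyable subtest leans on AnnoyingScalar (defined in test/AnnoyingScalar.h, which is not shown in this diff) keeping a static count of live instances, so a conservativeResize() that constructs or destroys the wrong number of elements leaves the counter non-zero at scope exit. The dummy template<int> parameter keeps the body uninstantiated unless the matching CALL_SUBTEST references it. A minimal sketch of the counting idiom, with illustrative names only:

#include <cassert>

struct CountedScalar {
  static int instances;                    // number of currently live objects
  float v;
  CountedScalar() : v(0) { ++instances; }
  CountedScalar(float x) : v(x) { ++instances; }
  CountedScalar(const CountedScalar& o) : v(o.v) { ++instances; }
  ~CountedScalar() { --instances; }
};
int CountedScalar::instances = 0;

int main() {
  {
    CountedScalar a(1.0f), b(a);
    assert(CountedScalar::instances == 2); // both objects alive
  }
  assert(CountedScalar::instances == 0);   // every constructor matched by a destructor
}
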
diff --git a/test/constructor.cpp b/test/constructor.cpp
index eec9e2192..1dd3bc3c0 100644
--- a/test/constructor.cpp
+++ b/test/constructor.cpp
@@ -37,7 +37,7 @@ template<typename MatrixType> void ctor_init1(const MatrixType& m)
}
-void test_constructor()
+EIGEN_DECLARE_TEST(constructor)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( ctor_init1(Matrix<float, 1, 1>()) );
diff --git a/test/corners.cpp b/test/corners.cpp
index 3c64c32a1..73342a8dd 100644
--- a/test/corners.cpp
+++ b/test/corners.cpp
@@ -15,7 +15,6 @@
template<typename MatrixType> void corners(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
Index rows = m.rows();
Index cols = m.cols();
@@ -102,7 +101,7 @@ template<typename MatrixType, int CRows, int CCols, int SRows, int SCols> void c
VERIFY_IS_EQUAL((const_matrix.template rightCols<c>()), (const_matrix.template block<rows,c>(0,cols-c)));
}
-void test_corners()
+EIGEN_DECLARE_TEST(corners)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( corners(Matrix<float, 1, 1>()) );
diff --git a/test/ctorleak.cpp b/test/ctorleak.cpp
index c158f5e4e..7202e90dd 100644
--- a/test/ctorleak.cpp
+++ b/test/ctorleak.cpp
@@ -33,7 +33,7 @@ Index Foo::object_limit = 0;
#undef EIGEN_TEST_MAX_SIZE
#define EIGEN_TEST_MAX_SIZE 3
-void test_ctorleak()
+EIGEN_DECLARE_TEST(ctorleak)
{
typedef Matrix<Foo, Dynamic, Dynamic> MatrixX;
typedef Matrix<Foo, Dynamic, 1> VectorX;
diff --git a/test/cuda_common.h b/test/cuda_common.h
deleted file mode 100644
index 9737693ac..000000000
--- a/test/cuda_common.h
+++ /dev/null
@@ -1,101 +0,0 @@
-
-#ifndef EIGEN_TEST_CUDA_COMMON_H
-#define EIGEN_TEST_CUDA_COMMON_H
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cuda_runtime_api.h>
-#include <iostream>
-
-#ifndef __CUDACC__
-dim3 threadIdx, blockDim, blockIdx;
-#endif
-
-template<typename Kernel, typename Input, typename Output>
-void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
-{
- for(int i=0; i<n; i++)
- ker(i, in.data(), out.data());
-}
-
-
-template<typename Kernel, typename Input, typename Output>
-__global__
-void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
-{
- int i = threadIdx.x + blockIdx.x*blockDim.x;
- if(i<n) {
- ker(i, in, out);
- }
-}
-
-
-template<typename Kernel, typename Input, typename Output>
-void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out)
-{
- typename Input::Scalar* d_in;
- typename Output::Scalar* d_out;
- std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar);
- std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
-
- cudaMalloc((void**)(&d_in), in_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
-
- cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice);
-
- // Simple and non-optimal 1D mapping assuming n is not too large
- // That's only for unit testing!
- dim3 Blocks(128);
- dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
-
- cudaThreadSynchronize();
- run_on_cuda_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
- cudaThreadSynchronize();
-
- // check inputs have not been modified
- cudaMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, cudaMemcpyDeviceToHost);
- cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost);
-
- cudaFree(d_in);
- cudaFree(d_out);
-}
-
-
-template<typename Kernel, typename Input, typename Output>
-void run_and_compare_to_cuda(const Kernel& ker, int n, const Input& in, Output& out)
-{
- Input in_ref, in_cuda;
- Output out_ref, out_cuda;
- #ifndef __CUDA_ARCH__
- in_ref = in_cuda = in;
- out_ref = out_cuda = out;
- #endif
- run_on_cpu (ker, n, in_ref, out_ref);
- run_on_cuda(ker, n, in_cuda, out_cuda);
- #ifndef __CUDA_ARCH__
- VERIFY_IS_APPROX(in_ref, in_cuda);
- VERIFY_IS_APPROX(out_ref, out_cuda);
- #endif
-}
-
-
-void ei_test_init_cuda()
-{
- int device = 0;
- cudaDeviceProp deviceProp;
- cudaGetDeviceProperties(&deviceProp, device);
- std::cout << "CUDA device info:\n";
- std::cout << " name: " << deviceProp.name << "\n";
- std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n";
- std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n";
- std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
- std::cout << " warpSize: " << deviceProp.warpSize << "\n";
- std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n";
- std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n";
- std::cout << " clockRate: " << deviceProp.clockRate << "\n";
- std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n";
- std::cout << " computeMode: " << deviceProp.computeMode << "\n";
-}
-
-#endif // EIGEN_TEST_CUDA_COMMON_H
diff --git a/test/denseLM.cpp b/test/denseLM.cpp
index 0aa736ea3..afb8004b1 100644
--- a/test/denseLM.cpp
+++ b/test/denseLM.cpp
@@ -182,7 +182,7 @@ void test_denseLM_T()
}
-void test_denseLM()
+EIGEN_DECLARE_TEST(denseLM)
{
CALL_SUBTEST_2(test_denseLM_T<double>());
diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp
index e63712b1a..1150ec52b 100644
--- a/test/dense_storage.cpp
+++ b/test/dense_storage.cpp
@@ -52,7 +52,7 @@ void dense_storage_assignment()
VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
}
-void test_dense_storage()
+EIGEN_DECLARE_TEST(dense_storage)
{
dense_storage_copy<int,Dynamic,Dynamic>();
dense_storage_copy<int,Dynamic,3>();
diff --git a/test/determinant.cpp b/test/determinant.cpp
index 758f3afbb..7dd33c373 100644
--- a/test/determinant.cpp
+++ b/test/determinant.cpp
@@ -16,7 +16,6 @@ template<typename MatrixType> void determinant(const MatrixType& m)
/* this test covers the following files:
Determinant.h
*/
- typedef typename MatrixType::Index Index;
Index size = m.rows();
MatrixType m1(size, size), m2(size, size);
@@ -51,7 +50,7 @@ template<typename MatrixType> void determinant(const MatrixType& m)
VERIFY_IS_APPROX(m2.block(0,0,0,0).determinant(), Scalar(1));
}
-void test_determinant()
+EIGEN_DECLARE_TEST(determinant)
{
for(int i = 0; i < g_repeat; i++) {
int s = 0;
diff --git a/test/diagonal.cpp b/test/diagonal.cpp
index c1546e97d..4e8c4b3c9 100644
--- a/test/diagonal.cpp
+++ b/test/diagonal.cpp
@@ -11,7 +11,6 @@
template<typename MatrixType> void diagonal(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index rows = m.rows();
@@ -66,6 +65,9 @@ template<typename MatrixType> void diagonal(const MatrixType& m)
m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1;
VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1);
}
+
+ VERIFY( m1.diagonal( cols).size()==0 );
+ VERIFY( m1.diagonal(-rows).size()==0 );
}
template<typename MatrixType> void diagonal_assert(const MatrixType& m) {
@@ -81,9 +83,12 @@ template<typename MatrixType> void diagonal_assert(const MatrixType& m) {
VERIFY_RAISES_ASSERT( m1.array() *= m1.diagonal().array() );
VERIFY_RAISES_ASSERT( m1.array() /= m1.diagonal().array() );
}
+
+ VERIFY_RAISES_ASSERT( m1.diagonal(cols+1) );
+ VERIFY_RAISES_ASSERT( m1.diagonal(-(rows+1)) );
}
-void test_diagonal()
+EIGEN_DECLARE_TEST(diagonal)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( diagonal(Matrix<float, 1, 1>()) );
@@ -95,7 +100,6 @@ void test_diagonal()
CALL_SUBTEST_2( diagonal(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_1( diagonal(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_1( diagonal(Matrix<float,Dynamic,4>(3, 4)) );
+ CALL_SUBTEST_1( diagonal_assert(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
-
- CALL_SUBTEST_1( diagonal_assert(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
}
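
The two new checks pin down the boundary behavior of diagonal(index) for a rows x cols matrix: the offsets cols and -rows still denote valid (empty) diagonals, while anything beyond them triggers an assertion. A small illustration, assuming a 3x4 matrix:

#include <Eigen/Dense>
#include <cassert>

int main() {
  Eigen::MatrixXf m(3, 4);
  assert(m.diagonal(4).size() == 0);   // offset == cols: valid but empty
  assert(m.diagonal(-3).size() == 0);  // offset == -rows: valid but empty
  // m.diagonal(5) and m.diagonal(-4) would assert, as diagonal_assert() verifies
}
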
diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
index cd6dc8cf0..ba58ca8d1 100644
--- a/test/diagonalmatrices.cpp
+++ b/test/diagonalmatrices.cpp
@@ -11,7 +11,6 @@
using namespace std;
template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
typedef Matrix<Scalar, Rows, 1> VectorType;
@@ -30,6 +29,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
v2 = VectorType::Random(rows);
RowVectorType rv1 = RowVectorType::Random(cols),
rv2 = RowVectorType::Random(cols);
+
LeftDiagonalMatrix ldm1(v1), ldm2(v2);
RightDiagonalMatrix rdm1(rv1), rdm2(rv2);
@@ -99,6 +99,38 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() );
VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() );
VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() );
+
+ sq_m1.setRandom();
+ sq_m2 = v1.asDiagonal();
+ sq_m2 = sq_m1 * sq_m2;
+ VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) );
+ VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
+}
+
+template<typename MatrixType> void as_scalar_product(const MatrixType& m)
+{
+ typedef typename MatrixType::Scalar Scalar;
+ typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+ typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType;
+ typedef Matrix<Scalar, Dynamic, 1> DynVectorType;
+ typedef Matrix<Scalar, 1, Dynamic> DynRowVectorType;
+
+ Index rows = m.rows();
+ Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+
+ VectorType v1 = VectorType::Random(rows);
+ DynVectorType dv1 = DynVectorType::Random(depth);
+ DynRowVectorType drv1 = DynRowVectorType::Random(depth);
+ DynMatrixType dm1 = dv1;
+ DynMatrixType drm1 = drv1;
+
+ Scalar s = v1(0);
+
+ VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
+ VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+
+ VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
+ VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
}
template<int>
@@ -112,18 +144,23 @@ void bug987()
VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() );
}
-void test_diagonalmatrices()
+EIGEN_DECLARE_TEST(diagonalmatrices)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( diagonalmatrices(Matrix<float, 1, 1>()) );
+ CALL_SUBTEST_1( as_scalar_product(Matrix<float, 1, 1>()) );
+
CALL_SUBTEST_2( diagonalmatrices(Matrix3f()) );
CALL_SUBTEST_3( diagonalmatrices(Matrix<double,3,3,RowMajor>()) );
CALL_SUBTEST_4( diagonalmatrices(Matrix4d()) );
CALL_SUBTEST_5( diagonalmatrices(Matrix<float,4,4,RowMajor>()) );
CALL_SUBTEST_6( diagonalmatrices(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+ CALL_SUBTEST_6( as_scalar_product(MatrixXcf(1,1)) );
CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_8( diagonalmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+ CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) );
+ CALL_SUBTEST_9( as_scalar_product(MatrixXf(1,1)) );
}
CALL_SUBTEST_10( bug987<0>() );
}
diff --git a/test/dontalign.cpp b/test/dontalign.cpp
index 4643cfed6..2e4102b86 100644
--- a/test/dontalign.cpp
+++ b/test/dontalign.cpp
@@ -19,7 +19,6 @@
template<typename MatrixType>
void dontalign(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
@@ -45,7 +44,7 @@ void dontalign(const MatrixType& m)
internal::aligned_delete(array, rows);
}
-void test_dontalign()
+EIGEN_DECLARE_TEST(dontalign)
{
#if defined EIGEN_TEST_PART_1 || defined EIGEN_TEST_PART_5
dontalign(Matrix3d());
diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp
index f1cc70bee..ceecd76e3 100644
--- a/test/dynalloc.cpp
+++ b/test/dynalloc.cpp
@@ -119,7 +119,7 @@ template<typename T> void check_custom_new_delete()
#endif
}
-void test_dynalloc()
+EIGEN_DECLARE_TEST(dynalloc)
{
// low level dynamic memory allocation
CALL_SUBTEST(check_handmade_aligned_malloc());
diff --git a/test/eigen2support.cpp b/test/eigen2support.cpp
index ad1d98091..49d7328e9 100644
--- a/test/eigen2support.cpp
+++ b/test/eigen2support.cpp
@@ -13,7 +13,6 @@
template<typename MatrixType> void eigen2support(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index rows = m.rows();
@@ -53,7 +52,7 @@ template<typename MatrixType> void eigen2support(const MatrixType& m)
m1.minor(0,0);
}
-void test_eigen2support()
+EIGEN_DECLARE_TEST(eigen2support)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( eigen2support(Matrix<double,1,1>()) );
diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp
index 293b1b265..c5373f420 100644
--- a/test/eigensolver_complex.cpp
+++ b/test/eigensolver_complex.cpp
@@ -47,7 +47,7 @@ template<typename MatrixType> bool find_pivot(typename MatrixType::Scalar tol, M
return false;
}
-/* Check that two column vectors are approximately equal upto permutations.
+/* Check that two column vectors are approximately equal up to permutations.
* Initially, this method checked that the k-th power sums are equal for all k = 1, ..., vec1.rows(),
 * however this strategy is numerically inaccurate because of numerical cancellation issues.
*/
@@ -71,7 +71,6 @@ void verify_is_approx_upto_permutation(const VectorType& vec1, const VectorType&
template<typename MatrixType> void eigensolver(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
/* this test covers the following files:
ComplexEigenSolver.h, and indirectly ComplexSchur.h
*/
@@ -153,7 +152,7 @@ template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m
VERIFY_RAISES_ASSERT(eig.eigenvectors());
}
-void test_eigensolver_complex()
+EIGEN_DECLARE_TEST(eigensolver_complex)
{
int s = 0;
for(int i = 0; i < g_repeat; i++) {
diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp
index 9c0838ba4..95ed431db 100644
--- a/test/eigensolver_generalized_real.cpp
+++ b/test/eigensolver_generalized_real.cpp
@@ -15,7 +15,6 @@
template<typename MatrixType> void generalized_eigensolver_real(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
/* this test covers the following files:
GeneralizedEigenSolver.h
*/
@@ -77,9 +76,16 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
GeneralizedEigenSolver<MatrixType> eig2(a.adjoint() * a,b.adjoint() * b);
eig2.compute(a.adjoint() * a,b.adjoint() * b);
}
+
+ // check without eigenvectors
+ {
+ GeneralizedEigenSolver<MatrixType> eig1(spdA, spdB, true);
+ GeneralizedEigenSolver<MatrixType> eig2(spdA, spdB, false);
+ VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
+ }
}
-void test_eigensolver_generalized_real()
+EIGEN_DECLARE_TEST(eigensolver_generalized_real)
{
for(int i = 0; i < g_repeat; i++) {
int s = 0;
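
The new subtest verifies that the eigenvalues are unchanged when eigenvector computation is disabled; the third constructor argument of GeneralizedEigenSolver is the computeEigenvectors flag. A small usage sketch along the lines of the test:

#include <Eigen/Eigenvalues>
#include <iostream>

int main() {
  Eigen::MatrixXd a = Eigen::MatrixXd::Random(4, 4), b = Eigen::MatrixXd::Random(4, 4);
  Eigen::MatrixXd spdA = a.adjoint() * a, spdB = b.adjoint() * b;          // SPD inputs, as in the test
  Eigen::GeneralizedEigenSolver<Eigen::MatrixXd> eig1(spdA, spdB, true);   // with eigenvectors
  Eigen::GeneralizedEigenSolver<Eigen::MatrixXd> eig2(spdA, spdB, false);  // eigenvalues only
  std::cout << (eig1.eigenvalues() - eig2.eigenvalues()).norm() << "\n";   // expect ~0
}
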
diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp
index d0e644d4b..e0e435151 100644
--- a/test/eigensolver_generic.cpp
+++ b/test/eigensolver_generic.cpp
@@ -14,7 +14,6 @@
template<typename MatrixType> void eigensolver(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
/* this test covers the following files:
EigenSolver.h
*/
@@ -101,7 +100,35 @@ template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m
VERIFY_RAISES_ASSERT(eig.pseudoEigenvectors());
}
-void test_eigensolver_generic()
+template<int>
+void eigensolver_generic_extra()
+{
+ {
+ // regression test for bug 793
+ MatrixXd a(3,3);
+ a << 0, 0, 1,
+ 1, 1, 1,
+ 1, 1e+200, 1;
+ Eigen::EigenSolver<MatrixXd> eig(a);
+ double scale = 1e-200; // scale to avoid overflow during the comparisons
+ VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
+ VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
+ }
+ {
+    // check a case where all eigenvalues are zero.
+ MatrixXd a(2,2);
+ a << 1, 1,
+ -1, -1;
+ Eigen::EigenSolver<MatrixXd> eig(a);
+ VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
+ VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
+ VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
+ VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
+ VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
+ }
+}
+
+EIGEN_DECLARE_TEST(eigensolver_generic)
{
int s = 0;
for(int i = 0; i < g_repeat; i++) {
@@ -136,31 +163,7 @@ void test_eigensolver_generic()
}
);
-#ifdef EIGEN_TEST_PART_2
- {
- // regression test for bug 793
- MatrixXd a(3,3);
- a << 0, 0, 1,
- 1, 1, 1,
- 1, 1e+200, 1;
- Eigen::EigenSolver<MatrixXd> eig(a);
- double scale = 1e-200; // scale to avoid overflow during the comparisons
- VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
- VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
- }
- {
- // check a case where all eigenvalues are null.
- MatrixXd a(2,2);
- a << 1, 1,
- -1, -1;
- Eigen::EigenSolver<MatrixXd> eig(a);
- VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
- VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
- VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
- VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
- VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
- }
-#endif
+ CALL_SUBTEST_2( eigensolver_generic_extra<0>() );
TEST_SET_BUT_UNUSED_VARIABLE(s)
}
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index 39ad4130e..65b80c3fb 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -68,7 +68,6 @@ template<typename MatrixType> void selfadjointeigensolver_essential_check(const
template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
/* this test covers the following files:
EigenSolver.h, SelfAdjointEigenSolver.h (and indirectly: Tridiagonalization.h)
*/
@@ -231,7 +230,7 @@ void bug_1204()
SelfAdjointEigenSolver<Eigen::SparseMatrix<double> > eig(A);
}
-void test_eigensolver_selfadjoint()
+EIGEN_DECLARE_TEST(eigensolver_selfadjoint)
{
int s = 0;
for(int i = 0; i < g_repeat; i++) {
diff --git a/test/evaluators.cpp b/test/evaluators.cpp
index aed5a05a7..f4fdaf053 100644
--- a/test/evaluators.cpp
+++ b/test/evaluators.cpp
@@ -101,7 +101,7 @@ using namespace std;
#define VERIFY_IS_APPROX_EVALUATOR(DEST,EXPR) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (EXPR).eval());
#define VERIFY_IS_APPROX_EVALUATOR2(DEST,EXPR,REF) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (REF).eval());
-void test_evaluators()
+EIGEN_DECLARE_TEST(evaluators)
{
// Testing Matrix evaluator and Transpose
Vector2d v = Vector2d::Random();
diff --git a/test/exceptions.cpp b/test/exceptions.cpp
index b83fb82ba..3d93060ab 100644
--- a/test/exceptions.cpp
+++ b/test/exceptions.cpp
@@ -8,93 +8,34 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-// Various sanity tests with exceptions:
+// Various sanity tests with exceptions and a non-trivially-copyable scalar type:
// - no memory leak when a custom scalar type throws an exception
// - todo: complete the list of tests!
#define EIGEN_STACK_ALLOCATION_LIMIT 100000000
#include "main.h"
-
-struct my_exception
-{
- my_exception() {}
- ~my_exception() {}
-};
-
-class ScalarWithExceptions
-{
- public:
- ScalarWithExceptions() { init(); }
- ScalarWithExceptions(const float& _v) { init(); *v = _v; }
- ScalarWithExceptions(const ScalarWithExceptions& other) { init(); *v = *(other.v); }
- ~ScalarWithExceptions() {
- delete v;
- instances--;
- }
-
- void init() {
- v = new float;
- instances++;
- }
-
- ScalarWithExceptions operator+(const ScalarWithExceptions& other) const
- {
- countdown--;
- if(countdown<=0)
- throw my_exception();
- return ScalarWithExceptions(*v+*other.v);
- }
-
- ScalarWithExceptions operator-(const ScalarWithExceptions& other) const
- { return ScalarWithExceptions(*v-*other.v); }
-
- ScalarWithExceptions operator*(const ScalarWithExceptions& other) const
- { return ScalarWithExceptions((*v)*(*other.v)); }
-
- ScalarWithExceptions& operator+=(const ScalarWithExceptions& other)
- { *v+=*other.v; return *this; }
- ScalarWithExceptions& operator-=(const ScalarWithExceptions& other)
- { *v-=*other.v; return *this; }
- ScalarWithExceptions& operator=(const ScalarWithExceptions& other)
- { *v = *(other.v); return *this; }
-
- bool operator==(const ScalarWithExceptions& other) const
- { return *v==*other.v; }
- bool operator!=(const ScalarWithExceptions& other) const
- { return *v!=*other.v; }
-
- float* v;
- static int instances;
- static int countdown;
-};
-
-ScalarWithExceptions real(const ScalarWithExceptions &x) { return x; }
-ScalarWithExceptions imag(const ScalarWithExceptions & ) { return 0; }
-ScalarWithExceptions conj(const ScalarWithExceptions &x) { return x; }
-
-int ScalarWithExceptions::instances = 0;
-int ScalarWithExceptions::countdown = 0;
-
+#include "AnnoyingScalar.h"
#define CHECK_MEMLEAK(OP) { \
- ScalarWithExceptions::countdown = 100; \
- int before = ScalarWithExceptions::instances; \
- bool exception_thrown = false; \
- try { OP; } \
+ AnnoyingScalar::countdown = 100; \
+ int before = AnnoyingScalar::instances; \
+ bool exception_thrown = false; \
+ try { OP; } \
catch (my_exception) { \
exception_thrown = true; \
- VERIFY(ScalarWithExceptions::instances==before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
+ VERIFY(AnnoyingScalar::instances==before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
} \
- VERIFY(exception_thrown && " no exception thrown in " && EIGEN_MAKESTRING(OP)); \
+ VERIFY( (AnnoyingScalar::dont_throw) || (exception_thrown && " no exception thrown in " && EIGEN_MAKESTRING(OP)) ); \
}
-void memoryleak()
+EIGEN_DECLARE_TEST(exceptions)
{
- typedef Eigen::Matrix<ScalarWithExceptions,Dynamic,1> VectorType;
- typedef Eigen::Matrix<ScalarWithExceptions,Dynamic,Dynamic> MatrixType;
+ typedef Eigen::Matrix<AnnoyingScalar,Dynamic,1> VectorType;
+ typedef Eigen::Matrix<AnnoyingScalar,Dynamic,Dynamic> MatrixType;
{
+ AnnoyingScalar::dont_throw = false;
int n = 50;
VectorType v0(n), v1(n);
MatrixType m0(n,n), m1(n,n), m2(n,n);
@@ -104,10 +45,5 @@ void memoryleak()
CHECK_MEMLEAK(m2 = m0 * m1 * m2);
CHECK_MEMLEAK((v0+v1).dot(v0+v1));
}
- VERIFY(ScalarWithExceptions::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP)); \
-}
-
-void test_exceptions()
-{
- CALL_SUBTEST( memoryleak() );
+ VERIFY(AnnoyingScalar::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP));
}
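
The CHECK_MEMLEAK macro above arms AnnoyingScalar::countdown so that an arithmetic operator throws in the middle of the tested expression, then checks that the live-instance count returned to its prior value, i.e. that stack unwinding destroyed every temporary. The idiom reduced to plain C++, with an illustrative stand-in for the real scalar type:

#include <cassert>

struct my_exception {};

struct ThrowingScalar {
  static int instances;
  static int countdown;                  // operator+ throws once this hits zero
  float* v;
  ThrowingScalar(float x = 0) : v(new float(x)) { ++instances; }
  ThrowingScalar(const ThrowingScalar& o) : v(new float(*o.v)) { ++instances; }
  ~ThrowingScalar() { delete v; --instances; }
  ThrowingScalar operator+(const ThrowingScalar& o) const {
    if (--countdown <= 0) throw my_exception();
    return ThrowingScalar(*v + *o.v);
  }
};
int ThrowingScalar::instances = 0;
int ThrowingScalar::countdown = 0;

int main() {
  ThrowingScalar a(1), b(2);
  int before = ThrowingScalar::instances;
  ThrowingScalar::countdown = 1;         // fail on the first addition
  bool thrown = false;
  try { a + b; } catch (my_exception) { thrown = true; }
  assert(thrown);
  assert(ThrowingScalar::instances == before);  // no temporary leaked while unwinding
}
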
diff --git a/test/fastmath.cpp b/test/fastmath.cpp
index cc5db0746..c30f0a846 100644
--- a/test/fastmath.cpp
+++ b/test/fastmath.cpp
@@ -88,7 +88,7 @@ void check_inf_nan(bool dryrun) {
}
}
-void test_fastmath() {
+EIGEN_DECLARE_TEST(fastmath) {
std::cout << "*** float *** \n\n"; check_inf_nan<float>(true);
std::cout << "*** double ***\n\n"; check_inf_nan<double>(true);
std::cout << "*** long double *** \n\n"; check_inf_nan<long double>(true);
diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp
index ae2d4bc42..ed9945077 100644
--- a/test/first_aligned.cpp
+++ b/test/first_aligned.cpp
@@ -26,7 +26,7 @@ void test_none_aligned_helper(Scalar *array, int size)
struct some_non_vectorizable_type { float x; };
-void test_first_aligned()
+EIGEN_DECLARE_TEST(first_aligned)
{
EIGEN_ALIGN16 float array_float[100];
test_first_aligned_helper(array_float, 50);
diff --git a/test/geo_alignedbox.cpp b/test/geo_alignedbox.cpp
index 223ff5eea..c6c051ce4 100644
--- a/test/geo_alignedbox.cpp
+++ b/test/geo_alignedbox.cpp
@@ -33,7 +33,6 @@ template<typename BoxType> void alignedbox(const BoxType& _box)
/* this test covers the following files:
AlignedBox.h
*/
- typedef typename BoxType::Index Index;
typedef typename BoxType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
@@ -95,7 +94,6 @@ template<typename BoxType>
void alignedboxCastTests(const BoxType& _box)
{
// casting
- typedef typename BoxType::Index Index;
typedef typename BoxType::Scalar Scalar;
typedef Matrix<Scalar, BoxType::AmbientDimAtCompileTime, 1> VectorType;
@@ -171,7 +169,7 @@ void specificTest2()
}
-void test_geo_alignedbox()
+EIGEN_DECLARE_TEST(geo_alignedbox)
{
for(int i = 0; i < g_repeat; i++)
{
diff --git a/test/geo_eulerangles.cpp b/test/geo_eulerangles.cpp
index 932ebe773..693c627a9 100644
--- a/test/geo_eulerangles.cpp
+++ b/test/geo_eulerangles.cpp
@@ -103,7 +103,7 @@ template<typename Scalar> void eulerangles()
check_all_var(ea);
}
-void test_geo_eulerangles()
+EIGEN_DECLARE_TEST(geo_eulerangles)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( eulerangles<float>() );
diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp
index 2187c7bf9..9aebe6226 100644
--- a/test/geo_homogeneous.cpp
+++ b/test/geo_homogeneous.cpp
@@ -115,7 +115,7 @@ template<typename Scalar,int Size> void homogeneous(void)
VERIFY_IS_APPROX( (t2.template triangularView<Lower>() * v0.homogeneous()).eval(), (t2.template triangularView<Lower>()*hv0) );
}
-void test_geo_homogeneous()
+EIGEN_DECLARE_TEST(geo_homogeneous)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(( homogeneous<float,1>() ));
diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp
index 27892850d..a26709301 100644
--- a/test/geo_hyperplane.cpp
+++ b/test/geo_hyperplane.cpp
@@ -19,7 +19,6 @@ template<typename HyperplaneType> void hyperplane(const HyperplaneType& _plane)
Hyperplane.h
*/
using std::abs;
- typedef typename HyperplaneType::Index Index;
const Index dim = _plane.dim();
enum { Options = HyperplaneType::Options };
typedef typename HyperplaneType::Scalar Scalar;
@@ -181,7 +180,7 @@ template<typename Scalar> void hyperplane_alignment()
}
-void test_geo_hyperplane()
+EIGEN_DECLARE_TEST(geo_hyperplane)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( hyperplane(Hyperplane<float,2>()) );
diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp
index e178df257..b7b660740 100644
--- a/test/geo_orthomethods.cpp
+++ b/test/geo_orthomethods.cpp
@@ -115,7 +115,7 @@ template<typename Scalar, int Size> void orthomethods(int size=Size)
VERIFY_IS_APPROX(mcrossN3.row(i), matN3.row(i).cross(vec3));
}
-void test_geo_orthomethods()
+EIGEN_DECLARE_TEST(geo_orthomethods)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( orthomethods_3<float>() );
diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp
index 29c1b105c..7135c8fa5 100644
--- a/test/geo_parametrizedline.cpp
+++ b/test/geo_parametrizedline.cpp
@@ -19,7 +19,6 @@ template<typename LineType> void parametrizedline(const LineType& _line)
ParametrizedLine.h
*/
using std::abs;
- typedef typename LineType::Index Index;
const Index dim = _line.dim();
typedef typename LineType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -118,7 +117,7 @@ template<typename Scalar> void parametrizedline_alignment()
#endif
}
-void test_geo_parametrizedline()
+EIGEN_DECLARE_TEST(geo_parametrizedline)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( parametrizedline(ParametrizedLine<float,2>()) );
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index 8ee8fdb27..ed801c71b 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -12,6 +12,7 @@
#include <Eigen/Geometry>
#include <Eigen/LU>
#include <Eigen/SVD>
+#include "AnnoyingScalar.h"
template<typename T> T bounded_acos(T v)
{
@@ -85,7 +86,7 @@ template<typename Scalar, int Options> void quaternion(void)
if (refangle>Scalar(EIGEN_PI))
refangle = Scalar(2)*Scalar(EIGEN_PI) - refangle;
- if((q1.coeffs()-q2.coeffs()).norm() > 10*largeEps)
+ if((q1.coeffs()-q2.coeffs()).norm() > Scalar(10)*largeEps)
{
VERIFY_IS_MUCH_SMALLER_THAN(abs(q1.angularDistance(q2) - refangle), Scalar(1));
}
@@ -113,7 +114,7 @@ template<typename Scalar, int Options> void quaternion(void)
// Do not execute the test if the rotation angle is almost zero, or
// the rotation axis and v1 are almost parallel.
- if (abs(aa.angle()) > 5*test_precision<Scalar>()
+ if (abs(aa.angle()) > Scalar(5)*test_precision<Scalar>()
&& (aa.axis() - v1.normalized()).norm() < Scalar(1.99)
&& (aa.axis() + v1.normalized()).norm() < Scalar(1.99))
{
@@ -241,7 +242,7 @@ template<typename Scalar> void mapQuaternion(void){
const MQuaternionUA& cmq3(mq3);
VERIFY( &cmq3.x() == &mq3.x() );
// FIXME the following should be ok. The problem is that currently the LValueBit flag
- // is used to determine wether we can return a coeff by reference or not, which is not enough for Map<const ...>.
+ // is used to determine whether we can return a coeff by reference or not, which is not enough for Map<const ...>.
//const MCQuaternionUA& cmcq3(mcq3);
//VERIFY( &cmcq3.x() == &mcq3.x() );
}
@@ -285,18 +286,36 @@ template<typename PlainObjectType> void check_const_correctness(const PlainObjec
VERIFY( !(Map<ConstPlainObjectType, Aligned>::Flags & LvalueBit) );
}
-void test_geo_quaternion()
+#if EIGEN_HAS_RVALUE_REFERENCES
+
+// Regression for bug 1573
+struct MovableClass {
+ MovableClass() = default;
+ MovableClass(const MovableClass&) = default;
+ MovableClass(MovableClass&&) noexcept = default;
+ MovableClass& operator=(const MovableClass&) = default;
+ MovableClass& operator=(MovableClass&&) = default;
+ Quaternionf m_quat;
+};
+
+#endif
+
+EIGEN_DECLARE_TEST(geo_quaternion)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(( quaternion<float,AutoAlign>() ));
CALL_SUBTEST_1( check_const_correctness(Quaternionf()) );
+ CALL_SUBTEST_1(( quaternion<float,DontAlign>() ));
+ CALL_SUBTEST_1(( quaternionAlignment<float>() ));
+ CALL_SUBTEST_1( mapQuaternion<float>() );
+
CALL_SUBTEST_2(( quaternion<double,AutoAlign>() ));
CALL_SUBTEST_2( check_const_correctness(Quaterniond()) );
- CALL_SUBTEST_3(( quaternion<float,DontAlign>() ));
- CALL_SUBTEST_4(( quaternion<double,DontAlign>() ));
- CALL_SUBTEST_5(( quaternionAlignment<float>() ));
- CALL_SUBTEST_6(( quaternionAlignment<double>() ));
- CALL_SUBTEST_1( mapQuaternion<float>() );
+ CALL_SUBTEST_2(( quaternion<double,DontAlign>() ));
+ CALL_SUBTEST_2(( quaternionAlignment<double>() ));
CALL_SUBTEST_2( mapQuaternion<double>() );
+
+ AnnoyingScalar::dont_throw = true;
+ CALL_SUBTEST_3(( quaternion<AnnoyingScalar,AutoAlign>() ));
}
}
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index 278e527c2..bf920696b 100755
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -612,7 +612,7 @@ template<typename Scalar, int Dim, int Options> void transform_products()
VERIFY_IS_APPROX((ac*p).matrix(), a_m*p_m);
}
-void test_geo_transformations()
+EIGEN_DECLARE_TEST(geo_transformations)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(( transformations<double,Affine,AutoAlign>() ));
diff --git a/test/cuda_basic.cu b/test/gpu_basic.cu
index ce66c2c78..e8069f185 100644
--- a/test/cuda_basic.cu
+++ b/test/gpu_basic.cu
@@ -15,13 +15,10 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cuda_basic
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#include <math_constants.h>
-#include <cuda.h>
#include "main.h"
-#include "cuda_common.h"
+#include "gpu_common.h"
// Check that dense modules can be properly parsed by nvcc
#include <Eigen/Dense>
@@ -121,6 +118,22 @@ struct diagonal {
};
template<typename T>
+struct eigenvalues_direct {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
+ T M(in+i);
+ Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
+ T A = M*M.adjoint();
+ SelfAdjointEigenSolver<T> eig;
+ eig.computeDirect(A);
+ res = eig.eigenvalues();
+ }
+};
+
+template<typename T>
struct eigenvalues {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
@@ -131,40 +144,71 @@ struct eigenvalues {
Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
T A = M*M.adjoint();
SelfAdjointEigenSolver<T> eig;
- eig.computeDirect(M);
+ eig.compute(A);
res = eig.eigenvalues();
}
};
-void test_cuda_basic()
+template<typename T>
+struct matrix_inverse {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ T M(in+i);
+ Map<T> res(out+i*T::MaxSizeAtCompileTime);
+ res = M.inverse();
+ }
+};
+
+EIGEN_DECLARE_TEST(gpu_basic)
{
- ei_test_init_cuda();
+ ei_test_init_gpu();
int nthreads = 100;
Eigen::VectorXf in, out;
- #ifndef __CUDA_ARCH__
+ #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
int data_size = nthreads * 512;
in.setRandom(data_size);
out.setRandom(data_size);
#endif
- CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Vector3f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Array44f>(), nthreads, in, out) );
-
- CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array4f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array33f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );
+
+#if !defined(EIGEN_USE_HIP)
+ // FIXME
+ // These subtests result in a compile failure on the HIP platform
+ //
+ // eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
+ // base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
+ // (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
+ CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );
+#endif
- CALL_SUBTEST( run_and_compare_to_cuda(redux<Array4f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(redux<Matrix3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
+
+ CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix3f>(), nthreads, in, out) );
- CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix2f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
+ CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
+#if defined(__NVCC__)
+ // FIXME
+  // These subtests compile only with nvcc and fail with HIPCC and clang-cuda
+ CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
+ typedef Matrix<float,6,6> Matrix6f;
+ CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
+#endif
}
diff --git a/test/gpu_common.h b/test/gpu_common.h
new file mode 100644
index 000000000..3aac49e96
--- /dev/null
+++ b/test/gpu_common.h
@@ -0,0 +1,157 @@
+
+#ifndef EIGEN_TEST_GPU_COMMON_H
+#define EIGEN_TEST_GPU_COMMON_H
+
+#ifdef EIGEN_USE_HIP
+ #include <hip/hip_runtime.h>
+ #include <hip/hip_runtime_api.h>
+#else
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #include <cuda_runtime_api.h>
+#endif
+
+#include <iostream>
+
+#define EIGEN_USE_GPU
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
+dim3 threadIdx, blockDim, blockIdx;
+#endif
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ for(int i=0; i<n; i++)
+ ker(i, in.data(), out.data());
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+__global__
+void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
+{
+ int i = threadIdx.x + blockIdx.x*blockDim.x;
+ if(i<n) {
+ ker(i, in, out);
+ }
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ typename Input::Scalar* d_in;
+ typename Output::Scalar* d_out;
+ std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar);
+ std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
+
+ gpuMalloc((void**)(&d_in), in_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
+
+ gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_out, out.data(), out_bytes, gpuMemcpyHostToDevice);
+
+  // Simple, non-optimal 1D mapping, assuming n is not too large;
+  // this is only for unit testing!
+ dim3 Blocks(128);
+ dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
+
+ gpuDeviceSynchronize();
+
+#ifdef EIGEN_USE_HIP
+ hipLaunchKernelGGL(run_on_gpu_meta_kernel<Kernel,
+ typename std::decay<decltype(*d_in)>::type,
+ typename std::decay<decltype(*d_out)>::type>,
+ dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
+#else
+ run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
+#endif
+
+ gpuDeviceSynchronize();
+
+ // check inputs have not been modified
+ gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
+ gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost);
+
+ gpuFree(d_in);
+ gpuFree(d_out);
+}
+
+
+template<typename Kernel, typename Input, typename Output>
+void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ Input in_ref, in_gpu;
+ Output out_ref, out_gpu;
+ #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ in_ref = in_gpu = in;
+ out_ref = out_gpu = out;
+ #else
+ EIGEN_UNUSED_VARIABLE(in);
+ EIGEN_UNUSED_VARIABLE(out);
+ #endif
+ run_on_cpu (ker, n, in_ref, out_ref);
+ run_on_gpu(ker, n, in_gpu, out_gpu);
+ #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ VERIFY_IS_APPROX(in_ref, in_gpu);
+ VERIFY_IS_APPROX(out_ref, out_gpu);
+ #endif
+}
+
+struct compile_time_device_info {
+ EIGEN_DEVICE_FUNC
+ void operator()(int /*i*/, const int* /*in*/, int* info) const
+ {
+ #if defined(__CUDA_ARCH__)
+ info[0] = int(__CUDA_ARCH__ +0);
+ #endif
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+ #endif
+ }
+};
+
+void ei_test_init_gpu()
+{
+ int device = 0;
+ gpuDeviceProp_t deviceProp;
+ gpuGetDeviceProperties(&deviceProp, device);
+
+ ArrayXi dummy(1), info(10);
+ info = -1;
+ run_on_gpu(compile_time_device_info(),10,dummy,info);
+
+
+ std::cout << "GPU compile-time info:\n";
+
+ #ifdef EIGEN_CUDACC
+ std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << "\n";
+ #endif
+
+ #ifdef EIGEN_CUDACC_VER
+ std::cout << " EIGEN_CUDACC_VER: " << int(EIGEN_CUDACC_VER) << "\n";
+ #endif
+
+ #ifdef EIGEN_HIPCC
+ std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << "\n";
+ #endif
+
+ std::cout << " EIGEN_CUDA_ARCH: " << info[0] << "\n";
+ std::cout << " EIGEN_HIP_DEVICE_COMPILE: " << info[1] << "\n";
+
+ std::cout << "GPU device info:\n";
+ std::cout << " name: " << deviceProp.name << "\n";
+ std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n";
+ std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n";
+ std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
+ std::cout << " warpSize: " << deviceProp.warpSize << "\n";
+ std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n";
+ std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n";
+ std::cout << " clockRate: " << deviceProp.clockRate << "\n";
+ std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n";
+ std::cout << " computeMode: " << deviceProp.computeMode << "\n";
+}
+
+#endif // EIGEN_TEST_GPU_COMMON_H
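
gpu_common.h generalizes the deleted cuda_common.h: the gpuMalloc/gpuMemcpy/gpuDeviceSynchronize names are mapped to the CUDA or HIP runtime by TensorGpuHipCudaDefines.h, and device code is written as functors taking (index, input pointer, output pointer). A hedged sketch of a kernel functor in the same style; squared_norm is illustrative, not part of the test suite, and would be compiled in the same context as gpu_basic.cu:

template<typename T>
struct squared_norm {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    T x(in + i);               // load a fixed-size vector, as coeff_wise does in gpu_basic.cu
    out[i] = x.squaredNorm();  // one scalar result per index
  }
};

// Inside a test body:
//   CALL_SUBTEST( run_and_compare_to_gpu(squared_norm<Eigen::Vector3f>(), nthreads, in, out) );
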
diff --git a/test/half_float.cpp b/test/half_float.cpp
index 7734f82cc..2a7f9b497 100644
--- a/test/half_float.cpp
+++ b/test/half_float.cpp
@@ -9,7 +9,7 @@
#include "main.h"
-#include <Eigen/src/Core/arch/CUDA/Half.h>
+#include <Eigen/src/Core/arch/GPU/Half.h>
// Make sure it's possible to forward declare Eigen::half
namespace Eigen {
@@ -257,13 +257,31 @@ void test_array()
ss << a1;
}
-void test_half_float()
+void test_product()
+{
+ typedef Matrix<half,Dynamic,Dynamic> MatrixXh;
+ Index rows = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+ Index cols = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+ Index depth = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
+ MatrixXh Ah = MatrixXh::Random(rows,depth);
+ MatrixXh Bh = MatrixXh::Random(depth,cols);
+ MatrixXh Ch = MatrixXh::Random(rows,cols);
+ MatrixXf Af = Ah.cast<float>();
+ MatrixXf Bf = Bh.cast<float>();
+ MatrixXf Cf = Ch.cast<float>();
+ VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast<half>());
+}
+
+EIGEN_DECLARE_TEST(half_float)
{
- CALL_SUBTEST(test_conversion());
CALL_SUBTEST(test_numtraits());
- CALL_SUBTEST(test_arithmetic());
- CALL_SUBTEST(test_comparison());
- CALL_SUBTEST(test_basic_functions());
- CALL_SUBTEST(test_trigonometric_functions());
- CALL_SUBTEST(test_array());
+ for(int i = 0; i < g_repeat; i++) {
+ CALL_SUBTEST(test_conversion());
+ CALL_SUBTEST(test_arithmetic());
+ CALL_SUBTEST(test_comparison());
+ CALL_SUBTEST(test_basic_functions());
+ CALL_SUBTEST(test_trigonometric_functions());
+ CALL_SUBTEST(test_array());
+ CALL_SUBTEST(test_product());
+ }
}
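
test_product validates half-precision matrix products against a float reference: run the same product in float, cast back to half, and compare. In miniature (assuming Eigen::half is available via the header included at the top of this file):

typedef Eigen::Matrix<Eigen::half, Eigen::Dynamic, Eigen::Dynamic> MatrixXh;

bool check_small_half_product() {
  MatrixXh A = MatrixXh::Random(4, 3), B = MatrixXh::Random(3, 5);
  Eigen::MatrixXf ref = A.cast<float>() * B.cast<float>();  // float reference result
  MatrixXh C = A * B;                                       // half-precision path under test
  return C.isApprox(ref.cast<Eigen::half>());
}
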
diff --git a/test/hessenberg.cpp b/test/hessenberg.cpp
index 96bc19e2e..0e1b0098d 100644
--- a/test/hessenberg.cpp
+++ b/test/hessenberg.cpp
@@ -49,7 +49,7 @@ template<typename Scalar,int Size> void hessenberg(int size = Size)
// TODO: Add tests for packedMatrix() and householderCoefficients()
}
-void test_hessenberg()
+EIGEN_DECLARE_TEST(hessenberg)
{
CALL_SUBTEST_1(( hessenberg<std::complex<double>,1>() ));
CALL_SUBTEST_2(( hessenberg<std::complex<double>,2>() ));
diff --git a/test/householder.cpp b/test/householder.cpp
index c5f6b5e4f..cad8138a2 100644
--- a/test/householder.cpp
+++ b/test/householder.cpp
@@ -12,7 +12,6 @@
template<typename MatrixType> void householder(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
static bool even = true;
even = !even;
/* this test covers the following files:
@@ -49,6 +48,17 @@ template<typename MatrixType> void householder(const MatrixType& m)
v1.applyHouseholderOnTheLeft(essential,beta,tmp);
VERIFY_IS_APPROX(v1.norm(), v2.norm());
+  // reconstruct the Householder matrix:
+ SquareMatrixType id, H1, H2;
+ id.setIdentity(rows, rows);
+ H1 = H2 = id;
+ VectorType vv(rows);
+ vv << Scalar(1), essential;
+ H1.applyHouseholderOnTheLeft(essential, beta, tmp);
+ H2.applyHouseholderOnTheRight(essential, beta, tmp);
+ VERIFY_IS_APPROX(H1, H2);
+ VERIFY_IS_APPROX(H1, id - beta * vv*vv.adjoint());
+
MatrixType m1(rows, cols),
m2(rows, cols);
@@ -69,7 +79,7 @@ template<typename MatrixType> void householder(const MatrixType& m)
m3.rowwise() = v1.transpose();
m4 = m3;
m3.row(0).makeHouseholder(essential, beta, alpha);
- m3.applyHouseholderOnTheRight(essential,beta,tmp);
+ m3.applyHouseholderOnTheRight(essential.conjugate(),beta,tmp);
VERIFY_IS_APPROX(m3.norm(), m4.norm());
if(rows>=2) VERIFY_IS_MUCH_SMALLER_THAN(m3.block(0,1,rows,rows-1).norm(), m3.norm());
VERIFY_IS_MUCH_SMALLER_THAN(numext::imag(m3(0,0)), numext::real(m3(0,0)));
@@ -104,14 +114,14 @@ template<typename MatrixType> void householder(const MatrixType& m)
VERIFY_IS_APPROX(hseq_mat.adjoint(), hseq_mat_adj);
VERIFY_IS_APPROX(hseq_mat.conjugate(), hseq_mat_conj);
VERIFY_IS_APPROX(hseq_mat.transpose(), hseq_mat_trans);
- VERIFY_IS_APPROX(hseq_mat * m6, hseq_mat * m6);
- VERIFY_IS_APPROX(hseq_mat.adjoint() * m6, hseq_mat_adj * m6);
- VERIFY_IS_APPROX(hseq_mat.conjugate() * m6, hseq_mat_conj * m6);
- VERIFY_IS_APPROX(hseq_mat.transpose() * m6, hseq_mat_trans * m6);
- VERIFY_IS_APPROX(m6 * hseq_mat, m6 * hseq_mat);
- VERIFY_IS_APPROX(m6 * hseq_mat.adjoint(), m6 * hseq_mat_adj);
- VERIFY_IS_APPROX(m6 * hseq_mat.conjugate(), m6 * hseq_mat_conj);
- VERIFY_IS_APPROX(m6 * hseq_mat.transpose(), m6 * hseq_mat_trans);
+ VERIFY_IS_APPROX(hseq * m6, hseq_mat * m6);
+ VERIFY_IS_APPROX(hseq.adjoint() * m6, hseq_mat_adj * m6);
+ VERIFY_IS_APPROX(hseq.conjugate() * m6, hseq_mat_conj * m6);
+ VERIFY_IS_APPROX(hseq.transpose() * m6, hseq_mat_trans * m6);
+ VERIFY_IS_APPROX(m6 * hseq, m6 * hseq_mat);
+ VERIFY_IS_APPROX(m6 * hseq.adjoint(), m6 * hseq_mat_adj);
+ VERIFY_IS_APPROX(m6 * hseq.conjugate(), m6 * hseq_mat_conj);
+ VERIFY_IS_APPROX(m6 * hseq.transpose(), m6 * hseq_mat_trans);
// test householder sequence on the right with a shift
@@ -123,7 +133,7 @@ template<typename MatrixType> void householder(const MatrixType& m)
VERIFY_IS_APPROX(m3 * m5, m1); // test evaluating rhseq to a dense matrix, then applying
}
-void test_householder()
+EIGEN_DECLARE_TEST(householder)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( householder(Matrix<double,2,2>()) );
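
The new reconstruction check verifies the defining formula of a Householder reflector assembled from its essential part: applying the reflector to the identity from the left and from the right must both produce

    H \;=\; I - \beta\, v v^{*}, \qquad v = \begin{pmatrix} 1 \\ \text{essential} \end{pmatrix}

which is exactly the pair of assertions H1 == H2 and H1 == id - beta * vv * vv.adjoint() above. The hseq hunk further down fixes checks that previously compared the densified matrix hseq_mat against itself; they now exercise HouseholderSequence's operator* directly (hseq * m6 against hseq_mat * m6, and so on).
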
diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp
index 59ffe9259..68fe7d507 100644
--- a/test/incomplete_cholesky.cpp
+++ b/test/incomplete_cholesky.cpp
@@ -29,14 +29,10 @@ template<typename T, typename I> void test_incomplete_cholesky_T()
CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) );
}
-void test_incomplete_cholesky()
+template<int>
+void bug1150()
{
- CALL_SUBTEST_1(( test_incomplete_cholesky_T<double,int>() ));
- CALL_SUBTEST_2(( test_incomplete_cholesky_T<std::complex<double>, int>() ));
- CALL_SUBTEST_3(( test_incomplete_cholesky_T<double,long int>() ));
-
-#ifdef EIGEN_TEST_PART_1
- // regression for bug 1150
+ // regression for bug 1150
for(int N = 1; N<20; ++N)
{
Eigen::MatrixXd b( N, N );
@@ -61,5 +57,13 @@ void test_incomplete_cholesky()
VERIFY(solver.preconditioner().info() == Eigen::Success);
VERIFY(solver.info() == Eigen::Success);
}
-#endif
+}
+
+EIGEN_DECLARE_TEST(incomplete_cholesky)
+{
+ CALL_SUBTEST_1(( test_incomplete_cholesky_T<double,int>() ));
+ CALL_SUBTEST_2(( test_incomplete_cholesky_T<std::complex<double>, int>() ));
+ CALL_SUBTEST_3(( test_incomplete_cholesky_T<double,long int>() ));
+
+ CALL_SUBTEST_1(( bug1150<0>() ));
}
diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp
index 8b3082cea..7219c777f 100644
--- a/test/indexed_view.cpp
+++ b/test/indexed_view.cpp
@@ -140,7 +140,7 @@ void check_indexed_view()
"500 501 502 503 504 505 506 507 508 509")
);
- // takes the row numer 3, and repeat it 5 times
+ // take row number 3, and repeat it 5 times
VERIFY( MATCH( A(seqN(3,5,0), all),
"300 301 302 303 304 305 306 307 308 309\n"
"300 301 302 303 304 305 306 307 308 309\n"
@@ -293,6 +293,14 @@ void check_indexed_view()
}
#if EIGEN_HAS_CXX11
+ // check lastN
+ VERIFY_IS_APPROX( a(lastN(3)), a.tail(3) );
+ VERIFY( MATCH( a(lastN(3)), "7\n8\n9" ) );
+ VERIFY_IS_APPROX( a(lastN(fix<3>())), a.tail<3>() );
+ VERIFY( MATCH( a(lastN(3,2)), "5\n7\n9" ) );
+ VERIFY( MATCH( a(lastN(3,fix<2>())), "5\n7\n9" ) );
+ VERIFY( a(lastN(fix<3>())).SizeAtCompileTime == 3 );
+
VERIFY( (A(all, std::array<int,4>{{1,3,2,4}})).ColsAtCompileTime == 4);
VERIFY_IS_APPROX( (A(std::array<int,3>{{1,3,5}}, std::array<int,4>{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) );
@@ -371,13 +379,39 @@ void check_indexed_view()
a(X) = 1;
A(X,Y) = 1;
+ // Check compilation of varying integer types as index types:
+ Index i = n/2;
+ short i_short(i);
+ std::size_t i_sizet(i);
+ VERIFY_IS_EQUAL( a(i), a.coeff(i_short) );
+ VERIFY_IS_EQUAL( a(i), a.coeff(i_sizet) );
+
+ VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i_short) );
+ VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i) );
+ VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_short) );
+ VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_sizet) );
+ VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i) );
+ VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i_short) );
+ VERIFY_IS_EQUAL( A(i,i), A.coeff(5, i_sizet) );
+
}
-void test_indexed_view()
+EIGEN_DECLARE_TEST(indexed_view)
{
// for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( check_indexed_view() );
CALL_SUBTEST_2( check_indexed_view() );
CALL_SUBTEST_3( check_indexed_view() );
// }
+
+ // static checks of some internals:
+ STATIC_CHECK(( internal::is_valid_index_type<int>::value ));
+ STATIC_CHECK(( internal::is_valid_index_type<unsigned int>::value ));
+ STATIC_CHECK(( internal::is_valid_index_type<short>::value ));
+ STATIC_CHECK(( internal::is_valid_index_type<std::ptrdiff_t>::value ));
+ STATIC_CHECK(( internal::is_valid_index_type<std::size_t>::value ));
+ STATIC_CHECK(( !internal::valid_indexed_view_overload<int,int>::value ));
+ STATIC_CHECK(( !internal::valid_indexed_view_overload<int,std::ptrdiff_t>::value ));
+ STATIC_CHECK(( !internal::valid_indexed_view_overload<std::ptrdiff_t,int>::value ));
+ STATIC_CHECK(( !internal::valid_indexed_view_overload<std::size_t,int>::value ));
}
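
The new lastN checks exercise the placeholder-based indexing API being developed for Eigen 3.4. For intuition, a small stand-alone example assuming C++11 and that API, with a 10-element vector holding 0..9 as in check_indexed_view():

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::VectorXi a = Eigen::VectorXi::LinSpaced(10, 0, 9);
  std::cout << a(Eigen::lastN(3)).transpose() << "\n";               // 7 8 9, same as a.tail(3)
  std::cout << a(Eigen::lastN(3, 2)).transpose() << "\n";            // 5 7 9: size 3, stride 2, ending at the last entry
  std::cout << a(Eigen::lastN(Eigen::fix<3>())).transpose() << "\n"; // 7 8 9, with compile-time size
}
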
diff --git a/test/inplace_decomposition.cpp b/test/inplace_decomposition.cpp
index 92d0d91b6..e3aa9957d 100644
--- a/test/inplace_decomposition.cpp
+++ b/test/inplace_decomposition.cpp
@@ -79,7 +79,7 @@ template<typename DecType,typename MatrixType> void inplace(bool square = false,
}
-void test_inplace_decomposition()
+EIGEN_DECLARE_TEST(inplace_decomposition)
{
EIGEN_UNUSED typedef Matrix<double,4,3> Matrix43d;
for(int i = 0; i < g_repeat; i++) {
diff --git a/test/integer_types.cpp b/test/integer_types.cpp
index 25126315a..e9e514f12 100644
--- a/test/integer_types.cpp
+++ b/test/integer_types.cpp
@@ -18,7 +18,6 @@
template<typename MatrixType> void signed_integer_type_tests(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
enum { is_signed = (Scalar(-1) > Scalar(0)) ? 0 : 1 };
@@ -49,7 +48,6 @@ template<typename MatrixType> void signed_integer_type_tests(const MatrixType& m
template<typename MatrixType> void integer_type_tests(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
VERIFY(NumTraits<Scalar>::IsInteger);
@@ -133,7 +131,18 @@ template<typename MatrixType> void integer_type_tests(const MatrixType& m)
VERIFY_IS_APPROX((m1 * m2.transpose()) * m1, m1 * (m2.transpose() * m1));
}
-void test_integer_types()
+template<int>
+void integer_types_extra()
+{
+ VERIFY_IS_EQUAL(internal::scalar_div_cost<int>::value, 8);
+ VERIFY_IS_EQUAL(internal::scalar_div_cost<unsigned int>::value, 8);
+ if(sizeof(long)>sizeof(int)) {
+ VERIFY(int(internal::scalar_div_cost<long>::value) > int(internal::scalar_div_cost<int>::value));
+ VERIFY(int(internal::scalar_div_cost<unsigned long>::value) > int(internal::scalar_div_cost<int>::value));
+ }
+}
+
+EIGEN_DECLARE_TEST(integer_types)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( integer_type_tests(Matrix<unsigned int, 1, 1>()) );
@@ -158,12 +167,5 @@ void test_integer_types()
CALL_SUBTEST_8( integer_type_tests(Matrix<unsigned long long, Dynamic, 5>(1, 5)) );
}
-#ifdef EIGEN_TEST_PART_9
- VERIFY_IS_EQUAL(internal::scalar_div_cost<int>::value, 8);
- VERIFY_IS_EQUAL(internal::scalar_div_cost<unsigned int>::value, 8);
- if(sizeof(long)>sizeof(int)) {
- VERIFY(int(internal::scalar_div_cost<long>::value) > int(internal::scalar_div_cost<int>::value));
- VERIFY(int(internal::scalar_div_cost<unsigned long>::value) > int(internal::scalar_div_cost<int>::value));
- }
-#endif
+ CALL_SUBTEST_9( integer_types_extra<0>() );
}
diff --git a/test/inverse.cpp b/test/inverse.cpp
index 5c6777a18..8754cb7e5 100644
--- a/test/inverse.cpp
+++ b/test/inverse.cpp
@@ -11,43 +11,26 @@
#include "main.h"
#include <Eigen/LU>
-template<typename MatrixType> void inverse(const MatrixType& m)
+template<typename MatrixType>
+void inverse_for_fixed_size(const MatrixType&, typename internal::enable_if<MatrixType::SizeAtCompileTime==Dynamic>::type* = 0)
{
- using std::abs;
- typedef typename MatrixType::Index Index;
- /* this test covers the following files:
- Inverse.h
- */
- Index rows = m.rows();
- Index cols = m.cols();
-
- typedef typename MatrixType::Scalar Scalar;
-
- MatrixType m1(rows, cols),
- m2(rows, cols),
- identity = MatrixType::Identity(rows, rows);
- createRandomPIMatrixOfRank(rows,rows,rows,m1);
- m2 = m1.inverse();
- VERIFY_IS_APPROX(m1, m2.inverse() );
-
- VERIFY_IS_APPROX((Scalar(2)*m2).inverse(), m2.inverse()*Scalar(0.5));
-
- VERIFY_IS_APPROX(identity, m1.inverse() * m1 );
- VERIFY_IS_APPROX(identity, m1 * m1.inverse() );
+}
- VERIFY_IS_APPROX(m1, m1.inverse().inverse() );
+template<typename MatrixType>
+void inverse_for_fixed_size(const MatrixType& m1, typename internal::enable_if<MatrixType::SizeAtCompileTime!=Dynamic>::type* = 0)
+{
+ using std::abs;
- // since for the general case we implement separately row-major and col-major, test that
- VERIFY_IS_APPROX(MatrixType(m1.transpose().inverse()), MatrixType(m1.inverse().transpose()));
+ MatrixType m2, identity = MatrixType::Identity();
-#if !defined(EIGEN_TEST_PART_5) && !defined(EIGEN_TEST_PART_6)
+ typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
//computeInverseAndDetWithCheck tests
//First: an invertible matrix
bool invertible;
- RealScalar det;
+ Scalar det;
m2.setZero();
m1.computeInverseAndDetWithCheck(m2, det, invertible);
@@ -61,23 +44,52 @@ template<typename MatrixType> void inverse(const MatrixType& m)
VERIFY_IS_APPROX(identity, m1*m2);
//Second: a rank one matrix (not invertible, except for 1x1 matrices)
- VectorType v3 = VectorType::Random(rows);
- MatrixType m3 = v3*v3.transpose(), m4(rows,cols);
+ VectorType v3 = VectorType::Random();
+ MatrixType m3 = v3*v3.transpose(), m4;
m3.computeInverseAndDetWithCheck(m4, det, invertible);
- VERIFY( rows==1 ? invertible : !invertible );
+ VERIFY( m1.rows()==1 ? invertible : !invertible );
VERIFY_IS_MUCH_SMALLER_THAN(abs(det-m3.determinant()), RealScalar(1));
m3.computeInverseWithCheck(m4, invertible);
- VERIFY( rows==1 ? invertible : !invertible );
+ VERIFY( m1.rows()==1 ? invertible : !invertible );
// check with submatrices
{
Matrix<Scalar, MatrixType::RowsAtCompileTime+1, MatrixType::RowsAtCompileTime+1, MatrixType::Options> m5;
m5.setRandom();
- m5.topLeftCorner(rows,rows) = m1;
+ m5.topLeftCorner(m1.rows(),m1.rows()) = m1;
m2 = m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>().inverse();
VERIFY_IS_APPROX( (m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>()), m2.inverse() );
}
-#endif
+}
+
+template<typename MatrixType> void inverse(const MatrixType& m)
+{
+ /* this test covers the following files:
+ Inverse.h
+ */
+ Index rows = m.rows();
+ Index cols = m.cols();
+
+ typedef typename MatrixType::Scalar Scalar;
+
+ MatrixType m1(rows, cols),
+ m2(rows, cols),
+ identity = MatrixType::Identity(rows, rows);
+ createRandomPIMatrixOfRank(rows,rows,rows,m1);
+ m2 = m1.inverse();
+ VERIFY_IS_APPROX(m1, m2.inverse() );
+
+ VERIFY_IS_APPROX((Scalar(2)*m2).inverse(), m2.inverse()*Scalar(0.5));
+
+ VERIFY_IS_APPROX(identity, m1.inverse() * m1 );
+ VERIFY_IS_APPROX(identity, m1 * m1.inverse() );
+
+ VERIFY_IS_APPROX(m1, m1.inverse().inverse() );
+
+ // since for the general case we implement separately row-major and col-major, test that
+ VERIFY_IS_APPROX(MatrixType(m1.transpose().inverse()), MatrixType(m1.inverse().transpose()));
+
+ inverse_for_fixed_size(m1);
// check in-place inversion
if(MatrixType::RowsAtCompileTime>=2 && MatrixType::RowsAtCompileTime<=4)
@@ -93,7 +105,7 @@ template<typename MatrixType> void inverse(const MatrixType& m)
}
}
-void test_inverse()
+EIGEN_DECLARE_TEST(inverse)
{
int s = 0;
for(int i = 0; i < g_repeat; i++) {
@@ -113,5 +125,7 @@ void test_inverse()
CALL_SUBTEST_7( inverse(Matrix4d()) );
CALL_SUBTEST_7( inverse(Matrix<double,4,4,DontAlign>()) );
+
+ CALL_SUBTEST_8( inverse(Matrix4cd()) );
}
}
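The inverse_for_fixed_size pair above is a standard SFINAE overload pair: the defaulted enable_if pointer parameter removes one overload from the candidate set depending on whether SizeAtCompileTime is Dynamic, so dynamic-size instantiations compile against an empty stub. The idiom in isolation, with std::enable_if and a stand-in matrix type:

#include <iostream>
#include <type_traits>

const int Dynamic = -1; // stand-in for Eigen::Dynamic
template<int Size> struct Mat { static const int SizeAtCompileTime = Size; };

// Selected for dynamic sizes: deliberately empty, so all callers compile.
template<typename MatrixType>
void fixed_size_checks(const MatrixType&,
    typename std::enable_if<MatrixType::SizeAtCompileTime == Dynamic>::type* = 0)
{}

// Selected for compile-time sizes: the real checks would go here.
template<typename MatrixType>
void fixed_size_checks(const MatrixType&,
    typename std::enable_if<MatrixType::SizeAtCompileTime != Dynamic>::type* = 0)
{
  std::cout << "fixed-size checks, size " << int(MatrixType::SizeAtCompileTime) << "\n";
}

int main()
{
  fixed_size_checks(Mat<16>());      // runs the checks
  fixed_size_checks(Mat<Dynamic>()); // no-op
}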
diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp
index 2c7838ce9..23dd806eb 100644
--- a/test/is_same_dense.cpp
+++ b/test/is_same_dense.cpp
@@ -11,12 +11,16 @@
using internal::is_same_dense;
-void test_is_same_dense()
+EIGEN_DECLARE_TEST(is_same_dense)
{
typedef Matrix<double,Dynamic,Dynamic,ColMajor> ColMatrixXd;
+ typedef Matrix<std::complex<double>,Dynamic,Dynamic,ColMajor> ColMatrixXcd;
ColMatrixXd m1(10,10);
+ ColMatrixXcd m2(10,10);
Ref<ColMatrixXd> ref_m1(m1);
+ Ref<ColMatrixXd,0, Stride<Dynamic,Dynamic> > ref_m2_real(m2.real());
Ref<const ColMatrixXd> const_ref_m1(m1);
+
VERIFY(is_same_dense(m1,m1));
VERIFY(is_same_dense(m1,ref_m1));
VERIFY(is_same_dense(const_ref_m1,m1));
@@ -30,4 +34,8 @@ void test_is_same_dense()
Ref<const ColMatrixXd> const_ref_m1_col(m1.col(1));
VERIFY(is_same_dense(m1.col(1),const_ref_m1_col));
+
+
+ VERIFY(!is_same_dense(m1, ref_m2_real));
+ VERIFY(!is_same_dense(m2, ref_m2_real));
}
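The new negative checks rely on is_same_dense answering true only for expressions with direct storage access that agree on data pointer and strides; ref_m2_real views different memory (the real parts of m2) with a different stride layout and scalar type. A rough same-scalar-type model of that comparison (an assumption; the real trait additionally rejects incompatible types at compile time):

#include <Eigen/Dense>
#include <iostream>
using namespace Eigen;

template<typename A, typename B>
bool same_dense_model(const A& a, const B& b)
{
  // same buffer, same inner/outer strides
  return (const void*)a.data() == (const void*)b.data()
      && a.innerStride() == b.innerStride()
      && a.outerStride() == b.outerStride();
}

int main()
{
  MatrixXd m1(10,10), m3(10,10);
  Ref<MatrixXd> ref_m1(m1);
  std::cout << same_dense_model(m1, ref_m1) << "\n"; // 1: shared storage
  std::cout << same_dense_model(m1, m3) << "\n";     // 0: distinct buffers
}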
diff --git a/test/jacobi.cpp b/test/jacobi.cpp
index 7ccd4124b..5604797f5 100644
--- a/test/jacobi.cpp
+++ b/test/jacobi.cpp
@@ -14,7 +14,6 @@
template<typename MatrixType, typename JacobiScalar>
void jacobi(const MatrixType& m = MatrixType())
{
- typedef typename MatrixType::Index Index;
Index rows = m.rows();
Index cols = m.cols();
@@ -58,7 +57,7 @@ void jacobi(const MatrixType& m = MatrixType())
}
}
-void test_jacobi()
+EIGEN_DECLARE_TEST(jacobi)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(( jacobi<Matrix3f, float>() ));
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index 7f5f71562..f9a59e0e7 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -36,7 +36,6 @@ void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
template<typename MatrixType> void jacobisvd_verify_assert(const MatrixType& m)
{
svd_verify_assert<JacobiSVD<MatrixType> >(m);
- typedef typename MatrixType::Index Index;
Index rows = m.rows();
Index cols = m.cols();
@@ -70,7 +69,7 @@ void jacobisvd_method()
VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m);
}
-void test_jacobisvd()
+EIGEN_DECLARE_TEST(jacobisvd)
{
CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) ));
CALL_SUBTEST_4(( jacobisvd_verify_assert(Matrix4d()) ));
diff --git a/test/klu_support.cpp b/test/klu_support.cpp
index 138dcc301..f806ad50e 100644
--- a/test/klu_support.cpp
+++ b/test/klu_support.cpp
@@ -24,7 +24,7 @@ template<typename T> void test_klu_support_T()
//check_sparse_square_determinant(umfpack_rowmajor);
}
-void test_klu_support()
+EIGEN_DECLARE_TEST(klu_support)
{
CALL_SUBTEST_1(test_klu_support_T<double>());
CALL_SUBTEST_2(test_klu_support_T<std::complex<double> >());
diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp
index 17474af10..46ee5162b 100644
--- a/test/linearstructure.cpp
+++ b/test/linearstructure.cpp
@@ -19,7 +19,6 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
/* this test covers the following files:
CwiseUnaryOp.h, CwiseBinaryOp.h, SelfCwiseBinaryOp.h
*/
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
@@ -111,7 +110,20 @@ template<typename MatrixType> void real_complex(DenseIndex rows = MatrixType::Ro
VERIFY(g_called && "matrix<complex> - real not properly optimized");
}
-void test_linearstructure()
+template<int>
+void linearstructure_overflow()
+{
+ // make sure that /=scalar and /scalar do not overflow
+ // rationale: 1.0/4.94e-320 overflows, but m/4.94e-320 should not
+ Matrix4d m2, m3;
+ m3 = m2 = Matrix4d::Random()*1e-20;
+ m2 = m2 / 4.9e-320;
+ VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones());
+ m3 /= 4.9e-320;
+ VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones());
+}
+
+EIGEN_DECLARE_TEST(linearstructure)
{
g_called = true;
VERIFY(g_called); // avoid `unneeded-internal-declaration` warning.
@@ -131,19 +143,5 @@ void test_linearstructure()
CALL_SUBTEST_11( real_complex<MatrixXcf>(10,10) );
CALL_SUBTEST_11( real_complex<ArrayXXcf>(10,10) );
}
-
-#ifdef EIGEN_TEST_PART_4
- {
- // make sure that /=scalar and /scalar do not overflow
- // rational: 1.0/4.94e-320 overflow, but m/4.94e-320 should not
- Matrix4d m2, m3;
- m3 = m2 = Matrix4d::Random()*1e-20;
- m2 = m2 / 4.9e-320;
- VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones());
- m3 /= 4.9e-320;
- VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones());
-
-
- }
-#endif
+ CALL_SUBTEST_4( linearstructure_overflow<0>() );
}
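The constants here are chosen deliberately: 4.9e-320 is a subnormal double, so 1.0/4.9e-320 (about 2.04e319) exceeds DBL_MAX (about 1.8e308) and overflows to infinity, while dividing the ~1e-20 entries directly gives ~2.04e299, which is representable. The arithmetic, checked:

#include <cstdio>

int main()
{
  double s = 4.9e-320;                       // subnormal double
  std::printf("1/s     = %g\n", 1.0 / s);    // inf: the reciprocal overflows
  std::printf("1e-20/s = %g\n", 1e-20 / s);  // ~2.04e+299: finite
}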
diff --git a/test/lscg.cpp b/test/lscg.cpp
index d49ee00c3..feb2347a8 100644
--- a/test/lscg.cpp
+++ b/test/lscg.cpp
@@ -30,7 +30,7 @@ template<typename T> void test_lscg_T()
CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_I) );
}
-void test_lscg()
+EIGEN_DECLARE_TEST(lscg)
{
CALL_SUBTEST_1(test_lscg_T<double>());
CALL_SUBTEST_2(test_lscg_T<std::complex<double> >());
diff --git a/test/lu.cpp b/test/lu.cpp
index 9787f4d86..144496e91 100644
--- a/test/lu.cpp
+++ b/test/lu.cpp
@@ -18,7 +18,6 @@ typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
template<typename MatrixType> void lu_non_invertible()
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::RealScalar RealScalar;
/* this test covers the following files:
LU.h
@@ -181,7 +180,6 @@ template<typename MatrixType> void lu_partial_piv()
/* this test covers the following files:
PartialPivLU.h
*/
- typedef typename MatrixType::Index Index;
typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
Index size = internal::random<Index>(1,4);
@@ -244,7 +242,7 @@ template<typename MatrixType> void lu_verify_assert()
VERIFY_RAISES_ASSERT(plu.inverse())
}
-void test_lu()
+EIGEN_DECLARE_TEST(lu)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( lu_non_invertible<Matrix3f>() );
diff --git a/test/main.h b/test/main.h
index 429c44f81..5d64bc736 100644
--- a/test/main.h
+++ b/test/main.h
@@ -67,11 +67,27 @@
// protected by parenthesis against macro expansion, the min()/max() macros
// are defined here and any not-parenthesized min/max call will cause a
// compiler error.
-#define min(A,B) please_protect_your_min_with_parentheses
-#define max(A,B) please_protect_your_max_with_parentheses
-#define isnan(X) please_protect_your_isnan_with_parentheses
-#define isinf(X) please_protect_your_isinf_with_parentheses
-#define isfinite(X) please_protect_your_isfinite_with_parentheses
+#if !defined(__HIPCC__)
+ //
+ // HIP header files include the following headers:
+ //   <thread>
+ //   <regex>
+ //   <unordered_map>
+ // which seem to contain non-parenthesized calls to "max"/"min" that trigger the check below and make the compilation fail.
+ //
+ // Including those headers before the following "min"/"max" macro definitions only partially resolves the issue,
+ // because other HIP header files also define "isnan"/"isinf"/"isfinite" functions, which are needed in other headers.
+ //
+ // So we simply disable this check for HIP.
+ //
+ #define min(A,B) please_protect_your_min_with_parentheses
+ #define max(A,B) please_protect_your_max_with_parentheses
+ #define isnan(X) please_protect_your_isnan_with_parentheses
+ #define isinf(X) please_protect_your_isinf_with_parentheses
+ #define isfinite(X) please_protect_your_isfinite_with_parentheses
+#endif
+
#ifdef M_PI
#undef M_PI
#endif
@@ -106,13 +122,12 @@ inline void on_temporary_creation(long int size) {
#define VERIFY_EVALUATION_COUNT(XPR,N) {\
nb_temporaries = 0; \
XPR; \
- if(nb_temporaries!=N) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
- VERIFY( (#XPR) && nb_temporaries==N ); \
+ if(nb_temporaries!=(N)) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
+ VERIFY( (#XPR) && nb_temporaries==(N) ); \
}
#endif
-// the following file is automatically generated by cmake
#include "split_test_helper.h"
#ifdef NDEBUG
@@ -129,10 +144,6 @@ inline void on_temporary_creation(long int size) {
#define EIGEN_MAKING_DOCS
#endif
-#ifndef EIGEN_TEST_FUNC
-#error EIGEN_TEST_FUNC must be defined
-#endif
-
#define DEFAULT_REPEAT 10
namespace Eigen
@@ -141,20 +152,48 @@ namespace Eigen
// level == 0 <=> abort if test fail
// level >= 1 <=> warning message to std::cerr if test fail
static int g_test_level = 0;
- static int g_repeat;
- static unsigned int g_seed;
- static bool g_has_set_repeat, g_has_set_seed;
+ static int g_repeat = 1;
+ static unsigned int g_seed = 0;
+ static bool g_has_set_repeat = false, g_has_set_seed = false;
+
+ class EigenTest
+ {
+ public:
+ EigenTest() : m_func(0) {}
+ EigenTest(const char* a_name, void (*func)(void))
+ : m_name(a_name), m_func(func)
+ {
+ ms_registered_tests.push_back(this);
+ }
+ const std::string& name() const { return m_name; }
+ void operator()() const { m_func(); }
+
+ static const std::vector<EigenTest*>& all() { return ms_registered_tests; }
+ protected:
+ std::string m_name;
+ void (*m_func)(void);
+ static std::vector<EigenTest*> ms_registered_tests;
+ };
+
+ std::vector<EigenTest*> EigenTest::ms_registered_tests;
+
+ // Declare and register a test, e.g.:
+ // EIGEN_DECLARE_TEST(mytest) { ... }
+ // will create a function:
+ // void test_mytest() { ... }
+ // that will be automatically called.
+ #define EIGEN_DECLARE_TEST(X) \
+ void EIGEN_CAT(test_,X) (); \
+ static EigenTest EIGEN_CAT(test_handler_,X) (EIGEN_MAKESTRING(X), & EIGEN_CAT(test_,X)); \
+ void EIGEN_CAT(test_,X) ()
}
#define TRACK std::cerr << __FILE__ << " " << __LINE__ << std::endl
// #define TRACK while()
-#define EI_PP_MAKE_STRING2(S) #S
-#define EI_PP_MAKE_STRING(S) EI_PP_MAKE_STRING2(S)
-
#define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "")
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
#define EIGEN_EXCEPTIONS
#endif
@@ -175,9 +214,15 @@ namespace Eigen
eigen_assert_exception(void) {}
~eigen_assert_exception() { Eigen::no_more_assert = false; }
};
+
+ struct eigen_static_assert_exception
+ {
+ eigen_static_assert_exception(void) {}
+ ~eigen_static_assert_exception() { Eigen::no_more_assert = false; }
+ };
}
// If EIGEN_DEBUG_ASSERTS is defined and if no assertion is triggered while
- // one should have been, then the list of excecuted assertions is printed out.
+ // one should have been, then the list of executed assertions is printed out.
//
// EIGEN_DEBUG_ASSERTS is not enabled by default as it
// significantly increases the compilation time
@@ -203,7 +248,7 @@ namespace Eigen
} \
else if (Eigen::internal::push_assert) \
{ \
- eigen_assert_list.push_back(std::string(EI_PP_MAKE_STRING(__FILE__) " (" EI_PP_MAKE_STRING(__LINE__) ") : " #a) ); \
+ eigen_assert_list.push_back(std::string(EIGEN_MAKESTRING(__FILE__) " (" EIGEN_MAKESTRING(__LINE__) ") : " #a) ); \
}
#ifdef EIGEN_EXCEPTIONS
@@ -227,7 +272,7 @@ namespace Eigen
}
#endif //EIGEN_EXCEPTIONS
- #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS
+ #elif !defined(__CUDACC__) && !defined(__HIPCC__)// EIGEN_DEBUG_ASSERTS
// see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
#define eigen_assert(a) \
if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
@@ -238,6 +283,7 @@ namespace Eigen
else \
EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
}
+
#ifdef EIGEN_EXCEPTIONS
#define VERIFY_RAISES_ASSERT(a) { \
Eigen::no_more_assert = false; \
@@ -249,25 +295,51 @@ namespace Eigen
catch (Eigen::eigen_assert_exception&) { VERIFY(true); } \
Eigen::report_on_cerr_on_assert_failure = true; \
}
- #endif //EIGEN_EXCEPTIONS
+ #endif // EIGEN_EXCEPTIONS
#endif // EIGEN_DEBUG_ASSERTS
+ #if defined(TEST_CHECK_STATIC_ASSERTIONS) && defined(EIGEN_EXCEPTIONS)
+ #define EIGEN_STATIC_ASSERT(a,MSG) \
+ if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
+ { \
+ Eigen::no_more_assert = true; \
+ if(report_on_cerr_on_assert_failure) \
+ eigen_plain_assert((a) && #MSG); \
+ else \
+ EIGEN_THROW_X(Eigen::eigen_static_assert_exception()); \
+ }
+ #define VERIFY_RAISES_STATIC_ASSERT(a) { \
+ Eigen::no_more_assert = false; \
+ Eigen::report_on_cerr_on_assert_failure = false; \
+ try { \
+ a; \
+ VERIFY(Eigen::should_raise_an_assert && # a); \
+ } \
+ catch (Eigen::eigen_static_assert_exception&) { VERIFY(true); } \
+ Eigen::report_on_cerr_on_assert_failure = true; \
+ }
+ #endif // TEST_CHECK_STATIC_ASSERTIONS
+
#ifndef VERIFY_RAISES_ASSERT
#define VERIFY_RAISES_ASSERT(a) \
std::cout << "Can't VERIFY_RAISES_ASSERT( " #a " ) with exceptions disabled\n";
#endif
+#ifndef VERIFY_RAISES_STATIC_ASSERT
+ #define VERIFY_RAISES_STATIC_ASSERT(a) \
+ std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
+#endif
- #if !defined(__CUDACC__)
+ #if !defined(__CUDACC__) && !defined(__HIPCC__)
#define EIGEN_USE_CUSTOM_ASSERT
#endif
#else // EIGEN_NO_ASSERTION_CHECKING
#define VERIFY_RAISES_ASSERT(a) {}
+ #define VERIFY_RAISES_STATIC_ASSERT(a) {}
#endif // EIGEN_NO_ASSERTION_CHECKING
-
#define EIGEN_INTERNAL_DEBUGGING
#include <Eigen/QR> // required for createRandomPIMatrixOfRank
@@ -289,10 +361,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
}
}
-#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a))
+#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a))
-#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b))
-#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b))
+#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a >= b))
+#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a <= b))
#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b, true))
@@ -306,8 +378,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
#define VERIFY_IS_UNITARY(a) VERIFY(test_isUnitary(a))
+#define STATIC_CHECK(COND) EIGEN_STATIC_ASSERT( (COND) , EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT )
+
#define CALL_SUBTEST(FUNC) do { \
- g_test_stack.push_back(EI_PP_MAKE_STRING(FUNC)); \
+ g_test_stack.push_back(EIGEN_MAKESTRING(FUNC)); \
FUNC; \
g_test_stack.pop_back(); \
} while (0)
@@ -326,7 +400,7 @@ template<> inline long double test_precision<std::complex<long double> >() { ret
inline bool test_isApprox(const short& a, const short& b)
{ return internal::isApprox(a, b, test_precision<short>()); }
inline bool test_isApprox(const unsigned short& a, const unsigned short& b)
-{ return internal::isApprox(a, b, test_precision<unsigned long>()); }
+{ return internal::isApprox(a, b, test_precision<unsigned short>()); }
inline bool test_isApprox(const unsigned int& a, const unsigned int& b)
{ return internal::isApprox(a, b, test_precision<unsigned int>()); }
inline bool test_isApprox(const long& a, const long& b)
@@ -658,9 +732,6 @@ template<> std::string type_name<std::complex<double> >() { return "comple
template<> std::string type_name<std::complex<long double> >() { return "complex<long double>"; }
template<> std::string type_name<std::complex<int> >() { return "complex<int>"; }
-// forward declaration of the main test function
-void EIGEN_CAT(test_,EIGEN_TEST_FUNC)();
-
using namespace Eigen;
inline void set_repeat_from_string(const char *str)
@@ -747,9 +818,16 @@ int main(int argc, char *argv[])
srand(g_seed);
std::cout << "Repeating each test " << g_repeat << " times" << std::endl;
- Eigen::g_test_stack.push_back(std::string(EI_PP_MAKE_STRING(EIGEN_TEST_FUNC)));
+ VERIFY(EigenTest::all().size()>0);
+
+ for(std::size_t i=0; i<EigenTest::all().size(); ++i)
+ {
+ const EigenTest& current_test = *EigenTest::all()[i];
+ Eigen::g_test_stack.push_back(current_test.name());
+ current_test();
+ Eigen::g_test_stack.pop_back();
+ }
- EIGEN_CAT(test_,EIGEN_TEST_FUNC)();
return 0;
}
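The EigenTest machinery above is the classic static self-registration pattern: every EIGEN_DECLARE_TEST expands to a namespace-scope object whose constructor runs before main() and appends the test to a registry that main() then walks. A stand-alone sketch of the same pattern (one difference: the registry here is a function-local static, which avoids any dependence on initialization order; the code above instead relies on the vector being defined before the test objects in each translation unit):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct Test {
  std::string name;
  void (*func)();
  Test(const char* n, void (*f)()) : name(n), func(f) { all().push_back(this); }
  static std::vector<Test*>& all() { static std::vector<Test*> v; return v; }
};

#define DECLARE_TEST(X)                        \
  void test_##X();                             \
  static Test test_handler_##X(#X, &test_##X); \
  void test_##X()

DECLARE_TEST(example) { std::cout << "  running example\n"; }

int main()
{
  for (std::size_t i = 0; i < Test::all().size(); ++i) {
    std::cout << Test::all()[i]->name << "\n";
    Test::all()[i]->func();
  }
}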
diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp
index 6a84c5897..0ea136ae6 100644
--- a/test/mapped_matrix.cpp
+++ b/test/mapped_matrix.cpp
@@ -17,7 +17,6 @@
template<typename VectorType> void map_class_vector(const VectorType& m)
{
- typedef typename VectorType::Index Index;
typedef typename VectorType::Scalar Scalar;
Index size = m.size();
@@ -51,7 +50,6 @@ template<typename VectorType> void map_class_vector(const VectorType& m)
template<typename MatrixType> void map_class_matrix(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index rows = m.rows(), cols = m.cols(), size = rows*cols;
@@ -64,8 +62,9 @@ template<typename MatrixType> void map_class_matrix(const MatrixType& m)
for(int i = 0; i < size; i++) array2[i] = Scalar(1);
// array3unaligned -> unaligned pointer to heap
Scalar* array3 = new Scalar[size+1];
- for(int i = 0; i < size+1; i++) array3[i] = Scalar(1);
- Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3;
+ Index sizep1 = size + 1; // <- without this temporary, MSVC 2013 generates bad code
+ for(Index i = 0; i < sizep1; i++) array3[i] = Scalar(1);
+ Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3;
Scalar array4[256];
if(size<=256)
for(int i = 0; i < size; i++) array4[i] = Scalar(1);
@@ -121,7 +120,6 @@ template<typename MatrixType> void map_class_matrix(const MatrixType& m)
template<typename VectorType> void map_static_methods(const VectorType& m)
{
- typedef typename VectorType::Index Index;
typedef typename VectorType::Scalar Scalar;
Index size = m.size();
@@ -163,7 +161,6 @@ template<typename Scalar>
void map_not_aligned_on_scalar()
{
typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
- typedef typename MatrixType::Index Index;
Index size = 11;
Scalar* array1 = internal::aligned_new<Scalar>((size+1)*(size+1)+1);
Scalar* array2 = reinterpret_cast<Scalar*>(sizeof(Scalar)/2+std::size_t(array1));
@@ -181,7 +178,7 @@ void map_not_aligned_on_scalar()
internal::aligned_delete(array1, (size+1)*(size+1)+1);
}
-void test_mapped_matrix()
+EIGEN_DECLARE_TEST(mapped_matrix)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( map_class_vector(Matrix<float, 1, 1>()) );
@@ -205,7 +202,6 @@ void test_mapped_matrix()
CALL_SUBTEST_8( map_static_methods(RowVector3d()) );
CALL_SUBTEST_9( map_static_methods(VectorXcd(8)) );
CALL_SUBTEST_10( map_static_methods(VectorXf(12)) );
-
CALL_SUBTEST_11( map_not_aligned_on_scalar<double>() );
}
}
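The array3unaligned line is a small trick worth isolating: over-allocate by one element, and if the raw pointer happens to be max-aligned, step past the first element; either way the result is a misaligned pointer into valid storage (assuming sizeof(Scalar) is not itself a multiple of the alignment). In isolation:

#include <cstddef>
#include <cstdint>

// Returns a pointer into buf that is NOT aligned to `align` bytes;
// buf must hold one element more than will be accessed through the result.
template<typename Scalar>
Scalar* make_unaligned(Scalar* buf, std::size_t align)
{
  return (reinterpret_cast<std::uintptr_t>(buf) % align) == 0 ? buf + 1 : buf;
}

int main()
{
  double buf[8];
  double* p = make_unaligned(buf, 16); // offset 8 mod 16: never 16-byte aligned
  (void)p;
}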
diff --git a/test/mapstaticmethods.cpp b/test/mapstaticmethods.cpp
index 06272d106..f32c0beec 100644
--- a/test/mapstaticmethods.cpp
+++ b/test/mapstaticmethods.cpp
@@ -69,7 +69,6 @@ struct mapstaticmethods_impl<PlainObjectType, true, false>
{
static void run(const PlainObjectType& m)
{
- typedef typename PlainObjectType::Index Index;
Index rows = m.rows(), cols = m.cols();
int i = internal::random<int>(2,5), j = internal::random<int>(2,5);
@@ -116,7 +115,6 @@ struct mapstaticmethods_impl<PlainObjectType, true, true>
{
static void run(const PlainObjectType& v)
{
- typedef typename PlainObjectType::Index Index;
Index size = v.size();
int i = internal::random<int>(2,5);
@@ -145,7 +143,7 @@ void mapstaticmethods(const PlainObjectType& m)
VERIFY(true); // just to avoid 'unused function' warning
}
-void test_mapstaticmethods()
+EIGEN_DECLARE_TEST(mapstaticmethods)
{
ptr = internal::aligned_new<float>(1000);
for(int i = 0; i < 1000; i++) ptr[i] = float(i);
diff --git a/test/mapstride.cpp b/test/mapstride.cpp
index de77dc5de..09196600b 100644
--- a/test/mapstride.cpp
+++ b/test/mapstride.cpp
@@ -11,7 +11,6 @@
template<int Alignment,typename VectorType> void map_class_vector(const VectorType& m)
{
- typedef typename VectorType::Index Index;
typedef typename VectorType::Scalar Scalar;
Index size = m.size();
@@ -50,7 +49,6 @@ template<int Alignment,typename VectorType> void map_class_vector(const VectorTy
template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixType& _m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index rows = _m.rows(), cols = _m.cols();
@@ -199,7 +197,7 @@ void bug1453()
VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
}
-void test_mapstride()
+EIGEN_DECLARE_TEST(mapstride)
{
for(int i = 0; i < g_repeat; i++) {
int maxn = 30;
diff --git a/test/meta.cpp b/test/meta.cpp
index bd505762e..a6a67b85c 100644
--- a/test/meta.cpp
+++ b/test/meta.cpp
@@ -19,7 +19,15 @@ struct FooReturnType {
typedef int ReturnType;
};
-void test_meta()
+struct MyInterface {
+ virtual void func() = 0;
+ virtual ~MyInterface() {}
+};
+struct MyImpl : public MyInterface {
+ void func() {}
+};
+
+EIGEN_DECLARE_TEST(meta)
{
VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
VERIFY(( internal::is_same<float,float>::value));
@@ -62,14 +70,27 @@ void test_meta()
VERIFY(( internal::is_same<const float,internal::remove_pointer<const float*>::type >::value));
VERIFY(( internal::is_same<float,internal::remove_pointer<float* const >::type >::value));
- VERIFY(( internal::is_convertible<float,double>::value ));
- VERIFY(( internal::is_convertible<int,double>::value ));
- VERIFY(( internal::is_convertible<double,int>::value ));
- VERIFY((!internal::is_convertible<std::complex<double>,double>::value ));
- VERIFY(( internal::is_convertible<Array33f,Matrix3f>::value ));
-// VERIFY((!internal::is_convertible<Matrix3f,Matrix3d>::value )); //does not work because the conversion is prevented by a static assertion
- VERIFY((!internal::is_convertible<Array33f,int>::value ));
- VERIFY((!internal::is_convertible<MatrixXf,float>::value ));
+
+ // is_convertible
+ STATIC_CHECK(( internal::is_convertible<float,double>::value ));
+ STATIC_CHECK(( internal::is_convertible<int,double>::value ));
+ STATIC_CHECK(( internal::is_convertible<int, short>::value ));
+ STATIC_CHECK(( internal::is_convertible<short, int>::value ));
+ STATIC_CHECK(( internal::is_convertible<double,int>::value ));
+ STATIC_CHECK(( internal::is_convertible<double,std::complex<double> >::value ));
+ STATIC_CHECK((!internal::is_convertible<std::complex<double>,double>::value ));
+ STATIC_CHECK(( internal::is_convertible<Array33f,Matrix3f>::value ));
+ STATIC_CHECK(( internal::is_convertible<Matrix3f&,Matrix3f>::value ));
+ STATIC_CHECK(( internal::is_convertible<Matrix3f&,Matrix3f&>::value ));
+ STATIC_CHECK(( internal::is_convertible<Matrix3f&,const Matrix3f&>::value ));
+ STATIC_CHECK(( internal::is_convertible<const Matrix3f&,Matrix3f>::value ));
+ STATIC_CHECK(( internal::is_convertible<const Matrix3f&,const Matrix3f&>::value ));
+ STATIC_CHECK((!internal::is_convertible<const Matrix3f&,Matrix3f&>::value ));
+ STATIC_CHECK((!internal::is_convertible<const Matrix3f,Matrix3f&>::value ));
+ STATIC_CHECK(( internal::is_convertible<Matrix3f,Matrix3f&>::value )); // std::is_convertible returns false here, though "Matrix3f from; Matrix3f& to = from;" is valid.
+ //STATIC_CHECK((!internal::is_convertible<Matrix3f,Matrix3d>::value )); //does not work because the conversion is prevented by a static assertion
+ STATIC_CHECK((!internal::is_convertible<Array33f,int>::value ));
+ STATIC_CHECK((!internal::is_convertible<MatrixXf,float>::value ));
{
float f;
MatrixXf A, B;
@@ -80,6 +101,15 @@ void test_meta()
VERIFY(( check_is_convertible(A*B, A) ));
}
+ STATIC_CHECK(( !internal::is_convertible<MyInterface, MyImpl>::value ));
+ STATIC_CHECK(( !internal::is_convertible<MyImpl, MyInterface>::value ));
+ STATIC_CHECK(( internal::is_convertible<MyImpl, const MyInterface&>::value ));
+ {
+ int i;
+ VERIFY(( check_is_convertible(fix<3>(), i) ));
+ VERIFY((!check_is_convertible(i, fix<DynamicIndex>()) ));
+ }
+
VERIFY(( internal::has_ReturnType<FooReturnType>::value ));
VERIFY(( internal::has_ReturnType<ScalarBinaryOpTraits<int,int> >::value ));
VERIFY(( !internal::has_ReturnType<MatrixXf>::value ));
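The trailing comment on the Matrix3f-to-Matrix3f& check flags a genuine divergence from the standard trait, easy to confirm in isolation: std::is_convertible asks whether an rvalue of the source type converts to the destination, and an rvalue never binds to a non-const lvalue reference, even though initializing such a reference from a named object is valid:

#include <type_traits>
#include <Eigen/Core>

int main()
{
  // std::is_convertible models rvalue conversion, hence false here...
  static_assert(!std::is_convertible<Eigen::Matrix3f, Eigen::Matrix3f&>::value,
                "rvalues do not bind to non-const lvalue references");
  // ...yet the initialization Eigen's trait is meant to model is fine:
  Eigen::Matrix3f from;
  Eigen::Matrix3f& to = from;
  (void)to;
}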
diff --git a/test/metis_support.cpp b/test/metis_support.cpp
index d87c56a13..b490dacde 100644
--- a/test/metis_support.cpp
+++ b/test/metis_support.cpp
@@ -19,7 +19,7 @@ template<typename T> void test_metis_T()
check_sparse_square_solving(sparselu_metis);
}
-void test_metis_support()
+EIGEN_DECLARE_TEST(metis_support)
{
CALL_SUBTEST_1(test_metis_T<double>());
}
diff --git a/test/miscmatrices.cpp b/test/miscmatrices.cpp
index ef20dc749..e71712f33 100644
--- a/test/miscmatrices.cpp
+++ b/test/miscmatrices.cpp
@@ -14,7 +14,6 @@ template<typename MatrixType> void miscMatrices(const MatrixType& m)
/* this test covers the following files:
DiagonalMatrix.h Ones.h
*/
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
Index rows = m.rows();
@@ -35,7 +34,7 @@ template<typename MatrixType> void miscMatrices(const MatrixType& m)
VERIFY_IS_APPROX(square, MatrixType::Identity(rows, rows));
}
-void test_miscmatrices()
+EIGEN_DECLARE_TEST(miscmatrices)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( miscMatrices(Matrix<float, 1, 1>()) );
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index b796082cd..aad63ec2b 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -8,13 +8,27 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-// work around "uninitialized" warnings and give that option some testing
-#define EIGEN_INITIALIZE_MATRICES_BY_ZERO
+#if defined(EIGEN_TEST_PART_7)
#ifndef EIGEN_NO_STATIC_ASSERT
#define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them
#endif
+// ignore the double-promotion diagnostic for clang and gcc, since we check for static assertions anyway:
+// TODO do the same for MSVC?
+#if defined(__clang__)
+# if (__clang_major__ * 100 + __clang_minor__) >= 308
+# pragma clang diagnostic ignored "-Wdouble-promotion"
+# endif
+#elif defined(__GNUC__)
+ // TODO is there a minimal GCC version for this? At least g++-4.7 seems to be fine with this.
+# pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#endif
+
+
+
#if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_3)
#ifndef EIGEN_DONT_VECTORIZE
@@ -35,6 +49,28 @@ using namespace std;
VERIFY_IS_APPROX(XPR,REF); \
VERIFY( g_called && #XPR" not properly optimized");
+template<int SizeAtCompileType>
+void raise_assertion(Index size = SizeAtCompileType)
+{
+ // VERIFY_RAISES_ASSERT(mf+md); // does not even compile
+ Matrix<float, SizeAtCompileType, 1> vf; vf.setRandom(size);
+ Matrix<double, SizeAtCompileType, 1> vd; vd.setRandom(size);
+ VERIFY_RAISES_ASSERT(vf=vd);
+ VERIFY_RAISES_ASSERT(vf+=vd);
+ VERIFY_RAISES_ASSERT(vf-=vd);
+ VERIFY_RAISES_ASSERT(vd=vf);
+ VERIFY_RAISES_ASSERT(vd+=vf);
+ VERIFY_RAISES_ASSERT(vd-=vf);
+
+ // vd.asDiagonal() * mf; // does not even compile
+ // vcd.asDiagonal() * mf; // does not even compile
+
+#if 0 // we get other compilation errors here than just static asserts
+ VERIFY_RAISES_ASSERT(vd.dot(vf));
+#endif
+}
+
+
template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
{
typedef std::complex<float> CF;
@@ -73,13 +109,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
while(std::abs(scf)<epsf) scf = internal::random<CF>();
while(std::abs(scd)<epsd) scd = internal::random<CD>();
-// VERIFY_RAISES_ASSERT(mf+md); // does not even compile
-
-#ifdef EIGEN_DONT_VECTORIZE
- VERIFY_RAISES_ASSERT(vf=vd);
- VERIFY_RAISES_ASSERT(vf+=vd);
-#endif
-
// check scalar products
VERIFY_MIX_SCALAR(vcf * sf , vcf * complex<float>(sf));
VERIFY_MIX_SCALAR(sd * vcd , complex<double>(sd) * vcd);
@@ -119,9 +148,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
// check dot product
vf.dot(vf);
-#if 0 // we get other compilation errors here than just static asserts
- VERIFY_RAISES_ASSERT(vd.dot(vf));
-#endif
VERIFY_IS_APPROX(vcf.dot(vf), vcf.dot(vf.template cast<complex<float> >()));
// check diagonal product
@@ -130,9 +156,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
VERIFY_IS_APPROX(mcf * vf.asDiagonal(), mcf * vf.template cast<complex<float> >().asDiagonal());
VERIFY_IS_APPROX(md * vcd.asDiagonal(), md.template cast<complex<double> >() * vcd.asDiagonal());
-// vd.asDiagonal() * mf; // does not even compile
-// vcd.asDiagonal() * mf; // does not even compile
-
// check inner product
VERIFY_IS_APPROX((vf.transpose() * vcf).value(), (vf.template cast<complex<float> >().transpose() * vcf).value());
@@ -286,7 +309,7 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
VERIFY_IS_APPROX( rcd.noalias() -= mcd + md*md, - ((md*md).eval().template cast<CD>()) );
}
-void test_mixingtypes()
+EIGEN_DECLARE_TEST(mixingtypes)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(mixingtypes<3>());
@@ -296,5 +319,10 @@ void test_mixingtypes()
CALL_SUBTEST_4(mixingtypes<3>());
CALL_SUBTEST_5(mixingtypes<4>());
CALL_SUBTEST_6(mixingtypes<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+ CALL_SUBTEST_7(raise_assertion<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
}
+ CALL_SUBTEST_7(raise_assertion<0>());
+ CALL_SUBTEST_7(raise_assertion<3>());
+ CALL_SUBTEST_7(raise_assertion<4>());
+ CALL_SUBTEST_7(raise_assertion<Dynamic>(0));
}
diff --git a/test/nesting_ops.cpp b/test/nesting_ops.cpp
index a419b0e44..4b5fc21f2 100644
--- a/test/nesting_ops.cpp
+++ b/test/nesting_ops.cpp
@@ -91,7 +91,7 @@ template <typename MatrixType> void run_nesting_ops_2(const MatrixType& _m)
}
-void test_nesting_ops()
+EIGEN_DECLARE_TEST(nesting_ops)
{
CALL_SUBTEST_1(run_nesting_ops_1(MatrixXf::Random(25,25)));
CALL_SUBTEST_2(run_nesting_ops_1(MatrixXcd::Random(25,25)));
diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp
index 50756c2fb..cb4c073e9 100644
--- a/test/nomalloc.cpp
+++ b/test/nomalloc.cpp
@@ -24,7 +24,6 @@ template<typename MatrixType> void nomalloc(const MatrixType& m)
{
/* this test checks that no dynamic memory allocations are issued with fixed-size matrices
*/
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index rows = m.rows();
@@ -173,7 +172,7 @@ template<typename MatrixType> void test_reference(const MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
enum { Flag = MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
enum { TransposeFlag = !MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
- typename MatrixType::Index rows = m.rows(), cols=m.cols();
+ Index rows = m.rows(), cols=m.cols();
typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Flag > MatrixX;
typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, TransposeFlag> MatrixXT;
// Dynamic reference:
@@ -203,7 +202,7 @@ template<typename MatrixType> void test_reference(const MatrixType& m) {
}
-void test_nomalloc()
+EIGEN_DECLARE_TEST(nomalloc)
{
// create some dynamic objects
Eigen::MatrixXd M1 = MatrixXd::Random(3,3);
diff --git a/test/nullary.cpp b/test/nullary.cpp
index 22ec92352..12b9e122f 100644
--- a/test/nullary.cpp
+++ b/test/nullary.cpp
@@ -239,45 +239,28 @@ void testMatrixType(const MatrixType& m)
VERIFY_IS_APPROX( A(i,j), s1 );
}
-void test_nullary()
+template<int>
+void bug79()
{
- CALL_SUBTEST_1( testMatrixType(Matrix2d()) );
- CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random<int>(1,300),internal::random<int>(1,300))) );
- CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
-
- for(int i = 0; i < g_repeat*10; i++) {
- CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
- CALL_SUBTEST_5( testVectorType(Vector4d()) ); // regression test for bug 232
- CALL_SUBTEST_6( testVectorType(Vector3d()) );
- CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
- CALL_SUBTEST_8( testVectorType(Vector3f()) );
- CALL_SUBTEST_8( testVectorType(Vector4f()) );
- CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
- CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
-
- CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,10))) );
- CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(9,300))) );
- CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
- }
-
-#ifdef EIGEN_TEST_PART_6
// Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<double>::epsilon() );
-#endif
+}
-#ifdef EIGEN_TEST_PART_9
+template<int>
+void nullary_overflow()
+{
// Check possible overflow issue
- {
- int n = 60000;
- ArrayXi a1(n), a2(n);
- a1.setLinSpaced(n, 0, n-1);
- for(int i=0; i<n; ++i)
- a2(i) = i;
- VERIFY_IS_APPROX(a1,a2);
- }
-#endif
+ int n = 60000;
+ ArrayXi a1(n), a2(n);
+ a1.setLinSpaced(n, 0, n-1);
+ for(int i=0; i<n; ++i)
+ a2(i) = i;
+ VERIFY_IS_APPROX(a1,a2);
+}
-#ifdef EIGEN_TEST_PART_10
+template<int>
+void nullary_internal_logic()
+{
// check some internal logic
VERIFY(( internal::has_nullary_operator<internal::scalar_constant_op<double> >::value ));
VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<double> >::value ));
@@ -318,5 +301,30 @@ void test_nullary()
VERIFY(( !internal::has_binary_operator<internal::linspaced_op<int,int> >::value ));
VERIFY(( internal::functor_has_linear_access<internal::linspaced_op<int,int> >::ret ));
}
-#endif
+}
+
+EIGEN_DECLARE_TEST(nullary)
+{
+ CALL_SUBTEST_1( testMatrixType(Matrix2d()) );
+ CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random<int>(1,300),internal::random<int>(1,300))) );
+ CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
+
+ for(int i = 0; i < g_repeat*10; i++) {
+ CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
+ CALL_SUBTEST_5( testVectorType(Vector4d()) ); // regression test for bug 232
+ CALL_SUBTEST_6( testVectorType(Vector3d()) );
+ CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
+ CALL_SUBTEST_8( testVectorType(Vector3f()) );
+ CALL_SUBTEST_8( testVectorType(Vector4f()) );
+ CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
+ CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
+
+ CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,10))) );
+ CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(9,300))) );
+ CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
+ }
+
+ CALL_SUBTEST_6( bug79<0>() );
+ CALL_SUBTEST_9( nullary_overflow<0>() );
+ CALL_SUBTEST_10( nullary_internal_logic<0>() );
}
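A note on the magnitude in nullary_overflow: with n = 60000, an intermediate 32-bit product of the form i*(high-low) reaches 59999*59999 (about 3.6e9), past INT_MAX (about 2.1e9), presumably the failure mode this linspaced check guards against (an assumption; the test itself only asserts the final values):

#include <climits>
#include <cstdio>

int main()
{
  long long p = 59999LL * 59999LL;  // worst-case i*(high-low) for n = 60000
  std::printf("%lld > %d -> %d\n", p, INT_MAX, int(p > INT_MAX)); // prints 1
}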
diff --git a/test/num_dimensions.cpp b/test/num_dimensions.cpp
new file mode 100644
index 000000000..7ad7ef697
--- /dev/null
+++ b/test/num_dimensions.cpp
@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/SparseCore>
+
+template<int ExpectedDim,typename Xpr>
+void check_dim(const Xpr& ) {
+ STATIC_CHECK( Xpr::NumDimensions == ExpectedDim );
+}
+
+#if EIGEN_HAS_CXX11
+template<template <typename,int,int> class Object>
+void map_num_dimensions()
+{
+ typedef Object<double, 1, 1> ArrayScalarType;
+ typedef Object<double, 2, 1> ArrayVectorType;
+ typedef Object<double, 1, 2> TransposeArrayVectorType;
+ typedef Object<double, 2, 2> ArrayType;
+ typedef Object<double, Eigen::Dynamic, 1> DynamicArrayVectorType;
+ typedef Object<double, 1, Eigen::Dynamic> DynamicTransposeArrayVectorType;
+ typedef Object<double, Eigen::Dynamic, Eigen::Dynamic> DynamicArrayType;
+
+ STATIC_CHECK(ArrayScalarType::NumDimensions == 0);
+ STATIC_CHECK(ArrayVectorType::NumDimensions == 1);
+ STATIC_CHECK(TransposeArrayVectorType::NumDimensions == 1);
+ STATIC_CHECK(ArrayType::NumDimensions == 2);
+ STATIC_CHECK(DynamicArrayVectorType::NumDimensions == 1);
+ STATIC_CHECK(DynamicTransposeArrayVectorType::NumDimensions == 1);
+ STATIC_CHECK(DynamicArrayType::NumDimensions == 2);
+
+ typedef Eigen::Map<ArrayScalarType> ArrayScalarMap;
+ typedef Eigen::Map<ArrayVectorType> ArrayVectorMap;
+ typedef Eigen::Map<TransposeArrayVectorType> TransposeArrayVectorMap;
+ typedef Eigen::Map<ArrayType> ArrayMap;
+ typedef Eigen::Map<DynamicArrayVectorType> DynamicArrayVectorMap;
+ typedef Eigen::Map<DynamicTransposeArrayVectorType> DynamicTransposeArrayVectorMap;
+ typedef Eigen::Map<DynamicArrayType> DynamicArrayMap;
+
+ STATIC_CHECK(ArrayScalarMap::NumDimensions == 0);
+ STATIC_CHECK(ArrayVectorMap::NumDimensions == 1);
+ STATIC_CHECK(TransposeArrayVectorMap::NumDimensions == 1);
+ STATIC_CHECK(ArrayMap::NumDimensions == 2);
+ STATIC_CHECK(DynamicArrayVectorMap::NumDimensions == 1);
+ STATIC_CHECK(DynamicTransposeArrayVectorMap::NumDimensions == 1);
+ STATIC_CHECK(DynamicArrayMap::NumDimensions == 2);
+}
+
+template<typename Scalar, int Rows, int Cols>
+using TArray = Array<Scalar,Rows,Cols>;
+
+template<typename Scalar, int Rows, int Cols>
+using TMatrix = Matrix<Scalar,Rows,Cols>;
+
+#endif
+
+EIGEN_DECLARE_TEST(num_dimensions)
+{
+ int n = 10;
+ ArrayXXd A(n,n);
+ CALL_SUBTEST( check_dim<2>(A) );
+ CALL_SUBTEST( check_dim<2>(A.block(1,1,2,2)) );
+ CALL_SUBTEST( check_dim<1>(A.col(1)) );
+ CALL_SUBTEST( check_dim<1>(A.row(1)) );
+
+ MatrixXd M(n,n);
+ CALL_SUBTEST( check_dim<0>(M.row(1)*M.col(1)) );
+
+ SparseMatrix<double> S(n,n);
+ CALL_SUBTEST( check_dim<2>(S) );
+ CALL_SUBTEST( check_dim<2>(S.block(1,1,2,2)) );
+ CALL_SUBTEST( check_dim<1>(S.col(1)) );
+ CALL_SUBTEST( check_dim<1>(S.row(1)) );
+
+ SparseVector<double> s(n);
+ CALL_SUBTEST( check_dim<1>(s) );
+ CALL_SUBTEST( check_dim<1>(s.head(2)) );
+
+
+ #if EIGEN_HAS_CXX11
+ CALL_SUBTEST( map_num_dimensions<TArray>() );
+ CALL_SUBTEST( map_num_dimensions<TMatrix>() );
+ #endif
+}
diff --git a/test/numext.cpp b/test/numext.cpp
index 3de33e2f9..6307f5979 100644
--- a/test/numext.cpp
+++ b/test/numext.cpp
@@ -33,7 +33,7 @@ void check_abs() {
}
}
-void test_numext() {
+EIGEN_DECLARE_TEST(numext) {
CALL_SUBTEST( check_abs<bool>() );
CALL_SUBTEST( check_abs<signed char>() );
CALL_SUBTEST( check_abs<unsigned char>() );
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 08b360340..58a1c60bf 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -28,7 +28,7 @@ template<typename T> T negate(const T& x) { return -x; }
}
}
-// NOTE: we disbale inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU.
+// NOTE: we disable inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU.
template<typename Scalar> EIGEN_DONT_INLINE
bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits<Scalar>::Real& refvalue)
{
@@ -123,7 +123,7 @@ template<typename Scalar> void packetmath()
EIGEN_ALIGN_MAX Scalar data2[size];
EIGEN_ALIGN_MAX Packet packets[PacketSize*2];
EIGEN_ALIGN_MAX Scalar ref[size];
- RealScalar refvalue = 0;
+ RealScalar refvalue = RealScalar(0);
for (int i=0; i<size; ++i)
{
data1[i] = internal::random<Scalar>()/RealScalar(PacketSize);
@@ -171,14 +171,18 @@ template<typename Scalar> void packetmath()
for (int i=0; i<PacketSize; ++i)
ref[i] = data1[i+offset];
+ // palign is not used anymore, so just emit a warning if it fails
+ ++g_test_level;
VERIFY(areApprox(ref, data2, PacketSize) && "internal::palign");
+ --g_test_level;
}
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd);
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub);
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul);
VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasNegate);
- VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv);
+ // Disabled as it is not clear why it would be mandatory to support division.
+ //VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv);
CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd);
CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub);
@@ -242,28 +246,30 @@ template<typename Scalar> void packetmath()
}
}
- ref[0] = 0;
+ ref[0] = Scalar(0);
for (int i=0; i<PacketSize; ++i)
ref[0] += data1[i];
VERIFY(isApproxAbs(ref[0], internal::predux(internal::pload<Packet>(data1)), refvalue) && "internal::predux");
+ if(PacketSize==8 && internal::unpacket_traits<typename internal::unpacket_traits<Packet>::half>::size ==4) // so far, predux_half_dowto4 is only required in such a case
{
- for (int i=0; i<4; ++i)
- ref[i] = 0;
+ int HalfPacketSize = PacketSize>4 ? PacketSize/2 : PacketSize;
+ for (int i=0; i<HalfPacketSize; ++i)
+ ref[i] = Scalar(0);
for (int i=0; i<PacketSize; ++i)
- ref[i%4] += data1[i];
- internal::pstore(data2, internal::predux_downto4(internal::pload<Packet>(data1)));
- VERIFY(areApprox(ref, data2, PacketSize>4?PacketSize/2:PacketSize) && "internal::predux_downto4");
+ ref[i%HalfPacketSize] += data1[i];
+ internal::pstore(data2, internal::predux_half_dowto4(internal::pload<Packet>(data1)));
+ VERIFY(areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4");
}
- ref[0] = 1;
+ ref[0] = Scalar(1);
for (int i=0; i<PacketSize; ++i)
ref[0] *= data1[i];
VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");
for (int j=0; j<PacketSize; ++j)
{
- ref[j] = 0;
+ ref[j] = Scalar(0);
for (int i=0; i<PacketSize; ++i)
ref[j] += data1[i+j*PacketSize];
packets[j] = internal::pload<Packet>(data1+j*PacketSize);
@@ -436,6 +442,7 @@ template<typename Scalar> void packetmath_real()
if(internal::random<float>(0,1)<0.1f)
data1[internal::random<int>(0, PacketSize)] = 0;
CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt);
+ CHECK_CWISE1_IF(PacketTraits::HasSqrt, Scalar(1)/std::sqrt, internal::prsqrt);
CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
#if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1);
@@ -449,35 +456,41 @@ template<typename Scalar> void packetmath_real()
{
data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
data1[1] = std::numeric_limits<Scalar>::epsilon();
- packet_helper<PacketTraits::HasLog,Packet> h;
- h.store(data2, internal::plog(h.load(data1)));
- VERIFY((numext::isnan)(data2[0]));
- VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::epsilon()), data2[1]);
-
- data1[0] = -std::numeric_limits<Scalar>::epsilon();
- data1[1] = 0;
- h.store(data2, internal::plog(h.load(data1)));
- VERIFY((numext::isnan)(data2[0]));
- VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]);
-
- data1[0] = (std::numeric_limits<Scalar>::min)();
- data1[1] = -(std::numeric_limits<Scalar>::min)();
- h.store(data2, internal::plog(h.load(data1)));
- VERIFY_IS_EQUAL(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
- VERIFY((numext::isnan)(data2[1]));
-
- data1[0] = std::numeric_limits<Scalar>::denorm_min();
- data1[1] = -std::numeric_limits<Scalar>::denorm_min();
- h.store(data2, internal::plog(h.load(data1)));
- // VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
- VERIFY((numext::isnan)(data2[1]));
-
- data1[0] = Scalar(-1.0f);
- h.store(data2, internal::plog(h.load(data1)));
- VERIFY((numext::isnan)(data2[0]));
- h.store(data2, internal::psqrt(h.load(data1)));
- VERIFY((numext::isnan)(data2[0]));
- VERIFY((numext::isnan)(data2[1]));
+ {
+ packet_helper<PacketTraits::HasLog,Packet> h;
+ h.store(data2, internal::plog(h.load(data1)));
+ VERIFY((numext::isnan)(data2[0]));
+ VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::epsilon()), data2[1]);
+
+ data1[0] = -std::numeric_limits<Scalar>::epsilon();
+ data1[1] = 0;
+ h.store(data2, internal::plog(h.load(data1)));
+ VERIFY((numext::isnan)(data2[0]));
+ VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]);
+
+ data1[0] = (std::numeric_limits<Scalar>::min)();
+ data1[1] = -(std::numeric_limits<Scalar>::min)();
+ h.store(data2, internal::plog(h.load(data1)));
+ VERIFY_IS_EQUAL(std::log((std::numeric_limits<Scalar>::min)()), data2[0]);
+ VERIFY((numext::isnan)(data2[1]));
+
+ data1[0] = std::numeric_limits<Scalar>::denorm_min();
+ data1[1] = -std::numeric_limits<Scalar>::denorm_min();
+ h.store(data2, internal::plog(h.load(data1)));
+ // VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
+ VERIFY((numext::isnan)(data2[1]));
+
+ data1[0] = Scalar(-1.0f);
+ h.store(data2, internal::plog(h.load(data1)));
+ VERIFY((numext::isnan)(data2[0]));
+ }
+ {
+ packet_helper<PacketTraits::HasSqrt,Packet> h;
+ data1[0] = Scalar(-1.0f);
+ h.store(data2, internal::psqrt(h.load(data1)));
+ VERIFY((numext::isnan)(data2[0]));
+ VERIFY((numext::isnan)(data2[1]));
+ }
}
}
@@ -614,7 +627,7 @@ template<typename Scalar> void packetmath_scatter_gather()
}
}
-void test_packetmath()
+EIGEN_DECLARE_TEST(packetmath)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( packetmath<float>() );
@@ -622,6 +635,7 @@ void test_packetmath()
CALL_SUBTEST_3( packetmath<int>() );
CALL_SUBTEST_4( packetmath<std::complex<float> >() );
CALL_SUBTEST_5( packetmath<std::complex<double> >() );
+ CALL_SUBTEST_6( packetmath<half>() );
CALL_SUBTEST_1( packetmath_notcomplex<float>() );
CALL_SUBTEST_2( packetmath_notcomplex<double>() );
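The ++g_test_level / --g_test_level bracket around the palign check above uses the semantics documented in main.h: level 0 aborts on failure, level >= 1 only warns. A toy model of that downgrade:

#include <cstdlib>
#include <iostream>

static int g_test_level = 0;

void verify(bool ok, const char* what)
{
  if (ok) return;
  std::cerr << "check failed: " << what << "\n";
  if (g_test_level == 0) std::abort(); // level 0: hard failure
  // level >= 1: warning only, execution continues
}

int main()
{
  ++g_test_level;          // the palign result is informational only
  verify(false, "palign"); // warns but does not abort
  --g_test_level;
}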
diff --git a/test/pardiso_support.cpp b/test/pardiso_support.cpp
index 67efad6d8..9c16ded5b 100644
--- a/test/pardiso_support.cpp
+++ b/test/pardiso_support.cpp
@@ -20,7 +20,7 @@ template<typename T> void test_pardiso_T()
check_sparse_square_solving(pardiso_lu);
}
-void test_pardiso_support()
+EIGEN_DECLARE_TEST(pardiso_support)
{
CALL_SUBTEST_1(test_pardiso_T<float>());
CALL_SUBTEST_2(test_pardiso_T<double>());
diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp
index b62f85739..9b64417c1 100644
--- a/test/pastix_support.cpp
+++ b/test/pastix_support.cpp
@@ -45,7 +45,7 @@ template<typename T> void test_pastix_T_LU()
check_sparse_square_solving(pastix_lu);
}
-void test_pastix_support()
+EIGEN_DECLARE_TEST(pastix_support)
{
CALL_SUBTEST_1(test_pastix_T<float>());
CALL_SUBTEST_2(test_pastix_T<double>());
diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp
index db1266579..71f09be0f 100644
--- a/test/permutationmatrices.cpp
+++ b/test/permutationmatrices.cpp
@@ -14,14 +14,15 @@
using namespace std;
template<typename MatrixType> void permutationmatrices(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime,
Options = MatrixType::Options };
typedef PermutationMatrix<Rows> LeftPermutationType;
+ typedef Transpositions<Rows> LeftTranspositionsType;
typedef Matrix<int, Rows, 1> LeftPermutationVectorType;
typedef Map<LeftPermutationType> MapLeftPerm;
typedef PermutationMatrix<Cols> RightPermutationType;
+ typedef Transpositions<Cols> RightTranspositionsType;
typedef Matrix<int, Cols, 1> RightPermutationVectorType;
typedef Map<RightPermutationType> MapRightPerm;
@@ -35,6 +36,8 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
RightPermutationVectorType rv;
randomPermutationVector(rv, cols);
RightPermutationType rp(rv);
+ LeftTranspositionsType lt(lv);
+ RightTranspositionsType rt(rv);
MatrixType m_permuted = MatrixType::Random(rows,cols);
VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, 1); // 1 temp for sub expression "lp * m_original"
@@ -115,6 +118,14 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
Matrix<Scalar, Cols, Cols> B = rp.transpose();
VERIFY_IS_APPROX(A, B.transpose());
}
+
+ m_permuted = m_original;
+ lp = lt;
+ rp = rt;
+ VERIFY_EVALUATION_COUNT(m_permuted = lt * m_permuted * rt, 1);
+ VERIFY_IS_APPROX(m_permuted, lp*m_original*rp.transpose());
+
+ VERIFY_IS_APPROX(lt.inverse()*m_permuted*rt.inverse(), m_original);
}
template<typename T>
@@ -141,7 +152,7 @@ void bug890()
VERIFY_IS_APPROX(v1, (P.inverse() * rhs).eval());
}
-void test_permutationmatrices()
+EIGEN_DECLARE_TEST(permutationmatrices)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( permutationmatrices(Matrix<float, 1, 1>()) );
diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
index eb6ad18c9..072466467 100644
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp
@@ -68,7 +68,7 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
}
}
-void test_prec_inverse_4x4()
+EIGEN_DECLARE_TEST(prec_inverse_4x4)
{
CALL_SUBTEST_1((inverse_permutation_4x4<Matrix4f>()));
CALL_SUBTEST_1(( inverse_general_4x4<Matrix4f>(200000 * g_repeat) ));
diff --git a/test/product.h b/test/product.h
index 0425a929e..d26e8063d 100644
--- a/test/product.h
+++ b/test/product.h
@@ -111,6 +111,17 @@ template<typename MatrixType> void product(const MatrixType& m)
vcres.noalias() -= m1.transpose() * v1;
VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1);
+ // test scaled products
+ res = square;
+ res.noalias() = s1 * m1 * m2.transpose();
+ VERIFY_IS_APPROX(res, ((s1*m1).eval() * m2.transpose()));
+ res = square;
+ res.noalias() += s1 * m1 * m2.transpose();
+ VERIFY_IS_APPROX(res, square + ((s1*m1).eval() * m2.transpose()));
+ res = square;
+ res.noalias() -= s1 * m1 * m2.transpose();
+ VERIFY_IS_APPROX(res, square - ((s1*m1).eval() * m2.transpose()));
+
// test d ?= a+b*c rules
res.noalias() = square + m1 * m2.transpose();
VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
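The new scaled-product checks pin down one contract: a scalar factor on a product must give the same result whether it is folded into the product kernel (in the spirit of GEMM's alpha) or applied by materializing s1*m1 first. In miniature, using only public API:

#include <Eigen/Dense>
using namespace Eigen;

int main()
{
  MatrixXd m1 = MatrixXd::Random(4,4), m2 = MatrixXd::Random(4,4), res(4,4);
  double s1 = 2.5;
  res.noalias() = s1 * m1 * m2.transpose();         // scalar stays in the product
  MatrixXd ref = (s1 * m1).eval() * m2.transpose(); // scalar applied up front
  return (res - ref).norm() < 1e-12 ? 0 : 1;        // agree up to roundoff
}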
diff --git a/test/product_extra.cpp b/test/product_extra.cpp
index e2b855bff..bd31df84d 100644
--- a/test/product_extra.cpp
+++ b/test/product_extra.cpp
@@ -11,7 +11,6 @@
template<typename MatrixType> void product_extra(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, 1, Dynamic> RowVectorType;
typedef Matrix<Scalar, Dynamic, 1> ColVectorType;
@@ -353,7 +352,7 @@ void bug_1308()
VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44);
}
-void test_product_extra()
+EIGEN_DECLARE_TEST(product_extra)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( product_extra(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 845cd40ca..ae14b714c 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -30,19 +30,9 @@ void test_aliasing()
x = z;
}
-void test_product_large()
+template<int>
+void product_large_regressions()
{
- for(int i = 0; i < g_repeat; i++) {
- CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
- CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
- CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
- CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
- CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
-
- CALL_SUBTEST_1( test_aliasing<float>() );
- }
-
-#if defined EIGEN_TEST_PART_6
{
// test a specific issue in DiagonalProduct
int N = 1000000;
@@ -95,7 +85,23 @@ void test_product_large()
* (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
VERIFY_IS_APPROX(B,C);
}
-#endif
+}
+
+EIGEN_DECLARE_TEST(product_large)
+{
+ for(int i = 0; i < g_repeat; i++) {
+ CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+ CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+ CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,10), internal::random<int>(1,10))) );
+
+ CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+ CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
+ CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
+
+ CALL_SUBTEST_1( test_aliasing<float>() );
+ }
+
+ CALL_SUBTEST_6( product_large_regressions<0>() );
// Regression test for bug 714:
#if defined EIGEN_HAS_OPENMP
diff --git a/test/product_mmtr.cpp b/test/product_mmtr.cpp
index d3e24b012..bb19e6e52 100644
--- a/test/product_mmtr.cpp
+++ b/test/product_mmtr.cpp
@@ -84,7 +84,7 @@ template<typename Scalar> void mmtr(int size)
VERIFY_IS_APPROX(matc, ref2);
}
-void test_product_mmtr()
+EIGEN_DECLARE_TEST(product_mmtr)
{
for(int i = 0; i < g_repeat ; i++)
{
diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp
index 30592b79e..dffb07608 100644
--- a/test/product_notemporary.cpp
+++ b/test/product_notemporary.cpp
@@ -15,7 +15,6 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
{
/* This test checks the number of temporaries created
* during the evaluation of a complex expression */
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
typedef Matrix<Scalar, 1, Dynamic> RowVectorType;
@@ -128,11 +127,19 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 );
// Check outer products
+ #ifdef EIGEN_ALLOCA
+ bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT;
+ #else
+ bool temp_via_alloca = false;
+ #endif
m3 = cv1 * rv1;
VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 );
- VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 );
+ VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 );
+ rm3 = cv1 * rv1;
+ VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 );
+ VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );
@@ -143,7 +150,7 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
VERIFY_EVALUATION_COUNT( rvres.noalias() = rv1 * (m1 * m2.adjoint()), 1 );
}
-void test_product_notemporary()
+EIGEN_DECLARE_TEST(product_notemporary)
{
int s;
for(int i = 0; i < g_repeat; i++) {
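Sketch of the expectation change above: when EIGEN_ALLOCA is available and the needed buffer fits under EIGEN_STACK_ALLOCATION_LIMIT, the compound outer-product operands are evaluated into stack storage, so the heap-temporary counter sees 0 instead of 1. Illustrative, with small fixed sizes:

VectorXf cv1 = VectorXf::Random(64);
RowVectorXf rv1 = RowVectorXf::Random(64);
MatrixXf m3(64,64);
m3.noalias() = cv1 * rv1;              // plain outer product: written in place, 0 temporaries
m3.noalias() = (cv1+cv1) * (rv1+rv1);  // compound factors are evaluated first; the buffer is
                                       // alloca'd when small enough, otherwise heap-allocated
                                       // and counted as the 1 temporary checked above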
diff --git a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp
index 3d768aa7e..bdccd0491 100644
--- a/test/product_selfadjoint.cpp
+++ b/test/product_selfadjoint.cpp
@@ -11,7 +11,6 @@
template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
typedef Matrix<Scalar, 1, MatrixType::RowsAtCompileTime> RowVectorType;
@@ -60,7 +59,7 @@ template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
}
}
-void test_product_selfadjoint()
+EIGEN_DECLARE_TEST(product_selfadjoint)
{
int s = 0;
for(int i = 0; i < g_repeat ; i++) {
diff --git a/test/product_small.cpp b/test/product_small.cpp
index fdfdd9f6c..b8ce37d90 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -228,7 +228,37 @@ void bug_1311()
VERIFY_IS_APPROX(res, A*b);
}
-void test_product_small()
+template<int>
+void product_small_regressions()
+{
+ {
+ // test compilation of (outer_product) * vector
+ Vector3f v = Vector3f::Random();
+ VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v);
+ }
+
+ {
+ // regression test for pull-request #93
+ Eigen::Matrix<double, 1, 1> A; A.setRandom();
+ Eigen::Matrix<double, 18, 1> B; B.setRandom();
+ Eigen::Matrix<double, 1, 18> C; C.setRandom();
+ VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]);
+ VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C);
+ }
+
+ {
+ Eigen::Matrix<double, 10, 10> A, B, C;
+ A.setRandom();
+ C = A;
+ for(int k=0; k<79; ++k)
+ C = C * A;
+ B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
+ * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
+ VERIFY_IS_APPROX(B,C);
+ }
+}
+
+EIGEN_DECLARE_TEST(product_small)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
@@ -263,31 +293,5 @@ void test_product_small()
CALL_SUBTEST_6( bug_1311<5>() );
}
-#ifdef EIGEN_TEST_PART_6
- {
- // test compilation of (outer_product) * vector
- Vector3f v = Vector3f::Random();
- VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v);
- }
-
- {
- // regression test for pull-request #93
- Eigen::Matrix<double, 1, 1> A; A.setRandom();
- Eigen::Matrix<double, 18, 1> B; B.setRandom();
- Eigen::Matrix<double, 1, 18> C; C.setRandom();
- VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]);
- VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C);
- }
-
- {
- Eigen::Matrix<double, 10, 10> A, B, C;
- A.setRandom();
- C = A;
- for(int k=0; k<79; ++k)
- C = C * A;
- B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
- * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
- VERIFY_IS_APPROX(B,C);
- }
-#endif
+ CALL_SUBTEST_6( product_small_regressions<0>() );
}
diff --git a/test/product_symm.cpp b/test/product_symm.cpp
index 8c44383f9..7d786467d 100644
--- a/test/product_symm.cpp
+++ b/test/product_symm.cpp
@@ -16,7 +16,6 @@ template<typename Scalar, int Size, int OtherSize> void symm(int size = Size, in
typedef Matrix<Scalar, OtherSize, Size> Rhs2;
enum { order = OtherSize==1 ? 0 : RowMajor };
typedef Matrix<Scalar, Size, OtherSize,order> Rhs3;
- typedef typename MatrixType::Index Index;
Index rows = size;
Index cols = size;
@@ -95,7 +94,7 @@ template<typename Scalar, int Size, int OtherSize> void symm(int size = Size, in
}
-void test_product_symm()
+EIGEN_DECLARE_TEST(product_symm)
{
for(int i = 0; i < g_repeat ; i++)
{
diff --git a/test/product_syrk.cpp b/test/product_syrk.cpp
index e10f0f2f2..23406fe4b 100644
--- a/test/product_syrk.cpp
+++ b/test/product_syrk.cpp
@@ -11,7 +11,6 @@
template<typename MatrixType> void syrk(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime, RowMajor> RMatrixType;
typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic> Rhs1;
@@ -118,7 +117,7 @@ template<typename MatrixType> void syrk(const MatrixType& m)
((s1 * m1.row(c).adjoint() * m1.row(c).adjoint().adjoint()).eval().template triangularView<Upper>().toDenseMatrix()));
}
-void test_product_syrk()
+EIGEN_DECLARE_TEST(product_syrk)
{
for(int i = 0; i < g_repeat ; i++)
{
diff --git a/test/product_trmm.cpp b/test/product_trmm.cpp
index e08d9f39f..c7594e512 100644
--- a/test/product_trmm.cpp
+++ b/test/product_trmm.cpp
@@ -115,7 +115,7 @@ void trmm(int rows=get_random_size<Scalar>(), int cols=get_random_size<Scalar>()
CALL_ALL_ORDERS(EIGEN_CAT(3,NB),SCALAR,StrictlyLower)
-void test_product_trmm()
+EIGEN_DECLARE_TEST(product_trmm)
{
for(int i = 0; i < g_repeat ; i++)
{
diff --git a/test/product_trmv.cpp b/test/product_trmv.cpp
index 57a202afc..5eb1b5ac0 100644
--- a/test/product_trmv.cpp
+++ b/test/product_trmv.cpp
@@ -11,7 +11,6 @@
template<typename MatrixType> void trmv(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
@@ -71,7 +70,7 @@ template<typename MatrixType> void trmv(const MatrixType& m)
// TODO check with sub-matrices
}
-void test_product_trmv()
+EIGEN_DECLARE_TEST(product_trmv)
{
int s = 0;
for(int i = 0; i < g_repeat ; i++) {
diff --git a/test/product_trsolve.cpp b/test/product_trsolve.cpp
index 4b97fa9d6..0c22cccf6 100644
--- a/test/product_trsolve.cpp
+++ b/test/product_trsolve.cpp
@@ -73,7 +73,7 @@ template<typename Scalar,int Size, int Cols> void trsolve(int size=Size,int cols
VERIFY_TRSM(cmLhs.template triangularView<Lower>(), rmRhs.col(c));
}
-void test_product_trsolve()
+EIGEN_DECLARE_TEST(product_trsolve)
{
for(int i = 0; i < g_repeat ; i++)
{
diff --git a/test/qr.cpp b/test/qr.cpp
index dfcc1e8f9..4799aa9ef 100644
--- a/test/qr.cpp
+++ b/test/qr.cpp
@@ -12,8 +12,6 @@
template<typename MatrixType> void qr(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
-
Index rows = m.rows();
Index cols = m.cols();
@@ -85,7 +83,7 @@ template<typename MatrixType> void qr_invertible()
qr.compute(m1);
VERIFY_IS_APPROX(log(absdet), qr.logAbsDeterminant());
// This test is tricky if the determinant becomes too small.
- // Since we generate random numbers with magnitude rrange [0,1], the average determinant is 0.5^size
+ // Since we generate random numbers with magnitude range [0,1], the average determinant is 0.5^size
VERIFY_IS_MUCH_SMALLER_THAN( abs(absdet-qr.absDeterminant()), numext::maxi(RealScalar(pow(0.5,size)),numext::maxi<RealScalar>(abs(absdet),abs(qr.absDeterminant()))) );
}
@@ -102,7 +100,7 @@ template<typename MatrixType> void qr_verify_assert()
VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
}
-void test_qr()
+EIGEN_DECLARE_TEST(qr)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( qr(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE),internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp
index 26ed27f5c..d224a9436 100644
--- a/test/qr_colpivoting.cpp
+++ b/test/qr_colpivoting.cpp
@@ -14,8 +14,6 @@
template <typename MatrixType>
void cod() {
- typedef typename MatrixType::Index Index;
-
Index rows = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
Index cols = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
Index cols2 = internal::random<Index>(2, EIGEN_TEST_MAX_SIZE);
@@ -94,7 +92,6 @@ void cod_fixedsize() {
template<typename MatrixType> void qr()
{
using std::sqrt;
- typedef typename MatrixType::Index Index;
Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols2 = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
Index rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
@@ -211,7 +208,6 @@ template<typename MatrixType> void qr_kahan_matrix()
{
using std::sqrt;
using std::abs;
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
@@ -300,7 +296,7 @@ template<typename MatrixType> void qr_verify_assert()
VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
}
-void test_qr_colpivoting()
+EIGEN_DECLARE_TEST(qr_colpivoting)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( qr<MatrixXf>() );
diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp
index 70e89c198..150b4256c 100644
--- a/test/qr_fullpivoting.cpp
+++ b/test/qr_fullpivoting.cpp
@@ -13,13 +13,12 @@
template<typename MatrixType> void qr()
{
- typedef typename MatrixType::Index Index;
-
+ static const int Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime;
Index max_size = EIGEN_TEST_MAX_SIZE;
Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10);
- Index rows = internal::random<Index>(min_size,max_size),
- cols = internal::random<Index>(min_size,max_size),
- cols2 = internal::random<Index>(min_size,max_size),
+ Index rows = Rows == Dynamic ? internal::random<Index>(min_size,max_size) : Rows,
+ cols = Cols == Dynamic ? internal::random<Index>(min_size,max_size) : Cols,
+ cols2 = Cols == Dynamic ? internal::random<Index>(min_size,max_size) : Cols,
rank = internal::random<Index>(1, (std::min)(rows, cols)-1);
typedef typename MatrixType::Scalar Scalar;
@@ -126,11 +125,12 @@ template<typename MatrixType> void qr_verify_assert()
VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
}
-void test_qr_fullpivoting()
+EIGEN_DECLARE_TEST(qr_fullpivoting)
{
- for(int i = 0; i < 1; i++) {
- // FIXME : very weird bug here
-// CALL_SUBTEST(qr(Matrix2f()) );
+ for(int i = 0; i < 1; i++) {
+ CALL_SUBTEST_5( qr<Matrix3f>() );
+ CALL_SUBTEST_6( qr<Matrix3d>() );
+ CALL_SUBTEST_8( qr<Matrix2f>() );
CALL_SUBTEST_1( qr<MatrixXf>() );
CALL_SUBTEST_2( qr<MatrixXd>() );
CALL_SUBTEST_3( qr<MatrixXcd>() );
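The rewritten size setup is what lets the newly enabled fixed-size instantiations (Matrix2f, Matrix3f, Matrix3d) share the templated body. The idiom in isolation, as a sketch:

static const int Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime;
Index rows = Rows == Dynamic ? internal::random<Index>(min_size, max_size) : Rows;
Index cols = Cols == Dynamic ? internal::random<Index>(min_size, max_size) : Cols;
MatrixType m1(rows, cols);  // valid for both: fixed-size ctors assert the runtime sizes match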
diff --git a/test/qtvector.cpp b/test/qtvector.cpp
index 2be885e48..4ec79b1e6 100644
--- a/test/qtvector.cpp
+++ b/test/qtvector.cpp
@@ -18,8 +18,6 @@
template<typename MatrixType>
void check_qtvector_matrix(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
-
Index rows = m.rows();
Index cols = m.cols();
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
@@ -127,7 +125,7 @@ void check_qtvector_quaternion(const QuaternionType&)
}
}
-void test_qtvector()
+EIGEN_DECLARE_TEST(qtvector)
{
// some non vectorizable fixed sizes
CALL_SUBTEST(check_qtvector_matrix(Vector2f()));
diff --git a/test/rand.cpp b/test/rand.cpp
index 51cf01773..1b5c058ab 100644
--- a/test/rand.cpp
+++ b/test/rand.cpp
@@ -54,7 +54,7 @@ template<typename Scalar> void check_histogram(Scalar x, Scalar y, int bins)
VERIFY( (((hist.cast<double>()/double(f))-1.0).abs()<0.02).all() );
}
-void test_rand()
+EIGEN_DECLARE_TEST(rand)
{
long long_ref = NumTraits<long>::highest()/10;
signed char char_offset = (std::min)(g_repeat,64);
diff --git a/test/real_qz.cpp b/test/real_qz.cpp
index 99ac31235..1cf7aba2d 100644
--- a/test/real_qz.cpp
+++ b/test/real_qz.cpp
@@ -18,7 +18,6 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
RealQZ.h
*/
using std::abs;
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index dim = m.cols();
@@ -76,7 +75,7 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
VERIFY_IS_APPROX(qz.matrixZ()*qz.matrixZ().adjoint(), MatrixType::Identity(dim,dim));
}
-void test_real_qz()
+EIGEN_DECLARE_TEST(real_qz)
{
int s = 0;
for(int i = 0; i < g_repeat; i++) {
diff --git a/test/redux.cpp b/test/redux.cpp
index 2bade3735..9e3ed4546 100644
--- a/test/redux.cpp
+++ b/test/redux.cpp
@@ -16,7 +16,6 @@
template<typename MatrixType> void matrixRedux(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
@@ -81,7 +80,6 @@ template<typename MatrixType> void matrixRedux(const MatrixType& m)
template<typename VectorType> void vectorRedux(const VectorType& w)
{
using std::abs;
- typedef typename VectorType::Index Index;
typedef typename VectorType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
Index size = w.size();
@@ -148,7 +146,7 @@ template<typename VectorType> void vectorRedux(const VectorType& w)
VERIFY_RAISES_ASSERT(v.head(0).maxCoeff());
}
-void test_redux()
+EIGEN_DECLARE_TEST(redux)
{
// the max size cannot be too large, otherwise reduxion operations obviously generate large errors.
int maxsize = (std::min)(100,EIGEN_TEST_MAX_SIZE);
diff --git a/test/ref.cpp b/test/ref.cpp
index 769db0414..a94ea9677 100644
--- a/test/ref.cpp
+++ b/test/ref.cpp
@@ -13,7 +13,7 @@
#endif
#define TEST_ENABLE_TEMPORARY_TRACKING
-
+#define TEST_CHECK_STATIC_ASSERTIONS
#include "main.h"
// test Ref.h
@@ -32,7 +32,6 @@
template<typename MatrixType> void ref_matrix(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
typedef Matrix<Scalar,Dynamic,Dynamic,MatrixType::Options> DynMatrixType;
@@ -80,7 +79,6 @@ template<typename MatrixType> void ref_matrix(const MatrixType& m)
template<typename VectorType> void ref_vector(const VectorType& m)
{
- typedef typename VectorType::Index Index;
typedef typename VectorType::Scalar Scalar;
typedef typename VectorType::RealScalar RealScalar;
typedef Matrix<Scalar,Dynamic,1,VectorType::Options> DynMatrixType;
@@ -255,7 +253,18 @@ void test_ref_overloads()
test_ref_ambiguous(A, B);
}
-void test_ref()
+void test_ref_fixed_size_assert()
+{
+ Vector4f v4;
+ VectorXf vx(10);
+ VERIFY_RAISES_STATIC_ASSERT( Ref<Vector3f> y = v4; (void)y; );
+ VERIFY_RAISES_STATIC_ASSERT( Ref<Vector3f> y = vx.head<4>(); (void)y; );
+ VERIFY_RAISES_STATIC_ASSERT( Ref<const Vector3f> y = v4; (void)y; );
+ VERIFY_RAISES_STATIC_ASSERT( Ref<const Vector3f> y = vx.head<4>(); (void)y; );
+ VERIFY_RAISES_STATIC_ASSERT( Ref<const Vector3f> y = 2*v4; (void)y; );
+}
+
+EIGEN_DECLARE_TEST(ref)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( ref_vector(Matrix<float, 1, 1>()) );
@@ -277,4 +286,5 @@ void test_ref()
}
CALL_SUBTEST_7( test_ref_overloads() );
+ CALL_SUBTEST_7( test_ref_fixed_size_assert() );
}
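Sketch of the compile-time rule the new test_ref_fixed_size_assert pins down: a fixed-size Ref binds only to expressions with the exact same compile-time size, while a dynamic-size Ref accepts any size. Illustrative:

Vector4f v4;
Ref<Vector4f> ok = v4;      // same fixed size: binds
Ref<VectorXf> any = v4;     // dynamic-size Ref: binds to any contiguous vector
// Ref<Vector3f> bad = v4;  // rejected at compile time (3 != 4) by the
                            // static assertion exercised above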
diff --git a/test/resize.cpp b/test/resize.cpp
index 4adaafe56..646a75b8f 100644
--- a/test/resize.cpp
+++ b/test/resize.cpp
@@ -33,7 +33,7 @@ void resizeLikeTest12() { resizeLikeTest<1,2>(); }
void resizeLikeTest1020() { resizeLikeTest<10,20>(); }
void resizeLikeTest31() { resizeLikeTest<3,1>(); }
-void test_resize()
+EIGEN_DECLARE_TEST(resize)
{
CALL_SUBTEST(resizeLikeTest12() );
CALL_SUBTEST(resizeLikeTest1020() );
diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp
index 8887f1b1b..5f52fb3bc 100644
--- a/test/rvalue_types.cpp
+++ b/test/rvalue_types.cpp
@@ -43,7 +43,7 @@ template <typename MatrixType>
void rvalue_copyassign(const MatrixType&) {}
#endif
-void test_rvalue_types()
+EIGEN_DECLARE_TEST(rvalue_types)
{
CALL_SUBTEST_1(rvalue_copyassign( MatrixXf::Random(50,50).eval() ));
CALL_SUBTEST_1(rvalue_copyassign( ArrayXXf::Random(50,50).eval() ));
diff --git a/test/schur_complex.cpp b/test/schur_complex.cpp
index deb78e44e..03e17e81d 100644
--- a/test/schur_complex.cpp
+++ b/test/schur_complex.cpp
@@ -79,7 +79,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
}
}
-void test_schur_complex()
+EIGEN_DECLARE_TEST(schur_complex)
{
CALL_SUBTEST_1(( schur<Matrix4cd>() ));
CALL_SUBTEST_2(( schur<MatrixXcf>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4)) ));
diff --git a/test/schur_real.cpp b/test/schur_real.cpp
index 4aede87df..945461027 100644
--- a/test/schur_real.cpp
+++ b/test/schur_real.cpp
@@ -13,8 +13,6 @@
template<typename MatrixType> void verifyIsQuasiTriangular(const MatrixType& T)
{
- typedef typename MatrixType::Index Index;
-
const Index size = T.cols();
typedef typename MatrixType::Scalar Scalar;
@@ -100,7 +98,7 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTim
}
}
-void test_schur_real()
+EIGEN_DECLARE_TEST(schur_real)
{
CALL_SUBTEST_1(( schur<Matrix4f>() ));
CALL_SUBTEST_2(( schur<MatrixXd>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4)) ));
diff --git a/test/selfadjoint.cpp b/test/selfadjoint.cpp
index 92401e506..9ca9cef9e 100644
--- a/test/selfadjoint.cpp
+++ b/test/selfadjoint.cpp
@@ -7,6 +7,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#define TEST_CHECK_STATIC_ASSERTIONS
#include "main.h"
// This file tests the basic selfadjointView API,
@@ -14,7 +15,6 @@
template<typename MatrixType> void selfadjoint(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index rows = m.rows();
@@ -45,6 +45,9 @@ template<typename MatrixType> void selfadjoint(const MatrixType& m)
m4 = m2;
m4 -= m1.template selfadjointView<Lower>();
VERIFY_IS_APPROX(m4, m2-m3);
+
+ VERIFY_RAISES_STATIC_ASSERT(m2.template selfadjointView<StrictlyUpper>());
+ VERIFY_RAISES_STATIC_ASSERT(m2.template selfadjointView<UnitLower>());
}
void bug_159()
@@ -53,7 +56,7 @@ void bug_159()
EIGEN_UNUSED_VARIABLE(m)
}
-void test_selfadjoint()
+EIGEN_DECLARE_TEST(selfadjoint)
{
for(int i = 0; i < g_repeat ; i++)
{
diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp
index 649c817b4..314b903e2 100644
--- a/test/simplicial_cholesky.cpp
+++ b/test/simplicial_cholesky.cpp
@@ -35,11 +35,11 @@ template<typename T, typename I> void test_simplicial_cholesky_T()
check_sparse_spd_determinant(ldlt_colmajor_lower_amd);
check_sparse_spd_determinant(ldlt_colmajor_upper_amd);
- check_sparse_spd_solving(ldlt_colmajor_lower_nat, 300, 1000);
- check_sparse_spd_solving(ldlt_colmajor_upper_nat, 300, 1000);
+ check_sparse_spd_solving(ldlt_colmajor_lower_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000);
+ check_sparse_spd_solving(ldlt_colmajor_upper_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000);
}
-void test_simplicial_cholesky()
+EIGEN_DECLARE_TEST(simplicial_cholesky)
{
CALL_SUBTEST_1(( test_simplicial_cholesky_T<double,int>() ));
CALL_SUBTEST_2(( test_simplicial_cholesky_T<std::complex<double>, int>() ));
diff --git a/test/sizeof.cpp b/test/sizeof.cpp
index 03ad20453..af34e97dd 100644
--- a/test/sizeof.cpp
+++ b/test/sizeof.cpp
@@ -15,10 +15,10 @@ template<typename MatrixType> void verifySizeOf(const MatrixType&)
if (MatrixType::RowsAtCompileTime!=Dynamic && MatrixType::ColsAtCompileTime!=Dynamic)
VERIFY_IS_EQUAL(std::ptrdiff_t(sizeof(MatrixType)),std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime));
else
- VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index));
+ VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(Index));
}
-void test_sizeof()
+EIGEN_DECLARE_TEST(sizeof)
{
CALL_SUBTEST(verifySizeOf(Matrix<float, 1, 1>()) );
CALL_SUBTEST(verifySizeOf(Array<float, 2, 1>()) );
diff --git a/test/sizeoverflow.cpp b/test/sizeoverflow.cpp
index 240d22294..421351233 100644
--- a/test/sizeoverflow.cpp
+++ b/test/sizeoverflow.cpp
@@ -34,7 +34,7 @@ void triggerVectorBadAlloc(Index size)
VERIFY_THROWS_BADALLOC( VectorType v; v.conservativeResize(size) );
}
-void test_sizeoverflow()
+EIGEN_DECLARE_TEST(sizeoverflow)
{
// there are 2 levels of overflow checking. first in PlainObjectBase.h we check for overflow in rows*cols computations.
// this is tested in tests of the form times_itself_gives_0 * times_itself_gives_0
diff --git a/test/smallvectors.cpp b/test/smallvectors.cpp
index 781511397..f9803acbb 100644
--- a/test/smallvectors.cpp
+++ b/test/smallvectors.cpp
@@ -57,7 +57,7 @@ template<typename Scalar> void smallVectors()
}
}
-void test_smallvectors()
+EIGEN_DECLARE_TEST(smallvectors)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST(smallVectors<int>() );
diff --git a/test/sparseLM.cpp b/test/sparseLM.cpp
index 8e148f9bc..a48fcb685 100644
--- a/test/sparseLM.cpp
+++ b/test/sparseLM.cpp
@@ -168,7 +168,7 @@ void test_sparseLM_T()
return ;
}
-void test_sparseLM()
+EIGEN_DECLARE_TEST(sparseLM)
{
CALL_SUBTEST_1(test_sparseLM_T<double>());
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index f84b6e3f5..3691e8dad 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -640,8 +640,22 @@ void big_sparse_triplet(Index rows, Index cols, double density) {
VERIFY_IS_APPROX(sum, m.sum());
}
+template<int>
+void bug1105()
+{
+ // Regression test for bug 1105
+ int n = Eigen::internal::random<int>(200,600);
+ SparseMatrix<std::complex<double>,0, long> mat(n, n);
+  std::complex<double> val;
 
-void test_sparse_basic()
+  for(int i=0; i<n; ++i)
+ {
+ mat.coeffRef(i, i%(n/10)) = val;
+ VERIFY(mat.data().allocatedSize()<20*n);
+ }
+}
+
+EIGEN_DECLARE_TEST(sparse_basic)
{
for(int i = 0; i < g_repeat; i++) {
int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
@@ -671,18 +685,5 @@ void test_sparse_basic()
CALL_SUBTEST_3((big_sparse_triplet<SparseMatrix<float, RowMajor, int> >(10000, 10000, 0.125)));
CALL_SUBTEST_4((big_sparse_triplet<SparseMatrix<double, ColMajor, long int> >(10000, 10000, 0.125)));
- // Regression test for bug 1105
-#ifdef EIGEN_TEST_PART_7
- {
- int n = Eigen::internal::random<int>(200,600);
- SparseMatrix<std::complex<double>,0, long> mat(n, n);
- std::complex<double> val;
-
- for(int i=0; i<n; ++i)
- {
- mat.coeffRef(i, i%(n/10)) = val;
- VERIFY(mat.data().allocatedSize()<20*n);
- }
- }
-#endif
+ CALL_SUBTEST_7( bug1105<0>() );
}
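The bound checked in bug1105, allocatedSize() < 20*n, guards against the pathological reallocation growth of repeated coeffRef() insertion that bug 1105 reported. For contrast, a sketch of the ad-hoc path versus the preferred bulk construction:

int n = 100;
SparseMatrix<double> m(n, n);
for(int i=0; i<n; ++i)
  m.coeffRef(i, i%(n/10)) = 1.0;          // ad-hoc insertion, as in the regression above

std::vector<Triplet<double> > entries;    // bulk construction avoids repeated
for(int i=0; i<n; ++i)                    // reallocation entirely
  entries.push_back(Triplet<double>(i, i%(n/10), 1.0));
m.setFromTriplets(entries.begin(), entries.end());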
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
index 2a0b3b617..f9668102c 100644
--- a/test/sparse_block.cpp
+++ b/test/sparse_block.cpp
@@ -8,6 +8,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "sparse.h"
+#include "AnnoyingScalar.h"
template<typename T>
typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==RowMajorBit, typename T::RowXpr>::type
@@ -31,6 +32,7 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
const Index outer = ref.outerSize();
typedef typename SparseMatrixType::Scalar Scalar;
+ typedef typename SparseMatrixType::RealScalar RealScalar;
typedef typename SparseMatrixType::StorageIndex StorageIndex;
double density = (std::max)(8./(rows*cols), 0.01);
@@ -164,14 +166,14 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
{
VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
if(j>0)
- VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
+ VERIFY(RealScalar(j)==numext::real(m3.innerVector(j).lastCoeff()));
}
m3.makeCompressed();
for(Index j=0; j<(std::min)(outer, inner); ++j)
{
VERIFY(j==numext::real(m3.innerVector(j).nonZeros()));
if(j>0)
- VERIFY(j==numext::real(m3.innerVector(j).lastCoeff()));
+ VERIFY(RealScalar(j)==numext::real(m3.innerVector(j).lastCoeff()));
}
VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros());
@@ -288,7 +290,7 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
}
}
-void test_sparse_block()
+EIGEN_DECLARE_TEST(sparse_block)
{
for(int i = 0; i < g_repeat; i++) {
int r = Eigen::internal::random<int>(1,200), c = Eigen::internal::random<int>(1,200);
@@ -313,5 +315,8 @@ void test_sparse_block()
CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,ColMajor,short int>(short(r), short(c))) ));
CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,RowMajor,short int>(short(r), short(c))) ));
+
+ AnnoyingScalar::dont_throw = true;
+ CALL_SUBTEST_5(( sparse_block(SparseMatrix<AnnoyingScalar>(r,c)) ));
}
}
diff --git a/test/sparse_permutations.cpp b/test/sparse_permutations.cpp
index b82cceff8..e93493c39 100644
--- a/test/sparse_permutations.cpp
+++ b/test/sparse_permutations.cpp
@@ -220,7 +220,7 @@ template<typename Scalar> void sparse_permutations_all(int size)
CALL_SUBTEST(( sparse_permutations<RowMajor>(SparseMatrix<Scalar, RowMajor>(size,size)) ));
}
-void test_sparse_permutations()
+EIGEN_DECLARE_TEST(sparse_permutations)
{
for(int i = 0; i < g_repeat; i++) {
int s = Eigen::internal::random<int>(1,50);
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index f47170b72..db1b0e833 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -7,6 +7,12 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#if defined(_MSC_VER) && (_MSC_VER==1800)
+// This unit test takes forever to compile in Release mode with MSVC 2013,
+// multiple hours. So let's switch off optimization for this one.
+#pragma optimize("",off)
+#endif
+
static long int nb_temporaries;
inline void on_temporary_creation() {
@@ -453,7 +459,7 @@ void test_mixing_types()
VERIFY_IS_APPROX( dC2 = sC1 * dR1.col(0), dC3 = sC1 * dR1.template cast<Cplx>().col(0) );
}
-void test_sparse_product()
+EIGEN_DECLARE_TEST(sparse_product)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( (sparse_product<SparseMatrix<double,ColMajor> >()) );
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
index 5e9607234..12b6f8a9d 100644
--- a/test/sparse_ref.cpp
+++ b/test/sparse_ref.cpp
@@ -126,7 +126,7 @@ void call_ref()
VERIFY_EVALUATION_COUNT( call_ref_5(A.row(2), A.row(2).transpose()), 1);
}
-void test_sparse_ref()
+EIGEN_DECLARE_TEST(sparse_ref)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( check_const_correctness(SparseMatrix<float>()) );
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index 5145bc3eb..19416ed5d 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -266,7 +266,7 @@ std::string solver_stats(const SparseSolverBase<Derived> &/*solver*/)
}
#endif
-template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxSize = 300, int maxRealWorldSize = 100000)
+template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxSize = (std::min)(300,EIGEN_TEST_MAX_SIZE), int maxRealWorldSize = 100000)
{
typedef typename Solver::MatrixType Mat;
typedef typename Mat::Scalar Scalar;
diff --git a/test/sparse_solvers.cpp b/test/sparse_solvers.cpp
index 3a8873d43..aaf3d39c9 100644
--- a/test/sparse_solvers.cpp
+++ b/test/sparse_solvers.cpp
@@ -101,7 +101,7 @@ template<typename Scalar> void sparse_solvers(int rows, int cols)
}
}
-void test_sparse_solvers()
+EIGEN_DECLARE_TEST(sparse_solvers)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(sparse_solvers<double>(8, 8) );
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index b3e1dda25..35129278b 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -145,7 +145,7 @@ template<typename Scalar,typename StorageIndex> void sparse_vector(int rows, int
}
-void test_sparse_vector()
+EIGEN_DECLARE_TEST(sparse_vector)
{
for(int i = 0; i < g_repeat; i++) {
int r = Eigen::internal::random<int>(1,500), c = Eigen::internal::random<int>(1,500);
diff --git a/test/sparselu.cpp b/test/sparselu.cpp
index bd000baf1..84cc6ebe5 100644
--- a/test/sparselu.cpp
+++ b/test/sparselu.cpp
@@ -36,7 +36,7 @@ template<typename T> void test_sparselu_T()
check_sparse_square_determinant(sparselu_amd);
}
-void test_sparselu()
+EIGEN_DECLARE_TEST(sparselu)
{
CALL_SUBTEST_1(test_sparselu_T<float>());
CALL_SUBTEST_2(test_sparselu_T<double>());
diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp
index e8605fd21..3ffe62314 100644
--- a/test/sparseqr.cpp
+++ b/test/sparseqr.cpp
@@ -54,6 +54,28 @@ template<typename Scalar> void test_sparseqr_scalar()
b = dA * DenseVector::Random(A.cols());
solver.compute(A);
+
+ // Q should be MxM
+ VERIFY_IS_EQUAL(solver.matrixQ().rows(), A.rows());
+ VERIFY_IS_EQUAL(solver.matrixQ().cols(), A.rows());
+
+ // R should be MxN
+ VERIFY_IS_EQUAL(solver.matrixR().rows(), A.rows());
+ VERIFY_IS_EQUAL(solver.matrixR().cols(), A.cols());
+
+ // Q and R can be multiplied
+ DenseMat recoveredA = solver.matrixQ()
+ * DenseMat(solver.matrixR().template triangularView<Upper>())
+ * solver.colsPermutation().transpose();
+ VERIFY_IS_EQUAL(recoveredA.rows(), A.rows());
+ VERIFY_IS_EQUAL(recoveredA.cols(), A.cols());
+
+ // and in the full rank case the original matrix is recovered
+ if (solver.rank() == A.cols())
+ {
+ VERIFY_IS_APPROX(A, recoveredA);
+ }
+
if(internal::random<float>(0,1)>0.5f)
solver.factorize(A); // this checks that calling analyzePattern is not needed if the pattern do not change.
if (solver.info() != Success)
@@ -95,7 +117,7 @@ template<typename Scalar> void test_sparseqr_scalar()
dQ = solver.matrixQ();
VERIFY_IS_APPROX(Q, dQ);
}
-void test_sparseqr()
+EIGEN_DECLARE_TEST(sparseqr)
{
for(int i=0; i<g_repeat; ++i)
{
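Sketch of the identity the new shape checks encode: with A of size m x n, the factorization is A*P = Q*R, Q being m x m and R m x n upper triangular, so A = Q*R*P^T whenever A has full column rank. Mirroring the checks above in plain, non-template form; A is assumed populated elsewhere:

SparseMatrix<double> A;  // m x n, assumed filled and compressed
SparseQR<SparseMatrix<double>, COLAMDOrdering<int> > qr(A);
MatrixXd recovered = qr.matrixQ()
                   * MatrixXd(qr.matrixR().triangularView<Upper>())
                   * qr.colsPermutation().transpose();
// recovered matches A (up to rounding) exactly when qr.rank() == A.cols()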
diff --git a/test/special_numbers.cpp b/test/special_numbers.cpp
index 2f1b704be..1e1a63631 100644
--- a/test/special_numbers.cpp
+++ b/test/special_numbers.cpp
@@ -49,7 +49,7 @@ template<typename Scalar> void special_numbers()
VERIFY(!mboth.array().allFinite());
}
-void test_special_numbers()
+EIGEN_DECLARE_TEST(special_numbers)
{
for(int i = 0; i < 10*g_repeat; i++) {
CALL_SUBTEST_1( special_numbers<float>() );
diff --git a/test/split_test_helper.h b/test/split_test_helper.h
new file mode 100644
index 000000000..82e82aaef
--- /dev/null
+++ b/test/split_test_helper.h
@@ -0,0 +1,5994 @@
+#if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_1(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_1(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_2(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_2(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_3) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_3(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_3(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_4) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_4(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_4(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_5) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_5(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_5(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_6) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_6(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_6(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_7) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_7(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_7(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_8) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_8(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_8(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_9) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_9(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_9(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_10) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_10(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_10(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_11) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_11(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_11(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_12) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_12(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_12(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_13) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_13(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_13(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_14) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_14(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_14(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_15) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_15(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_15(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_16) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_16(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_16(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_17) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_17(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_17(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_18) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_18(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_18(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_19) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_19(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_19(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_20) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_20(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_20(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_21) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_21(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_21(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_22) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_22(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_22(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_23) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_23(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_23(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_24) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_24(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_24(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_25) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_25(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_25(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_26) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_26(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_26(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_27) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_27(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_27(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_28) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_28(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_28(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_29) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_29(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_29(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_30) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_30(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_30(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_31) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_31(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_31(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_32) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_32(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_32(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_33) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_33(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_33(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_34) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_34(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_34(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_35) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_35(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_35(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_36) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_36(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_36(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_37) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_37(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_37(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_38) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_38(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_38(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_39) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_39(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_39(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_40) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_40(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_40(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_41) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_41(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_41(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_42) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_42(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_42(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_43) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_43(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_43(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_44) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_44(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_44(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_45) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_45(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_45(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_46) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_46(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_46(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_47) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_47(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_47(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_48) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_48(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_48(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_49) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_49(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_49(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_50) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_50(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_50(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_51) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_51(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_51(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_52) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_52(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_52(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_53) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_53(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_53(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_54) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_54(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_54(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_55) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_55(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_55(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_56) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_56(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_56(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_57) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_57(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_57(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_58) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_58(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_58(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_59) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_59(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_59(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_60) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_60(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_60(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_61) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_61(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_61(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_62) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_62(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_62(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_63) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_63(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_63(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_64) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_64(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_64(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_65) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_65(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_65(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_66) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_66(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_66(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_67) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_67(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_67(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_68) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_68(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_68(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_69) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_69(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_69(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_70) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_70(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_70(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_71) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_71(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_71(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_72) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_72(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_72(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_73) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_73(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_73(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_74) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_74(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_74(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_75) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_75(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_75(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_76) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_76(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_76(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_77) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_77(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_77(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_78) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_78(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_78(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_79) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_79(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_79(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_80) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_80(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_80(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_81) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_81(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_81(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_82) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_82(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_82(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_83) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_83(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_83(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_84) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_84(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_84(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_85) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_85(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_85(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_86) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_86(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_86(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_87) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_87(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_87(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_88) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_88(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_88(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_89) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_89(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_89(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_90) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_90(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_90(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_91) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_91(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_91(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_92) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_92(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_92(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_93) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_93(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_93(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_94) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_94(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_94(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_95) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_95(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_95(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_96) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_96(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_96(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_97) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_97(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_97(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_98) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_98(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_98(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_99) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_99(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_99(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_100) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_100(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_100(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_101) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_101(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_101(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_102) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_102(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_102(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_103) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_103(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_103(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_104) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_104(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_104(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_105) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_105(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_105(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_106) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_106(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_106(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_107) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_107(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_107(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_108) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_108(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_108(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_109) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_109(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_109(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_110) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_110(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_110(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_111) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_111(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_111(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_112) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_112(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_112(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_113) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_113(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_113(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_114) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_114(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_114(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_115) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_115(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_115(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_116) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_116(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_116(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_117) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_117(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_117(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_118) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_118(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_118(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_119) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_119(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_119(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_120) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_120(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_120(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_121) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_121(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_121(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_122) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_122(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_122(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_123) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_123(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_123(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_124) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_124(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_124(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_125) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_125(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_125(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_126) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_126(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_126(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_127) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_127(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_127(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_128) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_128(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_128(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_129) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_129(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_129(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_130) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_130(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_130(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_131) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_131(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_131(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_132) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_132(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_132(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_133) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_133(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_133(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_134) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_134(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_134(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_135) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_135(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_135(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_136) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_136(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_136(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_137) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_137(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_137(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_138) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_138(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_138(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_139) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_139(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_139(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_140) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_140(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_140(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_141) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_141(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_141(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_142) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_142(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_142(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_143) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_143(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_143(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_144) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_144(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_144(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_145) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_145(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_145(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_146) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_146(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_146(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_147) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_147(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_147(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_148) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_148(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_148(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_149) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_149(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_149(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_150) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_150(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_150(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_151) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_151(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_151(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_152) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_152(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_152(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_153) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_153(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_153(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_154) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_154(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_154(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_155) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_155(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_155(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_156) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_156(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_156(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_157) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_157(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_157(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_158) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_158(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_158(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_159) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_159(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_159(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_160) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_160(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_160(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_161) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_161(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_161(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_162) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_162(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_162(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_163) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_163(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_163(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_164) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_164(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_164(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_165) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_165(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_165(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_166) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_166(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_166(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_167) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_167(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_167(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_168) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_168(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_168(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_169) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_169(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_169(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_170) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_170(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_170(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_171) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_171(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_171(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_172) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_172(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_172(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_173) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_173(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_173(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_174) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_174(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_174(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_175) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_175(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_175(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_176) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_176(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_176(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_177) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_177(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_177(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_178) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_178(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_178(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_179) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_179(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_179(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_180) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_180(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_180(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_181) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_181(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_181(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_182) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_182(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_182(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_183) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_183(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_183(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_184) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_184(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_184(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_185) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_185(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_185(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_186) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_186(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_186(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_187) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_187(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_187(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_188) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_188(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_188(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_189) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_189(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_189(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_190) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_190(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_190(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_191) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_191(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_191(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_192) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_192(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_192(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_193) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_193(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_193(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_194) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_194(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_194(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_195) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_195(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_195(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_196) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_196(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_196(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_197) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_197(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_197(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_198) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_198(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_198(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_199) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_199(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_199(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_200) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_200(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_200(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_201) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_201(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_201(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_202) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_202(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_202(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_203) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_203(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_203(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_204) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_204(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_204(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_205) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_205(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_205(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_206) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_206(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_206(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_207) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_207(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_207(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_208) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_208(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_208(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_209) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_209(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_209(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_210) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_210(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_210(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_211) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_211(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_211(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_212) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_212(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_212(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_213) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_213(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_213(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_214) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_214(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_214(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_215) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_215(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_215(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_216) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_216(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_216(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_217) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_217(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_217(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_218) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_218(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_218(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_219) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_219(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_219(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_220) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_220(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_220(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_221) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_221(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_221(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_222) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_222(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_222(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_223) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_223(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_223(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_224) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_224(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_224(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_225) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_225(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_225(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_226) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_226(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_226(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_227) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_227(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_227(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_228) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_228(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_228(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_229) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_229(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_229(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_230) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_230(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_230(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_231) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_231(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_231(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_232) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_232(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_232(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_233) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_233(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_233(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_234) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_234(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_234(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_235) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_235(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_235(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_236) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_236(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_236(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_237) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_237(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_237(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_238) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_238(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_238(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_239) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_239(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_239(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_240) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_240(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_240(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_241) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_241(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_241(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_242) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_242(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_242(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_243) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_243(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_243(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_244) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_244(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_244(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_245) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_245(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_245(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_246) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_246(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_246(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_247) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_247(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_247(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_248) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_248(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_248(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_249) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_249(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_249(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_250) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_250(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_250(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_251) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_251(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_251(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_252) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_252(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_252(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_253) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_253(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_253(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_254) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_254(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_254(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_255) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_255(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_255(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_256) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_256(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_256(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_257) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_257(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_257(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_258) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_258(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_258(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_259) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_259(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_259(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_260) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_260(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_260(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_261) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_261(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_261(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_262) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_262(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_262(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_263) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_263(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_263(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_264) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_264(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_264(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_265) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_265(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_265(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_266) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_266(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_266(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_267) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_267(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_267(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_268) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_268(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_268(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_269) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_269(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_269(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_270) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_270(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_270(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_271) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_271(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_271(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_272) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_272(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_272(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_273) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_273(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_273(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_274) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_274(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_274(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_275) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_275(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_275(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_276) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_276(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_276(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_277) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_277(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_277(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_278) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_278(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_278(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_279) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_279(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_279(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_280) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_280(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_280(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_281) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_281(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_281(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_282) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_282(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_282(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_283) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_283(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_283(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_284) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_284(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_284(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_285) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_285(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_285(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_286) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_286(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_286(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_287) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_287(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_287(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_288) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_288(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_288(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_289) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_289(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_289(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_290) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_290(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_290(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_291) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_291(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_291(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_292) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_292(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_292(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_293) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_293(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_293(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_294) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_294(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_294(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_295) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_295(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_295(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_296) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_296(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_296(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_297) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_297(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_297(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_298) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_298(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_298(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_299) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_299(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_299(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_300) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_300(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_300(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_301) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_301(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_301(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_302) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_302(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_302(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_303) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_303(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_303(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_304) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_304(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_304(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_305) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_305(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_305(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_306) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_306(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_306(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_307) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_307(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_307(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_308) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_308(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_308(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_309) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_309(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_309(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_310) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_310(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_310(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_311) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_311(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_311(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_312) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_312(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_312(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_313) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_313(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_313(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_314) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_314(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_314(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_315) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_315(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_315(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_316) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_316(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_316(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_317) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_317(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_317(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_318) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_318(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_318(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_319) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_319(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_319(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_320) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_320(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_320(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_321) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_321(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_321(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_322) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_322(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_322(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_323) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_323(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_323(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_324) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_324(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_324(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_325) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_325(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_325(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_326) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_326(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_326(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_327) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_327(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_327(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_328) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_328(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_328(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_329) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_329(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_329(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_330) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_330(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_330(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_331) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_331(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_331(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_332) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_332(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_332(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_333) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_333(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_333(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_334) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_334(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_334(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_335) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_335(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_335(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_336) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_336(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_336(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_337) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_337(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_337(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_338) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_338(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_338(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_339) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_339(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_339(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_340) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_340(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_340(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_341) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_341(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_341(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_342) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_342(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_342(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_343) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_343(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_343(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_344) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_344(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_344(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_345) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_345(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_345(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_346) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_346(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_346(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_347) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_347(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_347(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_348) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_348(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_348(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_349) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_349(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_349(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_350) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_350(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_350(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_351) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_351(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_351(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_352) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_352(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_352(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_353) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_353(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_353(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_354) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_354(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_354(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_355) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_355(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_355(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_356) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_356(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_356(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_357) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_357(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_357(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_358) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_358(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_358(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_359) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_359(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_359(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_360) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_360(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_360(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_361) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_361(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_361(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_362) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_362(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_362(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_363) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_363(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_363(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_364) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_364(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_364(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_365) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_365(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_365(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_366) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_366(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_366(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_367) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_367(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_367(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_368) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_368(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_368(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_369) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_369(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_369(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_370) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_370(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_370(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_371) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_371(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_371(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_372) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_372(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_372(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_373) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_373(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_373(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_374) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_374(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_374(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_375) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_375(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_375(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_376) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_376(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_376(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_377) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_377(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_377(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_378) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_378(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_378(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_379) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_379(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_379(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_380) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_380(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_380(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_381) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_381(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_381(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_382) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_382(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_382(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_383) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_383(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_383(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_384) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_384(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_384(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_385) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_385(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_385(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_386) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_386(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_386(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_387) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_387(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_387(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_388) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_388(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_388(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_389) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_389(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_389(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_390) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_390(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_390(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_391) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_391(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_391(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_392) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_392(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_392(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_393) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_393(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_393(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_394) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_394(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_394(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_395) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_395(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_395(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_396) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_396(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_396(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_397) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_397(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_397(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_398) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_398(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_398(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_399) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_399(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_399(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_400) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_400(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_400(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_401) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_401(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_401(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_402) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_402(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_402(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_403) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_403(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_403(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_404) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_404(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_404(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_405) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_405(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_405(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_406) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_406(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_406(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_407) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_407(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_407(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_408) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_408(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_408(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_409) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_409(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_409(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_410) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_410(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_410(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_411) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_411(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_411(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_412) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_412(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_412(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_413) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_413(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_413(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_414) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_414(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_414(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_415) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_415(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_415(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_416) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_416(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_416(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_417) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_417(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_417(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_418) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_418(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_418(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_419) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_419(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_419(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_420) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_420(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_420(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_421) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_421(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_421(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_422) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_422(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_422(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_423) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_423(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_423(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_424) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_424(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_424(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_425) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_425(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_425(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_426) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_426(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_426(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_427) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_427(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_427(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_428) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_428(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_428(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_429) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_429(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_429(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_430) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_430(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_430(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_431) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_431(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_431(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_432) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_432(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_432(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_433) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_433(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_433(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_434) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_434(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_434(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_435) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_435(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_435(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_436) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_436(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_436(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_437) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_437(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_437(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_438) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_438(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_438(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_439) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_439(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_439(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_440) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_440(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_440(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_441) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_441(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_441(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_442) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_442(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_442(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_443) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_443(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_443(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_444) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_444(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_444(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_445) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_445(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_445(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_446) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_446(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_446(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_447) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_447(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_447(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_448) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_448(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_448(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_449) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_449(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_449(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_450) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_450(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_450(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_451) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_451(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_451(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_452) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_452(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_452(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_453) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_453(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_453(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_454) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_454(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_454(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_455) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_455(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_455(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_456) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_456(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_456(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_457) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_457(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_457(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_458) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_458(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_458(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_459) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_459(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_459(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_460) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_460(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_460(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_461) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_461(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_461(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_462) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_462(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_462(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_463) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_463(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_463(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_464) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_464(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_464(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_465) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_465(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_465(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_466) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_466(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_466(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_467) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_467(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_467(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_468) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_468(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_468(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_469) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_469(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_469(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_470) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_470(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_470(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_471) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_471(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_471(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_472) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_472(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_472(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_473) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_473(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_473(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_474) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_474(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_474(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_475) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_475(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_475(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_476) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_476(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_476(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_477) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_477(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_477(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_478) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_478(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_478(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_479) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_479(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_479(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_480) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_480(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_480(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_481) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_481(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_481(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_482) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_482(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_482(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_483) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_483(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_483(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_484) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_484(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_484(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_485) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_485(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_485(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_486) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_486(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_486(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_487) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_487(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_487(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_488) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_488(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_488(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_489) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_489(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_489(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_490) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_490(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_490(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_491) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_491(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_491(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_492) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_492(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_492(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_493) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_493(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_493(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_494) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_494(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_494(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_495) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_495(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_495(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_496) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_496(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_496(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_497) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_497(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_497(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_498) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_498(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_498(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_499) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_499(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_499(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_500) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_500(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_500(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_501) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_501(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_501(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_502) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_502(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_502(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_503) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_503(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_503(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_504) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_504(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_504(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_505) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_505(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_505(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_506) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_506(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_506(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_507) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_507(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_507(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_508) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_508(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_508(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_509) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_509(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_509(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_510) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_510(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_510(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_511) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_511(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_511(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_512) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_512(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_512(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_513) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_513(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_513(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_514) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_514(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_514(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_515) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_515(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_515(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_516) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_516(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_516(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_517) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_517(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_517(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_518) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_518(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_518(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_519) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_519(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_519(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_520) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_520(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_520(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_521) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_521(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_521(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_522) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_522(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_522(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_523) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_523(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_523(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_524) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_524(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_524(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_525) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_525(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_525(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_526) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_526(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_526(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_527) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_527(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_527(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_528) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_528(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_528(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_529) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_529(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_529(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_530) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_530(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_530(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_531) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_531(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_531(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_532) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_532(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_532(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_533) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_533(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_533(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_534) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_534(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_534(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_535) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_535(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_535(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_536) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_536(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_536(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_537) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_537(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_537(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_538) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_538(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_538(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_539) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_539(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_539(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_540) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_540(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_540(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_541) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_541(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_541(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_542) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_542(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_542(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_543) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_543(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_543(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_544) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_544(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_544(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_545) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_545(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_545(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_546) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_546(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_546(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_547) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_547(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_547(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_548) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_548(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_548(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_549) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_549(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_549(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_550) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_550(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_550(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_551) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_551(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_551(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_552) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_552(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_552(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_553) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_553(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_553(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_554) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_554(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_554(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_555) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_555(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_555(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_556) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_556(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_556(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_557) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_557(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_557(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_558) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_558(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_558(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_559) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_559(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_559(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_560) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_560(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_560(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_561) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_561(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_561(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_562) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_562(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_562(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_563) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_563(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_563(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_564) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_564(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_564(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_565) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_565(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_565(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_566) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_566(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_566(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_567) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_567(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_567(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_568) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_568(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_568(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_569) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_569(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_569(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_570) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_570(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_570(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_571) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_571(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_571(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_572) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_572(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_572(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_573) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_573(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_573(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_574) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_574(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_574(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_575) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_575(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_575(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_576) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_576(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_576(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_577) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_577(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_577(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_578) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_578(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_578(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_579) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_579(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_579(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_580) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_580(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_580(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_581) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_581(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_581(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_582) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_582(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_582(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_583) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_583(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_583(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_584) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_584(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_584(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_585) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_585(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_585(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_586) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_586(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_586(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_587) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_587(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_587(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_588) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_588(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_588(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_589) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_589(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_589(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_590) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_590(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_590(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_591) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_591(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_591(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_592) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_592(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_592(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_593) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_593(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_593(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_594) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_594(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_594(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_595) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_595(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_595(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_596) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_596(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_596(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_597) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_597(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_597(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_598) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_598(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_598(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_599) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_599(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_599(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_600) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_600(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_600(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_601) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_601(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_601(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_602) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_602(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_602(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_603) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_603(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_603(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_604) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_604(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_604(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_605) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_605(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_605(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_606) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_606(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_606(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_607) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_607(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_607(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_608) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_608(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_608(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_609) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_609(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_609(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_610) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_610(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_610(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_611) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_611(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_611(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_612) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_612(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_612(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_613) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_613(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_613(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_614) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_614(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_614(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_615) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_615(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_615(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_616) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_616(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_616(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_617) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_617(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_617(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_618) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_618(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_618(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_619) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_619(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_619(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_620) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_620(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_620(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_621) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_621(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_621(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_622) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_622(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_622(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_623) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_623(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_623(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_624) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_624(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_624(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_625) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_625(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_625(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_626) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_626(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_626(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_627) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_627(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_627(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_628) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_628(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_628(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_629) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_629(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_629(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_630) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_630(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_630(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_631) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_631(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_631(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_632) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_632(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_632(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_633) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_633(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_633(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_634) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_634(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_634(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_635) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_635(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_635(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_636) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_636(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_636(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_637) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_637(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_637(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_638) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_638(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_638(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_639) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_639(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_639(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_640) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_640(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_640(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_641) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_641(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_641(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_642) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_642(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_642(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_643) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_643(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_643(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_644) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_644(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_644(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_645) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_645(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_645(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_646) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_646(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_646(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_647) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_647(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_647(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_648) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_648(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_648(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_649) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_649(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_649(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_650) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_650(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_650(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_651) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_651(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_651(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_652) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_652(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_652(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_653) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_653(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_653(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_654) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_654(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_654(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_655) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_655(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_655(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_656) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_656(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_656(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_657) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_657(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_657(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_658) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_658(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_658(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_659) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_659(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_659(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_660) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_660(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_660(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_661) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_661(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_661(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_662) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_662(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_662(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_663) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_663(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_663(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_664) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_664(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_664(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_665) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_665(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_665(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_666) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_666(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_666(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_667) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_667(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_667(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_668) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_668(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_668(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_669) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_669(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_669(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_670) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_670(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_670(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_671) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_671(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_671(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_672) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_672(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_672(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_673) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_673(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_673(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_674) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_674(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_674(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_675) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_675(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_675(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_676) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_676(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_676(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_677) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_677(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_677(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_678) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_678(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_678(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_679) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_679(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_679(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_680) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_680(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_680(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_681) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_681(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_681(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_682) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_682(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_682(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_683) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_683(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_683(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_684) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_684(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_684(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_685) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_685(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_685(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_686) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_686(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_686(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_687) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_687(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_687(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_688) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_688(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_688(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_689) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_689(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_689(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_690) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_690(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_690(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_691) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_691(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_691(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_692) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_692(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_692(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_693) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_693(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_693(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_694) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_694(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_694(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_695) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_695(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_695(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_696) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_696(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_696(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_697) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_697(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_697(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_698) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_698(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_698(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_699) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_699(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_699(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_700) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_700(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_700(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_701) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_701(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_701(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_702) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_702(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_702(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_703) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_703(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_703(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_704) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_704(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_704(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_705) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_705(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_705(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_706) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_706(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_706(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_707) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_707(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_707(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_708) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_708(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_708(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_709) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_709(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_709(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_710) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_710(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_710(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_711) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_711(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_711(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_712) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_712(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_712(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_713) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_713(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_713(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_714) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_714(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_714(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_715) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_715(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_715(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_716) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_716(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_716(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_717) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_717(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_717(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_718) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_718(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_718(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_719) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_719(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_719(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_720) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_720(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_720(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_721) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_721(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_721(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_722) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_722(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_722(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_723) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_723(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_723(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_724) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_724(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_724(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_725) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_725(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_725(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_726) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_726(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_726(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_727) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_727(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_727(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_728) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_728(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_728(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_729) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_729(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_729(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_730) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_730(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_730(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_731) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_731(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_731(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_732) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_732(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_732(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_733) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_733(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_733(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_734) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_734(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_734(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_735) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_735(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_735(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_736) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_736(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_736(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_737) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_737(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_737(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_738) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_738(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_738(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_739) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_739(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_739(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_740) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_740(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_740(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_741) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_741(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_741(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_742) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_742(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_742(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_743) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_743(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_743(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_744) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_744(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_744(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_745) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_745(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_745(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_746) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_746(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_746(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_747) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_747(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_747(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_748) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_748(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_748(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_749) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_749(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_749(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_750) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_750(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_750(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_751) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_751(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_751(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_752) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_752(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_752(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_753) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_753(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_753(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_754) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_754(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_754(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_755) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_755(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_755(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_756) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_756(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_756(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_757) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_757(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_757(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_758) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_758(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_758(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_759) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_759(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_759(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_760) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_760(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_760(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_761) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_761(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_761(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_762) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_762(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_762(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_763) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_763(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_763(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_764) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_764(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_764(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_765) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_765(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_765(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_766) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_766(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_766(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_767) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_767(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_767(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_768) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_768(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_768(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_769) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_769(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_769(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_770) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_770(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_770(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_771) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_771(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_771(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_772) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_772(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_772(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_773) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_773(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_773(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_774) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_774(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_774(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_775) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_775(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_775(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_776) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_776(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_776(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_777) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_777(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_777(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_778) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_778(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_778(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_779) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_779(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_779(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_780) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_780(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_780(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_781) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_781(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_781(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_782) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_782(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_782(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_783) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_783(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_783(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_784) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_784(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_784(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_785) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_785(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_785(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_786) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_786(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_786(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_787) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_787(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_787(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_788) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_788(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_788(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_789) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_789(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_789(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_790) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_790(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_790(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_791) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_791(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_791(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_792) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_792(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_792(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_793) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_793(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_793(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_794) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_794(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_794(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_795) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_795(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_795(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_796) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_796(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_796(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_797) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_797(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_797(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_798) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_798(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_798(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_799) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_799(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_799(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_800) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_800(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_800(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_801) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_801(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_801(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_802) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_802(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_802(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_803) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_803(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_803(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_804) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_804(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_804(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_805) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_805(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_805(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_806) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_806(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_806(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_807) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_807(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_807(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_808) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_808(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_808(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_809) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_809(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_809(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_810) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_810(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_810(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_811) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_811(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_811(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_812) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_812(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_812(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_813) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_813(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_813(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_814) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_814(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_814(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_815) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_815(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_815(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_816) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_816(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_816(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_817) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_817(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_817(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_818) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_818(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_818(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_819) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_819(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_819(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_820) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_820(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_820(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_821) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_821(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_821(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_822) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_822(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_822(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_823) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_823(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_823(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_824) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_824(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_824(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_825) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_825(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_825(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_826) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_826(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_826(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_827) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_827(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_827(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_828) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_828(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_828(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_829) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_829(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_829(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_830) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_830(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_830(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_831) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_831(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_831(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_832) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_832(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_832(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_833) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_833(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_833(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_834) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_834(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_834(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_835) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_835(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_835(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_836) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_836(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_836(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_837) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_837(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_837(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_838) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_838(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_838(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_839) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_839(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_839(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_840) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_840(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_840(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_841) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_841(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_841(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_842) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_842(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_842(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_843) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_843(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_843(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_844) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_844(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_844(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_845) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_845(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_845(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_846) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_846(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_846(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_847) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_847(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_847(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_848) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_848(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_848(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_849) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_849(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_849(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_850) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_850(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_850(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_851) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_851(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_851(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_852) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_852(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_852(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_853) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_853(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_853(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_854) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_854(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_854(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_855) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_855(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_855(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_856) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_856(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_856(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_857) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_857(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_857(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_858) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_858(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_858(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_859) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_859(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_859(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_860) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_860(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_860(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_861) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_861(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_861(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_862) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_862(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_862(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_863) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_863(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_863(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_864) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_864(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_864(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_865) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_865(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_865(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_866) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_866(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_866(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_867) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_867(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_867(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_868) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_868(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_868(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_869) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_869(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_869(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_870) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_870(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_870(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_871) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_871(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_871(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_872) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_872(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_872(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_873) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_873(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_873(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_874) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_874(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_874(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_875) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_875(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_875(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_876) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_876(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_876(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_877) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_877(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_877(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_878) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_878(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_878(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_879) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_879(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_879(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_880) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_880(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_880(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_881) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_881(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_881(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_882) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_882(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_882(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_883) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_883(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_883(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_884) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_884(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_884(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_885) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_885(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_885(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_886) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_886(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_886(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_887) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_887(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_887(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_888) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_888(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_888(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_889) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_889(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_889(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_890) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_890(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_890(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_891) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_891(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_891(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_892) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_892(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_892(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_893) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_893(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_893(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_894) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_894(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_894(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_895) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_895(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_895(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_896) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_896(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_896(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_897) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_897(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_897(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_898) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_898(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_898(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_899) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_899(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_899(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_900) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_900(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_900(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_901) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_901(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_901(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_902) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_902(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_902(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_903) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_903(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_903(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_904) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_904(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_904(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_905) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_905(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_905(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_906) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_906(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_906(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_907) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_907(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_907(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_908) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_908(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_908(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_909) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_909(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_909(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_910) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_910(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_910(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_911) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_911(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_911(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_912) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_912(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_912(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_913) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_913(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_913(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_914) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_914(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_914(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_915) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_915(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_915(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_916) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_916(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_916(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_917) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_917(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_917(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_918) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_918(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_918(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_919) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_919(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_919(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_920) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_920(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_920(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_921) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_921(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_921(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_922) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_922(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_922(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_923) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_923(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_923(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_924) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_924(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_924(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_925) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_925(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_925(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_926) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_926(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_926(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_927) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_927(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_927(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_928) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_928(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_928(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_929) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_929(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_929(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_930) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_930(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_930(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_931) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_931(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_931(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_932) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_932(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_932(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_933) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_933(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_933(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_934) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_934(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_934(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_935) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_935(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_935(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_936) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_936(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_936(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_937) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_937(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_937(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_938) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_938(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_938(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_939) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_939(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_939(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_940) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_940(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_940(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_941) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_941(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_941(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_942) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_942(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_942(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_943) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_943(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_943(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_944) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_944(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_944(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_945) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_945(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_945(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_946) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_946(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_946(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_947) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_947(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_947(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_948) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_948(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_948(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_949) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_949(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_949(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_950) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_950(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_950(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_951) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_951(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_951(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_952) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_952(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_952(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_953) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_953(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_953(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_954) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_954(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_954(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_955) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_955(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_955(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_956) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_956(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_956(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_957) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_957(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_957(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_958) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_958(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_958(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_959) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_959(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_959(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_960) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_960(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_960(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_961) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_961(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_961(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_962) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_962(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_962(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_963) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_963(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_963(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_964) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_964(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_964(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_965) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_965(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_965(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_966) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_966(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_966(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_967) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_967(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_967(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_968) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_968(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_968(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_969) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_969(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_969(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_970) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_970(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_970(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_971) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_971(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_971(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_972) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_972(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_972(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_973) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_973(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_973(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_974) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_974(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_974(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_975) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_975(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_975(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_976) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_976(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_976(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_977) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_977(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_977(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_978) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_978(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_978(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_979) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_979(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_979(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_980) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_980(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_980(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_981) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_981(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_981(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_982) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_982(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_982(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_983) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_983(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_983(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_984) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_984(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_984(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_985) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_985(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_985(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_986) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_986(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_986(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_987) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_987(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_987(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_988) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_988(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_988(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_989) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_989(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_989(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_990) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_990(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_990(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_991) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_991(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_991(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_992) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_992(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_992(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_993) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_993(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_993(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_994) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_994(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_994(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_995) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_995(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_995(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_996) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_996(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_996(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_997) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_997(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_997(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_998) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_998(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_998(FUNC)
+#endif
+
+#if defined(EIGEN_TEST_PART_999) || defined(EIGEN_TEST_PART_ALL)
+#define CALL_SUBTEST_999(FUNC) CALL_SUBTEST(FUNC)
+#else
+#define CALL_SUBTEST_999(FUNC)
+#endif
+
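The CALL_SUBTEST_N macros generated above implement Eigen's test partitioning: each macro forwards to CALL_SUBTEST only when its part is selected at compile time, so one test source can be split into many small binaries. A minimal sketch of the same pattern outside Eigen's harness (the names here are illustrative, not Eigen's):

    #include <cstdio>

    // Each subtest macro compiles to a real call only when its part is enabled.
    #if defined(TEST_PART_1) || defined(TEST_PART_ALL)
    #define CALL_SUBTEST_1(FUNC) do { FUNC; } while (0)
    #else
    #define CALL_SUBTEST_1(FUNC)   // compiled out
    #endif

    #if defined(TEST_PART_2) || defined(TEST_PART_ALL)
    #define CALL_SUBTEST_2(FUNC) do { FUNC; } while (0)
    #else
    #define CALL_SUBTEST_2(FUNC)   // compiled out
    #endif

    void subtest_a() { std::puts("subtest 1 ran"); }
    void subtest_b() { std::puts("subtest 2 ran"); }

    int main() {
      CALL_SUBTEST_1(subtest_a());  // runs only with -DTEST_PART_1 or -DTEST_PART_ALL
      CALL_SUBTEST_2(subtest_b());  // runs only with -DTEST_PART_2 or -DTEST_PART_ALL
    }

Compiling with, e.g., -DTEST_PART_2 yields a binary that runs only subtest 2, which is how each part can be scheduled as an independent CTest entry.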
diff --git a/test/spqr_support.cpp b/test/spqr_support.cpp
index 81e63b6a5..79c2c12fc 100644
--- a/test/spqr_support.cpp
+++ b/test/spqr_support.cpp
@@ -57,7 +57,7 @@ template<typename Scalar> void test_spqr_scalar()
refX = dA.colPivHouseholderQr().solve(b);
VERIFY(x.isApprox(refX,test_precision<Scalar>()));
}
-void test_spqr_support()
+EIGEN_DECLARE_TEST(spqr_support)
{
CALL_SUBTEST_1(test_spqr_scalar<double>());
CALL_SUBTEST_2(test_spqr_scalar<std::complex<double> >());
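This hunk shows the rename that recurs throughout this patch: each hand-written entry point void test_xxx() becomes EIGEN_DECLARE_TEST(xxx), so the harness in test/main.h owns the declaration. A hypothetical stand-in with the same surface behavior (an illustration only, not Eigen's actual macro):

    // Hypothetical: the macro emits both the declaration and the definition
    // header, so the body that follows becomes the test's entry point.
    #define EIGEN_DECLARE_TEST(X) \
      void test_##X();            \
      void test_##X()

    EIGEN_DECLARE_TEST(spqr_support)  // expands to: void test_spqr_support(); void test_spqr_support()
    {
      // CALL_SUBTEST_* invocations go here
    }

Whatever the real macro does beyond this (registration, wiring into main) lives in test/main.h and is not visible in this hunk.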
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index 3c02474b8..ee5f91674 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -21,7 +21,6 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
*/
using std::sqrt;
using std::abs;
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -190,13 +189,51 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
}
}
-void test_stable_norm()
+template<typename Scalar>
+void test_hypot()
+{
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ Scalar factor = internal::random<Scalar>();
+ while(numext::abs2(factor)<RealScalar(1e-4))
+ factor = internal::random<Scalar>();
+ Scalar big = factor * ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
+
+ factor = internal::random<Scalar>();
+ while(numext::abs2(factor)<RealScalar(1e-4))
+ factor = internal::random<Scalar>();
+ Scalar small = factor * ((std::numeric_limits<RealScalar>::min)() * RealScalar(1e4));
+
+ Scalar one (1),
+ zero (0),
+ sqrt2 (std::sqrt(2)),
+ nan (std::numeric_limits<RealScalar>::quiet_NaN());
+
+ Scalar a = internal::random<Scalar>(-1,1);
+ Scalar b = internal::random<Scalar>(-1,1);
+ VERIFY_IS_APPROX(numext::hypot(a,b),std::sqrt(numext::abs2(a)+numext::abs2(b)));
+ VERIFY_IS_EQUAL(numext::hypot(zero,zero), zero);
+ VERIFY_IS_APPROX(numext::hypot(one, one), sqrt2);
+ VERIFY_IS_APPROX(numext::hypot(big,big), sqrt2*numext::abs(big));
+ VERIFY_IS_APPROX(numext::hypot(small,small), sqrt2*numext::abs(small));
+ VERIFY_IS_APPROX(numext::hypot(small,big), numext::abs(big));
+ VERIFY((numext::isnan)(numext::hypot(nan,a)));
+ VERIFY((numext::isnan)(numext::hypot(a,nan)));
+}
+
+EIGEN_DECLARE_TEST(stable_norm)
{
for(int i = 0; i < g_repeat; i++) {
+ CALL_SUBTEST_3( test_hypot<double>() );
+ CALL_SUBTEST_4( test_hypot<float>() );
+ CALL_SUBTEST_5( test_hypot<std::complex<double> >() );
+ CALL_SUBTEST_6( test_hypot<std::complex<float> >() );
+
CALL_SUBTEST_1( stable_norm(Matrix<float, 1, 1>()) );
CALL_SUBTEST_2( stable_norm(Vector4d()) );
CALL_SUBTEST_3( stable_norm(VectorXd(internal::random<int>(10,2000))) );
+ CALL_SUBTEST_3( stable_norm(MatrixXd(internal::random<int>(10,200), internal::random<int>(10,200))) );
CALL_SUBTEST_4( stable_norm(VectorXf(internal::random<int>(10,2000))) );
CALL_SUBTEST_5( stable_norm(VectorXcd(internal::random<int>(10,2000))) );
+ CALL_SUBTEST_6( stable_norm(VectorXcf(internal::random<int>(10,2000))) );
}
}
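The new test_hypot above stresses numext::hypot with operands near the overflow and underflow thresholds, where the naive sqrt(a*a + b*b) fails. A sketch of the standard rescaling that makes such a hypot safe (illustrative, not necessarily Eigen's exact implementation):

    #include <algorithm>
    #include <cmath>

    // Factor out the larger magnitude so the squared ratio stays in [0, 1]:
    // no intermediate overflow for big inputs, no underflow for small ones.
    double safe_hypot(double x, double y) {
      double ax = std::abs(x), ay = std::abs(y);
      double hi = std::max(ax, ay);
      double lo = std::min(ax, ay);
      if (hi == 0.0) return 0.0;           // matches the hypot(zero, zero) check
      double r = lo / hi;                  // r <= 1
      return hi * std::sqrt(1.0 + r * r);  // == sqrt(x*x + y*y), overflow-free
    }

With hi close to numeric_limits<double>::max(), the naive form overflows to infinity while the rescaled form returns approximately sqrt(2)*hi, which is exactly what the hypot(big, big) assertion verifies.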
diff --git a/test/stddeque.cpp b/test/stddeque.cpp
index bb4b476f3..0a6aa2572 100644
--- a/test/stddeque.cpp
+++ b/test/stddeque.cpp
@@ -15,8 +15,6 @@
template<typename MatrixType>
void check_stddeque_matrix(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
-
Index rows = m.rows();
Index cols = m.cols();
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
@@ -102,7 +100,7 @@ void check_stddeque_quaternion(const QuaternionType&)
VERIFY_IS_APPROX(v.back(), x);
}
-void test_stddeque()
+EIGEN_DECLARE_TEST(stddeque)
{
// some non vectorizable fixed sizes
CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
diff --git a/test/stddeque_overload.cpp b/test/stddeque_overload.cpp
index 4da618bbf..eebe93d81 100644
--- a/test/stddeque_overload.cpp
+++ b/test/stddeque_overload.cpp
@@ -28,8 +28,8 @@ EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(Quaterniond)
template<typename MatrixType>
void check_stddeque_matrix(const MatrixType& m)
{
- typename MatrixType::Index rows = m.rows();
- typename MatrixType::Index cols = m.cols();
+ Index rows = m.rows();
+ Index cols = m.cols();
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
std::deque<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
v[5] = x;
@@ -128,7 +128,7 @@ void check_stddeque_quaternion(const QuaternionType&)
}
}
-void test_stddeque_overload()
+EIGEN_DECLARE_TEST(stddeque_overload)
{
// some non vectorizable fixed sizes
CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
diff --git a/test/stdlist.cpp b/test/stdlist.cpp
index 17cce779a..c9e5b7286 100644
--- a/test/stdlist.cpp
+++ b/test/stdlist.cpp
@@ -15,8 +15,6 @@
template<typename MatrixType>
void check_stdlist_matrix(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
-
Index rows = m.rows();
Index cols = m.cols();
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
@@ -102,7 +100,7 @@ void check_stdlist_quaternion(const QuaternionType&)
VERIFY_IS_APPROX(v.back(), x);
}
-void test_stdlist()
+EIGEN_DECLARE_TEST(stdlist)
{
// some non vectorizable fixed sizes
CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
diff --git a/test/stdlist_overload.cpp b/test/stdlist_overload.cpp
index bb910bd43..93b6fc9ed 100644
--- a/test/stdlist_overload.cpp
+++ b/test/stdlist_overload.cpp
@@ -44,8 +44,8 @@ void set(Container & c, Position position, const Value & value)
template<typename MatrixType>
void check_stdlist_matrix(const MatrixType& m)
{
- typename MatrixType::Index rows = m.rows();
- typename MatrixType::Index cols = m.cols();
+ Index rows = m.rows();
+ Index cols = m.cols();
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
std::list<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
typename std::list<MatrixType>::iterator itv = get(v, 5);
@@ -162,7 +162,7 @@ void check_stdlist_quaternion(const QuaternionType&)
}
}
-void test_stdlist_overload()
+EIGEN_DECLARE_TEST(stdlist_overload)
{
// some non vectorizable fixed sizes
CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
diff --git a/test/stdvector.cpp b/test/stdvector.cpp
index 50cb3341d..e2b7bd061 100644
--- a/test/stdvector.cpp
+++ b/test/stdvector.cpp
@@ -14,8 +14,8 @@
template<typename MatrixType>
void check_stdvector_matrix(const MatrixType& m)
{
- typename MatrixType::Index rows = m.rows();
- typename MatrixType::Index cols = m.cols();
+ Index rows = m.rows();
+ Index cols = m.cols();
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
std::vector<MatrixType,Eigen::aligned_allocator<MatrixType> > v(10, MatrixType(rows,cols)), w(20, y);
v[5] = x;
@@ -117,7 +117,7 @@ void check_stdvector_quaternion(const QuaternionType&)
}
}
-void test_stdvector()
+EIGEN_DECLARE_TEST(stdvector)
{
// some non vectorizable fixed sizes
CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
diff --git a/test/stdvector_overload.cpp b/test/stdvector_overload.cpp
index 959665954..5c042a64c 100644
--- a/test/stdvector_overload.cpp
+++ b/test/stdvector_overload.cpp
@@ -28,8 +28,8 @@ EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(Quaterniond)
template<typename MatrixType>
void check_stdvector_matrix(const MatrixType& m)
{
- typename MatrixType::Index rows = m.rows();
- typename MatrixType::Index cols = m.cols();
+ Index rows = m.rows();
+ Index cols = m.cols();
MatrixType x = MatrixType::Random(rows,cols), y = MatrixType::Random(rows,cols);
std::vector<MatrixType> v(10, MatrixType(rows,cols)), w(20, y);
v[5] = x;
@@ -131,7 +131,7 @@ void check_stdvector_quaternion(const QuaternionType&)
}
}
-void test_stdvector_overload()
+EIGEN_DECLARE_TEST(stdvector_overload)
{
// some non vectorizable fixed sizes
CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
diff --git a/test/superlu_support.cpp b/test/superlu_support.cpp
index 98a7bc5c8..55450c868 100644
--- a/test/superlu_support.cpp
+++ b/test/superlu_support.cpp
@@ -12,7 +12,7 @@
#include <Eigen/SuperLUSupport>
-void test_superlu_support()
+EIGEN_DECLARE_TEST(superlu_support)
{
SuperLU<SparseMatrix<double> > superlu_double_colmajor;
SuperLU<SparseMatrix<std::complex<double> > > superlu_cplxdouble_colmajor;
diff --git a/test/svd_common.h b/test/svd_common.h
index 605d5dfef..cba066593 100644
--- a/test/svd_common.h
+++ b/test/svd_common.h
@@ -23,7 +23,6 @@
template<typename SvdType, typename MatrixType>
void svd_check_full(const MatrixType& m, const SvdType& svd)
{
- typedef typename MatrixType::Index Index;
Index rows = m.rows();
Index cols = m.cols();
@@ -101,7 +100,6 @@ void svd_least_square(const MatrixType& m, unsigned int computationOptions)
{
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
- typedef typename MatrixType::Index Index;
Index rows = m.rows();
Index cols = m.cols();
@@ -168,7 +166,6 @@ template<typename MatrixType>
void svd_min_norm(const MatrixType& m, unsigned int computationOptions)
{
typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::Index Index;
Index cols = m.cols();
enum {
@@ -261,7 +258,6 @@ void svd_test_all_computation_options(const MatrixType& m, bool full_only)
CALL_SUBTEST(( svd_min_norm(m, ComputeThinU | ComputeThinV) ));
// test reconstruction
- typedef typename MatrixType::Index Index;
Index diagSize = (std::min)(m.rows(), m.cols());
SvdType svd(m, ComputeThinU | ComputeThinV);
VERIFY_IS_APPROX(m, svd.matrixU().leftCols(diagSize) * svd.singularValues().asDiagonal() * svd.matrixV().leftCols(diagSize).adjoint());
@@ -437,7 +433,6 @@ template<typename SvdType,typename MatrixType>
void svd_verify_assert(const MatrixType& m)
{
typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::Index Index;
Index rows = m.rows();
Index cols = m.cols();
diff --git a/test/svd_fill.h b/test/svd_fill.h
index 3877c0c7e..d68647e99 100644
--- a/test/svd_fill.h
+++ b/test/svd_fill.h
@@ -23,7 +23,6 @@ void svd_fill_random(MatrixType &m, int Option = 0)
using std::pow;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
- typedef typename MatrixType::Index Index;
Index diagSize = (std::min)(m.rows(), m.cols());
RealScalar s = std::numeric_limits<RealScalar>::max_exponent10/4;
s = internal::random<RealScalar>(1,s);
diff --git a/test/swap.cpp b/test/swap.cpp
index f76e3624d..5b259d3ec 100644
--- a/test/swap.cpp
+++ b/test/swap.cpp
@@ -28,8 +28,8 @@ template<typename MatrixType> void swap(const MatrixType& m)
typedef typename MatrixType::Scalar Scalar;
eigen_assert((!internal::is_same<MatrixType,OtherMatrixType>::value));
- typename MatrixType::Index rows = m.rows();
- typename MatrixType::Index cols = m.cols();
+ Index rows = m.rows();
+ Index cols = m.cols();
// construct 3 matrices guaranteed to be distinct
MatrixType m1 = MatrixType::Random(rows,cols);
@@ -83,7 +83,7 @@ template<typename MatrixType> void swap(const MatrixType& m)
}
}
-void test_swap()
+EIGEN_DECLARE_TEST(swap)
{
int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization
diff --git a/test/symbolic_index.cpp b/test/symbolic_index.cpp
index 1db85144b..cf89b9fbe 100644
--- a/test/symbolic_index.cpp
+++ b/test/symbolic_index.cpp
@@ -58,6 +58,15 @@ bool is_same_symb(const T1& a, const T2& b, Index size)
return a.eval(last=size-1) == b.eval(last=size-1);
}
+template<typename T>
+void check_is_symbolic(const T&) {
+ STATIC_CHECK(( Symbolic::is_symbolic<T>::value ))
+}
+
+template<typename T>
+void check_isnot_symbolic(const T&) {
+ STATIC_CHECK(( !Symbolic::is_symbolic<T>::value ))
+}
#define VERIFY_EQ_INT(A,B) VERIFY_IS_APPROX(int(A),int(B))
@@ -66,6 +75,13 @@ void check_symbolic_index()
using Eigen::placeholders::last;
using Eigen::placeholders::end;
+ check_is_symbolic(last);
+ check_is_symbolic(end);
+ check_is_symbolic(last+1);
+ check_is_symbolic(last-end);
+ check_is_symbolic(2*last-end/2);
+ check_isnot_symbolic(fix<3>());
+
Index size=100;
// First, let's check FixedInt arithmetic:
@@ -97,7 +113,7 @@ void check_symbolic_index()
#endif
}
-void test_symbolic_index()
+EIGEN_DECLARE_TEST(symbolic_index)
{
CALL_SUBTEST_1( check_symbolic_index() );
CALL_SUBTEST_2( check_symbolic_index() );
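The new check_is_symbolic/check_isnot_symbolic helpers assert which index expressions carry symbolic type information: last, end, and arithmetic combinations of them are symbolic, while fix<3>() is a plain compile-time integer. A small usage sketch of these placeholders in indexing (assuming the same Eigen::placeholders namespace the test imports):

    #include <Eigen/Dense>

    int main() {
      using Eigen::placeholders::last;  // symbolic: resolved against the size
      Eigen::VectorXd v(10);
      v.setLinSpaced(10, 0.0, 9.0);
      double tail = v(last);            // v(9)
      // Symbolic arithmetic: last - 1 is evaluated once the size is known.
      Eigen::VectorXd evens = v(Eigen::seq(0, last - 1, Eigen::fix<2>));
      return (tail == 9.0 && evens.size() == 5) ? 0 : 1;
    }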
diff --git a/test/triangular.cpp b/test/triangular.cpp
index b96856486..73b6fc46b 100644
--- a/test/triangular.cpp
+++ b/test/triangular.cpp
@@ -19,8 +19,8 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
RealScalar largerEps = 10*test_precision<RealScalar>();
- typename MatrixType::Index rows = m.rows();
- typename MatrixType::Index cols = m.cols();
+ Index rows = m.rows();
+ Index cols = m.cols();
MatrixType m1 = MatrixType::Random(rows, cols),
m2 = MatrixType::Random(rows, cols),
@@ -134,7 +134,6 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
template<typename MatrixType> void triangular_rect(const MatrixType& m)
{
- typedef const typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
@@ -221,7 +220,7 @@ void bug_159()
EIGEN_UNUSED_VARIABLE(m)
}
-void test_triangular()
+EIGEN_DECLARE_TEST(triangular)
{
int maxsize = (std::min)(EIGEN_TEST_MAX_SIZE,20);
for(int i = 0; i < g_repeat ; i++)
diff --git a/test/umeyama.cpp b/test/umeyama.cpp
index 2e8092434..1590a0a81 100644
--- a/test/umeyama.cpp
+++ b/test/umeyama.cpp
@@ -155,7 +155,7 @@ void run_fixed_size_test(int num_elements)
VERIFY(error < Scalar(16)*std::numeric_limits<Scalar>::epsilon());
}
-void test_umeyama()
+EIGEN_DECLARE_TEST(umeyama)
{
for (int i=0; i<g_repeat; ++i)
{
diff --git a/test/umfpack_support.cpp b/test/umfpack_support.cpp
index 37ab11f0b..d8f2a6f80 100644
--- a/test/umfpack_support.cpp
+++ b/test/umfpack_support.cpp
@@ -12,10 +12,10 @@
#include <Eigen/UmfPackSupport>
-template<typename T> void test_umfpack_support_T()
+template<typename T1, typename T2> void test_umfpack_support_T()
{
- UmfPackLU<SparseMatrix<T, ColMajor> > umfpack_colmajor;
- UmfPackLU<SparseMatrix<T, RowMajor> > umfpack_rowmajor;
+ UmfPackLU<SparseMatrix<T1, ColMajor, T2> > umfpack_colmajor;
+ UmfPackLU<SparseMatrix<T1, RowMajor, T2> > umfpack_rowmajor;
check_sparse_square_solving(umfpack_colmajor);
check_sparse_square_solving(umfpack_rowmajor);
@@ -24,9 +24,11 @@ template<typename T> void test_umfpack_support_T()
check_sparse_square_determinant(umfpack_rowmajor);
}
-void test_umfpack_support()
+EIGEN_DECLARE_TEST(umfpack_support)
{
- CALL_SUBTEST_1(test_umfpack_support_T<double>());
- CALL_SUBTEST_2(test_umfpack_support_T<std::complex<double> >());
+ CALL_SUBTEST_1((test_umfpack_support_T<double, int>()));
+ CALL_SUBTEST_2((test_umfpack_support_T<std::complex<double>, int>()));
+ CALL_SUBTEST_3((test_umfpack_support_T<double, long >()));
+ CALL_SUBTEST_4((test_umfpack_support_T<std::complex<double>, long>()));
}
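The umfpack test template now takes a second type parameter, the storage index of the SparseMatrix, so UmfPackLU is exercised with both int and long indices. A minimal sketch of the long-index instantiation (assuming UMFPACK is available):

    #include <Eigen/Sparse>
    #include <Eigen/UmfPackSupport>

    // SparseMatrix's third template argument is the storage index type.
    typedef Eigen::SparseMatrix<double, Eigen::ColMajor, long> SpMatLong;

    void solve_long_indices(const SpMatLong& A, const Eigen::VectorXd& b,
                            Eigen::VectorXd& x) {
      Eigen::UmfPackLU<SpMatLong> lu(A);  // factorization with 64-bit indices
      x = lu.solve(b);
    }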
diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp
index 731a08977..120cc42bb 100644
--- a/test/unalignedassert.cpp
+++ b/test/unalignedassert.cpp
@@ -174,7 +174,7 @@ void unalignedassert()
#endif
}
-void test_unalignedassert()
+EIGEN_DECLARE_TEST(unalignedassert)
{
CALL_SUBTEST(unalignedassert());
}
diff --git a/test/unalignedcount.cpp b/test/unalignedcount.cpp
index d6ffeafdf..069fc1bb9 100644
--- a/test/unalignedcount.cpp
+++ b/test/unalignedcount.cpp
@@ -28,7 +28,7 @@ static int nb_storeu;
#include "main.h"
-void test_unalignedcount()
+EIGEN_DECLARE_TEST(unalignedcount)
{
#if defined(EIGEN_VECTORIZE_AVX)
VectorXf a(40), b(40);
diff --git a/test/upperbidiagonalization.cpp b/test/upperbidiagonalization.cpp
index 847b34b55..945c99959 100644
--- a/test/upperbidiagonalization.cpp
+++ b/test/upperbidiagonalization.cpp
@@ -12,8 +12,8 @@
template<typename MatrixType> void upperbidiag(const MatrixType& m)
{
- const typename MatrixType::Index rows = m.rows();
- const typename MatrixType::Index cols = m.cols();
+ const Index rows = m.rows();
+ const Index cols = m.cols();
typedef Matrix<typename MatrixType::RealScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> RealMatrixType;
typedef Matrix<typename MatrixType::Scalar, MatrixType::ColsAtCompileTime, MatrixType::RowsAtCompileTime> TransposeMatrixType;
@@ -29,7 +29,7 @@ template<typename MatrixType> void upperbidiag(const MatrixType& m)
VERIFY_IS_APPROX(a.adjoint(),d);
}
-void test_upperbidiagonalization()
+EIGEN_DECLARE_TEST(upperbidiagonalization)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( upperbidiag(MatrixXf(3,3)) );
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index 37e7495f5..01b55e192 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -386,7 +386,7 @@ template<typename Scalar> struct vectorization_logic_half<Scalar,false>
static void run() {}
};
-void test_vectorization_logic()
+EIGEN_DECLARE_TEST(vectorization_logic)
{
#ifdef EIGEN_VECTORIZE
diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp
index f3ab561ee..2d7ddbed1 100644
--- a/test/vectorwiseop.cpp
+++ b/test/vectorwiseop.cpp
@@ -15,7 +15,6 @@
template<typename ArrayType> void vectorwiseop_array(const ArrayType& m)
{
- typedef typename ArrayType::Index Index;
typedef typename ArrayType::Scalar Scalar;
typedef Array<Scalar, ArrayType::RowsAtCompileTime, 1> ColVectorType;
typedef Array<Scalar, 1, ArrayType::ColsAtCompileTime> RowVectorType;
@@ -129,7 +128,6 @@ template<typename ArrayType> void vectorwiseop_array(const ArrayType& m)
template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVectorType;
@@ -239,7 +237,7 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) );
}
-void test_vectorwiseop()
+EIGEN_DECLARE_TEST(vectorwiseop)
{
CALL_SUBTEST_1( vectorwiseop_array(Array22cd()) );
CALL_SUBTEST_2( vectorwiseop_array(Array<double, 3, 2>()) );
diff --git a/test/visitor.cpp b/test/visitor.cpp
index 844170ec6..de938fc95 100644
--- a/test/visitor.cpp
+++ b/test/visitor.cpp
@@ -12,7 +12,6 @@
template<typename MatrixType> void matrixVisitor(const MatrixType& p)
{
typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::Index Index;
Index rows = p.rows();
Index cols = p.cols();
@@ -65,7 +64,6 @@ template<typename MatrixType> void matrixVisitor(const MatrixType& p)
template<typename VectorType> void vectorVisitor(const VectorType& w)
{
typedef typename VectorType::Scalar Scalar;
- typedef typename VectorType::Index Index;
Index size = w.size();
@@ -115,7 +113,7 @@ template<typename VectorType> void vectorVisitor(const VectorType& w)
VERIFY(eigen_maxidx == (std::min)(idx0,idx2));
}
-void test_visitor()
+EIGEN_DECLARE_TEST(visitor)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( matrixVisitor(Matrix<float, 1, 1>()) );
diff --git a/test/zerosized.cpp b/test/zerosized.cpp
index 477ff0070..edd1f6925 100644
--- a/test/zerosized.cpp
+++ b/test/zerosized.cpp
@@ -81,7 +81,7 @@ template<typename VectorType> void zeroSizedVector()
}
}
-void test_zerosized()
+EIGEN_DECLARE_TEST(zerosized)
{
zeroSizedMatrix<Matrix2d>();
zeroSizedMatrix<Matrix3i>();
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index d243fe035..47514703a 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t;
#endif
#ifdef EIGEN_USE_GPU
-#include <iostream>
-#include <cuda_runtime.h>
-#if __cplusplus >= 201103L
-#include <atomic>
-#include <unistd.h>
-#endif
+ #include <iostream>
+ #if defined(EIGEN_USE_HIP)
+ #include <hip/hip_runtime.h>
+ #else
+ #include <cuda_runtime.h>
+ #endif
+ #if __cplusplus >= 201103L
+ #include <atomic>
+ #include <unistd.h>
+ #endif
#endif
#include "src/Tensor/TensorMacros.h"
@@ -95,7 +99,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h"
-#include "src/Tensor/TensorDeviceCuda.h"
+#include "src/Tensor/TensorDeviceGpu.h"
#include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h"
@@ -108,18 +112,19 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorGlobalFunctions.h"
#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
-#include "src/Tensor/TensorReductionCuda.h"
+#include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
-#include "src/Tensor/TensorContractionCuda.h"
+#include "src/Tensor/TensorContractionGpu.h"
#include "src/Tensor/TensorConversion.h"
#include "src/Tensor/TensorConvolution.h"
#include "src/Tensor/TensorFFT.h"
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index c34614194..cbb3bbf2c 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -55,21 +55,8 @@
#include "src/ThreadPool/RunQueue.h"
#include "src/ThreadPool/ThreadPoolInterface.h"
#include "src/ThreadPool/ThreadEnvironment.h"
-#include "src/ThreadPool/SimpleThreadPool.h"
#include "src/ThreadPool/NonBlockingThreadPool.h"
-
-// Use the more efficient NonBlockingThreadPool by default.
-namespace Eigen {
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-} // namespace Eigen
-
#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
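With SimpleThreadPool and its EIGEN_USE_SIMPLE_THREAD_POOL switch removed, the non-blocking pool is the only implementation. Typical use with tensor expressions presumably stays the same (a sketch, assuming the Eigen::ThreadPool alias still resolves to the non-blocking pool after this change):

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::ThreadPool pool(4);                // now always the non-blocking pool
      Eigen::ThreadPoolDevice device(&pool, 4);
      Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
      a.setRandom();
      b.setRandom();
      c.device(device) = a + b;                 // evaluated on the pool's threads
      return 0;
    }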
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 30d553af7..dfd7ab7c7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -581,7 +581,7 @@ is not initialized.
Creates a tensor mapping an existing array of data. The data must not be freed
until the TensorMap is discarded, and the size of the data must be large enough
-to accomodate of the coefficients of the tensor.
+to accommodate the coefficients of the tensor.
float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);
@@ -1013,23 +1013,23 @@ multidimensional case.
Eigen::Tensor<int, 2> a(2, 3);
a.setValues({{1, 2, 3}, {6, 5, 4}});
Eigen::Tensor<int, 2> b(3, 2);
- a.setValues({{1, 2}, {4, 5}, {5, 6}});
+ b.setValues({{1, 2}, {4, 5}, {5, 6}});
// Compute the traditional matrix product
- Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair(1, 0) };
+ Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) };
Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
// Compute the product of the transpose of the matrices
- Eigen::array<Eigen::IndexPair<int>, 1> transpose_product_dims = { Eigen::IndexPair(0, 1) };
+ Eigen::array<Eigen::IndexPair<int>, 1> transposed_product_dims = { Eigen::IndexPair<int>(0, 1) };
Eigen::Tensor<int, 2> AtBt = a.contract(b, transposed_product_dims);
-
- // Contraction to scalar value using a ouble contraction
- // First coordinate of both tensors are contracted as well as both second coordinates
+
+ // Contraction to scalar value using a double contraction.
+ // The first coordinates of both tensors are contracted, as are both second coordinates; i.e., this computes the sum of the squares of the elements.
Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
- Eigen::Tensor<int, 0> AdoubleontractedA = a.contract(a, double_contraction_product_dims);
-
+ Eigen::Tensor<int, 0> AdoubleContractedA = a.contract(a, double_contraction_product_dims);
+
// Extracting the scalar value of the tensor contraction for further usage
- int value = AdoublecontractedA(0);
+ int value = AdoubleContractedA(0);
## Reduction Operations
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 1940a9692..e3f6e37f0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -48,7 +48,7 @@ namespace Eigen {
*
* <dl>
* <dt><b>Relation to other parts of Eigen:</b></dt>
- * <dd>The midterm developement goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+ * <dd>The midterm development goal for this class is to have a hierarchy similar to the one Eigen uses for matrices, so that
* taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
* by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
* class does not provide any of these features and is only available as a stand-alone class that just allows for
@@ -398,6 +398,21 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
}
+ #if EIGEN_HAS_RVALUE_REFERENCES
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor(Self&& other)
+ : Tensor()
+ {
+ m_storage.swap(other.m_storage);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor& operator=(Self&& other)
+ {
+ m_storage.swap(other.m_storage);
+ return *this;
+ }
+ #endif
+
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
{
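The new rvalue-reference constructor and assignment operator added in this hunk implement moves by swapping storage with an empty tensor, so no coefficients are copied. Usage (requires C++11, per the EIGEN_HAS_RVALUE_REFERENCES guard):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <utility>

    int main() {
      Eigen::Tensor<float, 2> a(1000, 1000);
      a.setRandom();
      Eigen::Tensor<float, 2> b(std::move(a));  // steals a's storage via swap
      Eigen::Tensor<float, 2> c;
      c = std::move(b);                         // same swap-based mechanism
      return 0;
    }

Because the move is a swap, the moved-from tensor ends up holding the target's previous (default-constructed, empty) storage rather than being left in an unspecified state.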
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 027305586..199ddb123 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType>
typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
+ static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
@@ -95,20 +97,33 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ static const int NumDims = XprType::NumDims;
enum {
- IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+ IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
+ TensorEvaluator<RightArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
+ TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
+ TensorEvaluator<RightArgType, Device>::BlockAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
};
+ typedef typename internal::TensorBlock<
+ typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
+ TensorBlock;
+
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT(
+ (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+ static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
}
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -164,6 +179,25 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_leftImpl.getResourceRequirements(resources);
+ m_rightImpl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
+ if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
+ m_leftImpl.data() != nullptr) {
+ TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
+ block->tensor_strides(), block->tensor_strides(),
+ m_leftImpl.data() + block->first_coeff_index());
+ m_rightImpl.block(&left_block);
+ } else {
+ m_rightImpl.block(block);
+ m_leftImpl.writeBlock(*block);
+ }
+ }
+
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
/// required by sycl in order to extract the accessor
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 1d459a3a0..97f90f638 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -20,7 +20,7 @@ namespace Eigen {
* \brief The tensor base class.
*
* This class is the common parent of the Tensor and TensorMap class, thus
- * making it possible to use either class interchangably in expressions.
+ * making it possible to use either class interchangeably in expressions.
*/
template<typename Derived>
@@ -133,6 +133,18 @@ class TensorBase<Derived, ReadOnlyAccessors>
return unaryExpr(internal::scalar_digamma_op<Scalar>());
}
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_i0e_op<Scalar>, const Derived>
+ i0e() const {
+ return unaryExpr(internal::scalar_i0e_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_i1e_op<Scalar>, const Derived>
+ i1e() const {
+ return unaryExpr(internal::scalar_i1e_op<Scalar>());
+ }
+
// igamma(a = this, x = other)
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
@@ -140,6 +152,20 @@ class TensorBase<Derived, ReadOnlyAccessors>
return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
}
+ // igamma_der_a(a = this, x = other)
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived>
+ igamma_der_a(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>());
+ }
+
+ // gamma_sample_der_alpha(alpha = this, sample = other)
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived>
+ gamma_sample_der_alpha(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>());
+ }
+
// igammac(a = this, x = other)
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
@@ -210,6 +236,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
+ clip(Scalar min, Scalar max) const {
+ return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
+ }
+
+ EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>
conjugate() const {
return unaryExpr(internal::scalar_conjugate_op<Scalar>());
@@ -485,9 +517,15 @@ class TensorBase<Derived, ReadOnlyAccessors>
typedef Eigen::IndexPair<Index> DimensionPair;
template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
+ const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>
contract(const OtherDerived& other, const Dimensions& dims) const {
- return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims);
+ return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims);
+ }
+
+ template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>
+ contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const {
+ return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel);
}
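As a usage sketch (hedged: `ReluKernel` is an illustrative functor, sketched later next to NoOpOutputKernel in TensorContraction.h; any type with that call-operator shape should work):

    Eigen::Tensor<float, 2> lhs(64, 32), rhs(32, 48);
    lhs.setRandom(); rhs.setRandom();
    Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
    Eigen::Tensor<float, 2> c0 = lhs.contract(rhs, dims);                // default NoOpOutputKernel
    Eigen::Tensor<float, 2> c1 = lhs.contract(rhs, dims, ReluKernel());  // fused epilogue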
// Convolutions.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
new file mode 100644
index 000000000..84cf6d216
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -0,0 +1,943 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Andy Davis <andydavis@google.com>
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+
+// Helper template to choose between ColMajor and RowMajor values.
+template <int Layout>
+struct cond;
+
+template <>
+struct cond<ColMajor> {
+ template <typename T>
+ EIGEN_STRONG_INLINE const T& operator()(const T& col,
+ const T& /*row*/) const {
+ return col;
+ }
+};
+
+template <>
+struct cond<RowMajor> {
+ template <typename T>
+ EIGEN_STRONG_INLINE const T& operator()(const T& /*col*/,
+ const T& row) const {
+ return row;
+ }
+};
+
+} // namespace
+
+/**
+ * \class TensorBlockShapeType
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block shape type.
+ *
+ * Tensor block shape type defines the shape preference for the blocks
+ * extracted from the larger tensor.
+ *
+ * Example:
+ *
+ * We want to extract blocks of 100 elements from the large 100x100 tensor:
+ * - tensor: 100x100
+ * - target_block_size: 100
+ *
+ * TensorBlockShapeType:
+ * - kUniformAllDims: 100 blocks of size 10x10
+ * - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100, depending on
+ *   column- or row-major layout)
+ */
+enum class TensorBlockShapeType {
+ kUniformAllDims,
+ kSkewedInnerDims,
+};
+
+struct TensorOpResourceRequirements {
+ TensorBlockShapeType block_shape;
+ Index block_total_size;
+ // TODO(andydavis) Add 'target_num_threads' to support communication of
+ // thread-resource requirements. This will allow ops deep in the
+ // expression tree (like reductions) to communicate resources
+ // requirements based on local state (like the total number of reductions
+ // to be computed).
+ TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
+ const Index size)
+ : block_shape(shape), block_total_size(size) {}
+};
+
+// Tries to merge multiple resource requirements.
+EIGEN_STRONG_INLINE void MergeResourceRequirements(
+ const std::vector<TensorOpResourceRequirements>& resources,
+ TensorBlockShapeType* block_shape, Index* block_total_size) {
+ if (resources.empty()) {
+ return;
+ }
+ // TODO(andydavis) Implement different policies (i.e. revert to a default
+ // policy if block shapes/sizes conflict).
+ *block_shape = resources[0].block_shape;
+ *block_total_size = resources[0].block_total_size;
+ for (size_t i = 1; i < resources.size(); ++i) {
+ if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims &&
+ *block_shape != TensorBlockShapeType::kSkewedInnerDims) {
+ *block_shape = TensorBlockShapeType::kSkewedInnerDims;
+ }
+ *block_total_size =
+ numext::maxi(*block_total_size, resources[i].block_total_size);
+ }
+}
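A minimal usage sketch (hedged; `evaluator` is an illustrative variable, and the explicit defaults matter because the function leaves its outputs untouched when `resources` is empty):

    std::vector<TensorOpResourceRequirements> resources;
    evaluator.getResourceRequirements(&resources);  // hypothetical evaluator
    TensorBlockShapeType block_shape = TensorBlockShapeType::kUniformAllDims;
    Index block_total_size = 1;
    MergeResourceRequirements(resources, &block_shape, &block_total_size);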
+
+/**
+ * \class TensorBlock
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block class.
+ *
+ * This class represents a tensor block specified by the index of the
+ * first block coefficient, and the size of the block in each dimension.
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlock {
+ public:
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+ TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes,
+ const Dimensions& block_strides, const Dimensions& tensor_strides,
+ Scalar* data)
+ : m_first_coeff_index(first_coeff_index),
+ m_block_sizes(block_sizes),
+ m_block_strides(block_strides),
+ m_tensor_strides(tensor_strides),
+ m_data(data) {}
+
+ StorageIndex first_coeff_index() const { return m_first_coeff_index; }
+
+ const Dimensions& block_sizes() const { return m_block_sizes; }
+
+ const Dimensions& block_strides() const { return m_block_strides; }
+
+ const Dimensions& tensor_strides() const { return m_tensor_strides; }
+
+ Scalar* data() { return m_data; }
+
+ const Scalar* data() const { return m_data; }
+
+ private:
+ StorageIndex m_first_coeff_index;
+ Dimensions m_block_sizes;
+ Dimensions m_block_strides;
+ Dimensions m_tensor_strides;
+ Scalar* m_data; // Not owned.
+};
+
+template <typename Scalar, typename StorageIndex>
+struct TensorBlockCopyOp {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
+ const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+ const StorageIndex src_index, const StorageIndex src_stride,
+ const Scalar* EIGEN_RESTRICT src_data) {
+ const Scalar* src_base = &src_data[src_index];
+ Scalar* dst_base = &dst_data[dst_index];
+
+ using Src = const Eigen::Array<Scalar, Dynamic, 1>;
+ using Dst = Eigen::Array<Scalar, Dynamic, 1>;
+
+ using SrcMap = Eigen::Map<Src, 0, InnerStride<>>;
+ using DstMap = Eigen::Map<Dst, 0, InnerStride<>>;
+
+ const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride));
+ DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride));
+
+ dst = src;
+ }
+};
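For example (a hand-built call, not taken from the patch), copying every other element of an 8-element array:

    float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float dst[4];
    TensorBlockCopyOp<float, Eigen::Index>::Run(
        /*num_coeff_to_copy=*/4, /*dst_index=*/0, /*dst_stride=*/1, dst,
        /*src_index=*/0, /*src_stride=*/2, src);
    // dst is now {0, 2, 4, 6}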
+
+/**
+ * \class TensorBlockIO
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block IO class.
+ *
+ * This class is responsible for copying data between a tensor and a tensor
+ * block.
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
+ bool BlockRead>
+class TensorBlockIO {
+ public:
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
+ TensorBlock;
+ typedef typename internal::TensorBlockCopyOp<Scalar, StorageIndex>
+ TensorBlockCopyOp;
+
+ protected:
+ struct BlockIteratorState {
+ StorageIndex input_stride;
+ StorageIndex output_stride;
+ StorageIndex input_span;
+ StorageIndex output_span;
+ StorageIndex size;
+ StorageIndex count;
+ };
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
+ const TensorBlock& block, StorageIndex first_coeff_index,
+ const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+ const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data,
+ Scalar* dst_data) {
+ // Find the innermost tensor dimension whose size is not 1. This is the
+ // effective inner dim. If all dimensions are of size 1, then fall back to
+ // using the actual innermost dim to avoid out-of-bound access.
+ StorageIndex num_size_one_inner_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
+ num_size_one_inner_dims = i;
+ break;
+ }
+ }
+ // Calculate strides and dimensions.
+ const StorageIndex tensor_stride1_dim = cond<Layout>()(
+ num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
+ const StorageIndex block_dim_for_tensor_stride1_dim =
+ NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
+ StorageIndex block_inner_dim_size =
+ NumDims == 0 ? 1
+ : block.block_sizes()[block_dim_for_tensor_stride1_dim];
+ for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ const StorageIndex block_stride =
+ block.block_strides()[tensor_to_block_dim_map[dim]];
+ if (block_inner_dim_size == block_stride &&
+ block_stride == tensor_strides[dim]) {
+ block_inner_dim_size *=
+ block.block_sizes()[tensor_to_block_dim_map[dim]];
+ ++num_size_one_inner_dims;
+ } else {
+ break;
+ }
+ }
+
+ StorageIndex inputIndex;
+ StorageIndex outputIndex;
+ StorageIndex input_stride;
+ StorageIndex output_stride;
+
+ // Setup strides to read/write along the tensor's stride1 dimension.
+ if (BlockRead) {
+ inputIndex = first_coeff_index;
+ outputIndex = 0;
+ input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
+ output_stride =
+ NumDims == 0
+ ? 1
+ : block.block_strides()[block_dim_for_tensor_stride1_dim];
+ } else {
+ inputIndex = 0;
+ outputIndex = first_coeff_index;
+ input_stride =
+ NumDims == 0
+ ? 1
+ : block.block_strides()[block_dim_for_tensor_stride1_dim];
+ output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
+ }
+
+ const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+ array<BlockIteratorState, at_least_1_dim> block_iter_state;
+
+ // Initialize block iterator state. Squeeze away any dimension of size 1.
+ int num_squeezed_dims = 0;
+ for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+ const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
+ const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
+ if (size == 1) {
+ continue;
+ }
+ block_iter_state[num_squeezed_dims].size = size;
+ if (BlockRead) {
+ block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
+ block_iter_state[num_squeezed_dims].output_stride =
+ block.block_strides()[tensor_to_block_dim_map[dim]];
+ } else {
+ block_iter_state[num_squeezed_dims].input_stride =
+ block.block_strides()[tensor_to_block_dim_map[dim]];
+ block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
+ }
+ block_iter_state[num_squeezed_dims].input_span =
+ block_iter_state[num_squeezed_dims].input_stride *
+ (block_iter_state[num_squeezed_dims].size - 1);
+ block_iter_state[num_squeezed_dims].output_span =
+ block_iter_state[num_squeezed_dims].output_stride *
+ (block_iter_state[num_squeezed_dims].size - 1);
+ block_iter_state[num_squeezed_dims].count = 0;
+ ++num_squeezed_dims;
+ }
+
+ // Iterate copying data from src to dst.
+ const StorageIndex block_total_size =
+ NumDims == 0 ? 1 : block.block_sizes().TotalSize();
+ for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
+ TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
+ dst_data, inputIndex, input_stride, src_data);
+ // Update index.
+ for (int j = 0; j < num_squeezed_dims; ++j) {
+ if (++block_iter_state[j].count < block_iter_state[j].size) {
+ inputIndex += block_iter_state[j].input_stride;
+ outputIndex += block_iter_state[j].output_stride;
+ break;
+ }
+ block_iter_state[j].count = 0;
+ inputIndex -= block_iter_state[j].input_span;
+ outputIndex -= block_iter_state[j].output_span;
+ }
+ }
+ }
+};
+
+/**
+ * \class TensorBlockReader
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block reader class.
+ *
+ * This class is responsible for reading a tensor block.
+ *
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
+ Layout, /*BlockRead=*/true> {
+ public:
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
+ TensorBlock;
+ typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true>
+ Base;
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ TensorBlock* block, const Scalar* src_data) {
+ array<StorageIndex, NumDims> tensor_to_block_dim_map;
+ for (int i = 0; i < NumDims; ++i) {
+ tensor_to_block_dim_map[i] = i;
+ }
+ Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
+ block->tensor_strides(), src_data, block->data());
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ TensorBlock* block, StorageIndex first_coeff_index,
+ const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+ const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
+ Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
+ tensor_strides, src_data, block->data());
+ }
+};
+
+/**
+ * \class TensorBlockWriter
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block writer class.
+ *
+ * This class is responsible for writing a tensor block.
+ *
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
+ Layout, /*BlockRead=*/false> {
+ public:
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
+ TensorBlock;
+ typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false>
+ Base;
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const TensorBlock& block, Scalar* dst_data) {
+ array<StorageIndex, NumDims> tensor_to_block_dim_map;
+ for (int i = 0; i < NumDims; ++i) {
+ tensor_to_block_dim_map[i] = i;
+ }
+ Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
+ block.tensor_strides(), block.data(), dst_data);
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const TensorBlock& block, StorageIndex first_coeff_index,
+ const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+ const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
+ Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
+ tensor_strides, block.data(), dst_data);
+ }
+};
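A hedged round-trip sketch (hand-picked sizes and strides for a 4x4 ColMajor tensor; assumes we are inside Eigen::internal): read the top-left 2x2 block into scratch storage, then write it back:

    typedef TensorBlock<float, Eigen::Index, 2, ColMajor> Block;
    float tensor[16] = {};  // 4x4 tensor, ColMajor
    float scratch[4];       // block-local storage
    DSizes<Eigen::Index, 2> sizes(2, 2), block_strides(1, 2), tensor_strides(1, 4);
    Block block(/*first_coeff_index=*/0, sizes, block_strides, tensor_strides, scratch);
    TensorBlockReader<float, Eigen::Index, 2, ColMajor>::Run(&block, tensor);
    TensorBlockWriter<float, Eigen::Index, 2, ColMajor>::Run(block, tensor);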
+
+/**
+ * \class TensorBlockCwiseBinaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Carries out a cwise binary op on a number of coefficients.
+ *
+ * This class reads strided inputs from left and right operands, and writes the
+ * result of the cwise binary op to the strided output array.
+ *
+ */
+struct TensorBlockCwiseBinaryOp {
+ template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
+ typename LeftScalar, typename RightScalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const BinaryFunctor& functor, const StorageIndex num_coeff,
+ const StorageIndex output_index, const StorageIndex output_stride,
+ OutputScalar* output_data, const StorageIndex left_index,
+ const StorageIndex left_stride, const LeftScalar* left_data,
+ const StorageIndex right_index, const StorageIndex right_stride,
+ const RightScalar* right_data) {
+ using Lhs = const Eigen::Array<LeftScalar, Dynamic, 1>;
+ using Rhs = const Eigen::Array<RightScalar, Dynamic, 1>;
+ using Out = Eigen::Array<OutputScalar, Dynamic, 1>;
+
+ using LhsMap = Eigen::Map<Lhs, 0, InnerStride<>>;
+ using RhsMap = Eigen::Map<Rhs, 0, InnerStride<>>;
+ using OutMap = Eigen::Map<Out, 0, InnerStride<>>;
+
+ const LeftScalar* lhs_base = &left_data[left_index];
+ const RightScalar* rhs_base = &right_data[right_index];
+ OutputScalar* out_base = &output_data[output_index];
+
+ const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
+ const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
+ OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
+
+ out =
+ Eigen::CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
+ }
+};
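For instance (hedged, hand-built operands), summing two contiguous 4-element arrays:

    float lhs[4] = {1, 2, 3, 4}, rhs[4] = {10, 20, 30, 40}, out[4];
    TensorBlockCwiseBinaryOp::Run(
        scalar_sum_op<float>(), /*num_coeff=*/Eigen::Index(4),
        /*output_index=*/Eigen::Index(0), /*output_stride=*/Eigen::Index(1), out,
        /*left_index=*/Eigen::Index(0), /*left_stride=*/Eigen::Index(1), lhs,
        /*right_index=*/Eigen::Index(0), /*right_stride=*/Eigen::Index(1), rhs);
    // out is now {11, 22, 33, 44}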
+
+/**
+ * \class TensorBlockCwiseBinaryIO
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block IO class for carrying out cwise binary ops.
+ *
+ * This class carries out the binary op on given blocks.
+ *
+ */
+template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
+ int NumDims, int Layout>
+struct TensorBlockCwiseBinaryIO {
+ typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims,
+ Layout>::Dimensions Dimensions;
+
+ struct BlockIteratorState {
+ StorageIndex output_stride, output_span;
+ StorageIndex left_stride, left_span;
+ StorageIndex right_stride, right_span;
+ StorageIndex size, count;
+ };
+
+ template <typename LeftScalar, typename RightScalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const BinaryFunctor& functor, const Dimensions& block_sizes,
+ const Dimensions& block_strides, OutputScalar* output_data,
+ const array<StorageIndex, NumDims>& left_strides,
+ const LeftScalar* left_data,
+ const array<StorageIndex, NumDims>& right_strides,
+ const RightScalar* right_data) {
+ // Find the innermost dimension whose size is not 1. This is the effective
+ // inner dim. If all dimensions are of size 1, fall back to using the actual
+ // innermost dim to avoid out-of-bound access.
+ int num_size_one_inner_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ if (block_sizes[dim] != 1) {
+ num_size_one_inner_dims = i;
+ break;
+ }
+ }
+ // Calculate strides and dimensions.
+ const int inner_dim =
+ NumDims == 0 ? 1
+ : cond<Layout>()(num_size_one_inner_dims,
+ NumDims - num_size_one_inner_dims - 1);
+ StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
+ for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ // Merge multiple inner dims into one for larger inner dim size (i.e.
+ // fewer calls to TensorBlockCwiseBinaryOp::Run()).
+ if (inner_dim_size == block_strides[dim] &&
+ block_strides[dim] == left_strides[dim] &&
+ block_strides[dim] == right_strides[dim]) {
+ inner_dim_size *= block_sizes[dim];
+ ++num_size_one_inner_dims;
+ } else {
+ break;
+ }
+ }
+
+ StorageIndex output_index = 0, left_index = 0, right_index = 0;
+ const StorageIndex output_stride =
+ NumDims == 0 ? 1 : block_strides[inner_dim];
+ const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
+ const StorageIndex right_stride =
+ NumDims == 0 ? 1 : right_strides[inner_dim];
+
+ const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+ array<BlockIteratorState, at_least_1_dim> block_iter_state;
+
+ // Initialize block iterator state. Squeeze away any dimension of size 1.
+ int num_squeezed_dims = 0;
+ for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+ const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
+ const StorageIndex size = block_sizes[dim];
+ if (size == 1) {
+ continue;
+ }
+ auto& state = block_iter_state[num_squeezed_dims];
+ state.output_stride = block_strides[dim];
+ state.left_stride = left_strides[dim];
+ state.right_stride = right_strides[dim];
+ state.size = size;
+ state.output_span = state.output_stride * (size - 1);
+ state.left_span = state.left_stride * (size - 1);
+ state.right_span = state.right_stride * (size - 1);
+ state.count = 0;
+ ++num_squeezed_dims;
+ }
+
+ // Compute cwise binary op.
+ const StorageIndex block_total_size =
+ NumDims == 0 ? 1 : block_sizes.TotalSize();
+ for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
+ TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
+ output_stride, output_data, left_index,
+ left_stride, left_data, right_index,
+ right_stride, right_data);
+ // Update index.
+ for (int j = 0; j < num_squeezed_dims; ++j) {
+ auto& state = block_iter_state[j];
+ if (++state.count < state.size) {
+ output_index += state.output_stride;
+ left_index += state.left_stride;
+ right_index += state.right_stride;
+ break;
+ }
+ state.count = 0;
+ output_index -= state.output_span;
+ left_index -= state.left_span;
+ right_index -= state.right_span;
+ }
+ }
+ }
+};
+
+/**
+ * \class TensorBlockView
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Read-only view into a block of data.
+ *
+ * This class provides read-only access to a block of data in impl. It may need
+ * to allocate space for holding the intermediate result.
+ *
+ */
+template <class ArgType, class Device>
+struct TensorBlockView {
+ typedef TensorEvaluator<ArgType, Device> Impl;
+ typedef typename Impl::Index StorageIndex;
+ typedef typename remove_const<typename Impl::Scalar>::type Scalar;
+ static const int NumDims = array_size<typename Impl::Dimensions>::value;
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+ // Constructs a TensorBlockView for `impl`. `block` is only used for
+ // specifying the start offset, shape, and strides of the block.
+ template <typename OtherTensorBlock>
+ TensorBlockView(const Device& device,
+ const TensorEvaluator<ArgType, Device>& impl,
+ const OtherTensorBlock& block)
+ : m_device(device),
+ m_block_sizes(block.block_sizes()),
+ m_data(NULL),
+ m_allocated_data(NULL) {
+ if (Impl::RawAccess && impl.data() != NULL) {
+ m_data = impl.data() + block.first_coeff_index();
+ m_block_strides = block.tensor_strides();
+ } else {
+ // Actually make a copy.
+
+ // TODO(wuke): This sometimes puts a lot of pressure on the heap allocator.
+ // Consider allowing ops to request additional temporary block memory in
+ // TensorOpResourceRequirements.
+ m_allocated_data = static_cast<Scalar*>(
+ m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
+ m_data = m_allocated_data;
+ if (NumDims > 0) {
+ if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
+ m_block_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
+ }
+ } else {
+ m_block_strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
+ }
+ }
+ }
+ TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
+ block.first_coeff_index(), m_block_sizes, m_block_strides,
+ block.tensor_strides(), m_allocated_data);
+ impl.block(&input_block);
+ }
+ }
+
+ ~TensorBlockView() {
+ if (m_allocated_data != NULL) {
+ m_device.deallocate(m_allocated_data);
+ }
+ }
+
+ const Dimensions& block_sizes() const { return m_block_sizes; }
+ const Dimensions& block_strides() const { return m_block_strides; }
+ const Scalar* data() const { return m_data; }
+
+ private:
+ const Device& m_device;
+ Dimensions m_block_sizes, m_block_strides;
+ const Scalar* m_data; // Not owned.
+ Scalar* m_allocated_data; // Owned.
+};
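A hedged sketch of the intended use inside an evaluator's block method (`m_device`, `m_argImpl`, and `output_block` are illustrative names, not part of the patch):

    TensorBlockView<ArgType, Device> view(m_device, m_argImpl, *output_block);
    const Scalar* arg_data = view.data();  // impl's buffer, or an owned copy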
+
+/**
+ * \class TensorBlockMapper
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block mapper class.
+ *
+ * This class is responsible for iterating over the blocks of a tensor.
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockMapper {
+ public:
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
+ TensorBlock;
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+ TensorBlockMapper(const Dimensions& dims,
+ const TensorBlockShapeType block_shape,
+ Index min_target_size)
+ : m_dimensions(dims),
+ m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) {
+ // Calculate block counts by dimension and total block count.
+ DSizes<StorageIndex, NumDims> block_count;
+ for (Index i = 0; i < block_count.rank(); ++i) {
+ block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]);
+ }
+ m_total_block_count = array_prod(block_count);
+
+ // Calculate block strides (used for enumerating blocks).
+ if (NumDims > 0) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_block_strides[0] = 1;
+ m_tensor_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
+ m_tensor_strides[i] = m_tensor_strides[i - 1] * m_dimensions[i - 1];
+ }
+ } else {
+ m_block_strides[NumDims - 1] = 1;
+ m_tensor_strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
+ m_tensor_strides[i] = m_tensor_strides[i + 1] * m_dimensions[i + 1];
+ }
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+ StorageIndex first_coeff_index = 0;
+ DSizes<StorageIndex, NumDims> coords;
+ DSizes<StorageIndex, NumDims> sizes;
+ DSizes<StorageIndex, NumDims> strides;
+ if (NumDims > 0) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const StorageIndex idx = block_index / m_block_strides[i];
+ coords[i] = idx * m_block_dim_sizes[i];
+ sizes[i] =
+ numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
+ block_index -= idx * m_block_strides[i];
+ first_coeff_index += coords[i] * m_tensor_strides[i];
+ }
+ coords[0] = block_index * m_block_dim_sizes[0];
+ sizes[0] =
+ numext::mini((m_dimensions[0] - coords[0]), m_block_dim_sizes[0]);
+ first_coeff_index += coords[0] * m_tensor_strides[0];
+
+ strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ strides[i] = strides[i - 1] * sizes[i - 1];
+ }
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const StorageIndex idx = block_index / m_block_strides[i];
+ coords[i] = idx * m_block_dim_sizes[i];
+ sizes[i] =
+ numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
+ block_index -= idx * m_block_strides[i];
+ first_coeff_index += coords[i] * m_tensor_strides[i];
+ }
+ coords[NumDims - 1] = block_index * m_block_dim_sizes[NumDims - 1];
+ sizes[NumDims - 1] =
+ numext::mini((m_dimensions[NumDims - 1] - coords[NumDims - 1]),
+ m_block_dim_sizes[NumDims - 1]);
+ first_coeff_index +=
+ coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
+
+ strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ strides[i] = strides[i + 1] * sizes[i + 1];
+ }
+ }
+ }
+
+ return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides,
+ data);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
+ return m_total_block_count;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex
+ block_dims_total_size() const {
+ return m_block_dim_sizes.TotalSize();
+ }
+
+ private:
+ static Dimensions BlockDimensions(const Dimensions& tensor_dims,
+ const TensorBlockShapeType block_shape,
+ Index min_target_size) {
+ min_target_size = numext::maxi<Index>(1, min_target_size);
+
+ // If the tensor fully fits into the target size, we'll treat it as a single block.
+ Dimensions block_dim_sizes = tensor_dims;
+
+ if (tensor_dims.TotalSize() == 0) {
+ // Corner case: one of the dimensions is zero. The logic below is too
+ // complex to handle this case in general, so just use a unit block size.
+ // Note: we must not yield blocks with zero dimensions (recipe for
+ // overflows/underflows, divisions by zero and NaNs later).
+ for (int i = 0; i < NumDims; ++i) {
+ block_dim_sizes[i] = 1;
+ }
+ } else if (block_dim_sizes.TotalSize() > min_target_size) {
+ if (block_shape == TensorBlockShapeType::kUniformAllDims) {
+ // The tensor will not fit within the 'min_target_size' budget: calculate
+ // tensor block dimension sizes based on a "square" dimension size target.
+ const size_t dim_size_target = static_cast<size_t>(
+ std::pow(static_cast<float>(min_target_size),
+ 1.0 / static_cast<float>(block_dim_sizes.rank())));
+ for (size_t i = 0; i < block_dim_sizes.rank(); ++i) {
+ // TODO(andydavis) Adjust the innermost 'block_dim_size' to make it
+ // a multiple of the packet size. Note that reducing
+ // 'block_dim_size' in this manner can increase the number of
+ // blocks, and so will amplify any per-block overhead.
+ block_dim_sizes[i] = numext::mini(
+ dim_size_target, static_cast<size_t>(tensor_dims[i]));
+ }
+ // Add any unallocated coefficients to the inner dimension(s).
+ StorageIndex total_size = block_dim_sizes.TotalSize();
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ if (block_dim_sizes[dim] < tensor_dims[dim]) {
+ const StorageIndex total_size_other_dims =
+ total_size / block_dim_sizes[dim];
+ const StorageIndex alloc_avail =
+ divup<StorageIndex>(min_target_size, total_size_other_dims);
+ if (alloc_avail == block_dim_sizes[dim]) {
+ // Insufficient excess coefficients to allocate.
+ break;
+ }
+ block_dim_sizes[dim] = numext::mini(tensor_dims[dim], alloc_avail);
+ total_size = total_size_other_dims * block_dim_sizes[dim];
+ }
+ }
+ } else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) {
+ StorageIndex coeff_to_allocate = min_target_size;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ block_dim_sizes[dim] =
+ numext::mini(coeff_to_allocate, tensor_dims[dim]);
+ coeff_to_allocate = divup(
+ coeff_to_allocate,
+ numext::maxi(static_cast<StorageIndex>(1), block_dim_sizes[dim]));
+ }
+ eigen_assert(coeff_to_allocate == 1);
+ } else {
+ eigen_assert(false);  // unknown block shape type
+ }
+ }
+
+ eigen_assert(
+ block_dim_sizes.TotalSize() >=
+ numext::mini<size_t>(min_target_size, tensor_dims.TotalSize()));
+
+ return block_dim_sizes;
+ }
+
+ Dimensions m_dimensions;
+ Dimensions m_block_dim_sizes;
+ Dimensions m_block_strides;
+ Dimensions m_tensor_strides;
+ StorageIndex m_total_block_count;
+};
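A hedged enumeration sketch for a rank-2 float tensor (the `evaluator` call is illustrative):

    typedef TensorBlockMapper<float, Eigen::Index, 2, ColMajor> BlockMapper;
    DSizes<Eigen::Index, 2> dims(100, 100);
    BlockMapper mapper(dims, TensorBlockShapeType::kUniformAllDims,
                       /*min_target_size=*/100);  // yields roughly 10x10 blocks
    std::vector<float> scratch(mapper.block_dims_total_size());
    for (Eigen::Index i = 0; i < mapper.total_block_count(); ++i) {
      BlockMapper::TensorBlock block = mapper.GetBlockForIndex(i, scratch.data());
      // evaluator.evalBlock(&block);  // hypothetical consumer
    }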
+
+/**
+ * \class TensorSliceBlockMapper
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor slice block mapper class.
+ *
+ * This class is responsible for iterating over the blocks of
+ * a slice of a tensor. It supports shuffling of the block strides
+ * for callers that want reduced strides for dimensions that are
+ * processed together.
+ *
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorSliceBlockMapper {
+ public:
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
+ TensorBlock;
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+ TensorSliceBlockMapper(const Dimensions& tensor_dims,
+ const Dimensions& tensor_slice_offsets,
+ const Dimensions& tensor_slice_extents,
+ const Dimensions& block_dim_sizes,
+ const Dimensions& block_stride_order)
+ : m_tensor_dimensions(tensor_dims),
+ m_tensor_slice_offsets(tensor_slice_offsets),
+ m_tensor_slice_extents(tensor_slice_extents),
+ m_block_dim_sizes(block_dim_sizes),
+ m_block_stride_order(block_stride_order),
+ m_total_block_count(1) {
+ // Calculate block counts by dimension and total block count.
+ DSizes<StorageIndex, NumDims> block_count;
+ for (size_t i = 0; i < block_count.rank(); ++i) {
+ block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
+ }
+ m_total_block_count = array_prod(block_count);
+
+ // Calculate block strides (used for enumerating blocks).
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_block_strides[0] = 1;
+ m_tensor_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
+ m_tensor_strides[i] =
+ m_tensor_strides[i - 1] * m_tensor_dimensions[i - 1];
+ }
+ } else {
+ m_block_strides[NumDims - 1] = 1;
+ m_tensor_strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
+ m_tensor_strides[i] =
+ m_tensor_strides[i + 1] * m_tensor_dimensions[i + 1];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+ StorageIndex first_coeff_index = 0;
+ DSizes<StorageIndex, NumDims> coords;
+ DSizes<StorageIndex, NumDims> sizes;
+ DSizes<StorageIndex, NumDims> strides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = block_index / m_block_strides[i];
+ coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
+ sizes[i] = numext::mini(
+ m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
+ m_block_dim_sizes[i]);
+ block_index -= idx * m_block_strides[i];
+ first_coeff_index += coords[i] * m_tensor_strides[i];
+ }
+ coords[0] =
+ m_tensor_slice_offsets[0] + block_index * m_block_dim_sizes[0];
+ sizes[0] = numext::mini(
+ m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0],
+ m_block_dim_sizes[0]);
+ first_coeff_index += coords[0] * m_tensor_strides[0];
+
+ StorageIndex prev_dim = m_block_stride_order[0];
+ strides[prev_dim] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ const StorageIndex curr_dim = m_block_stride_order[i];
+ strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
+ prev_dim = curr_dim;
+ }
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const StorageIndex idx = block_index / m_block_strides[i];
+ coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
+ sizes[i] = numext::mini(
+ m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
+ m_block_dim_sizes[i]);
+ block_index -= idx * m_block_strides[i];
+ first_coeff_index += coords[i] * m_tensor_strides[i];
+ }
+ coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] +
+ block_index * m_block_dim_sizes[NumDims - 1];
+ sizes[NumDims - 1] = numext::mini(
+ m_tensor_slice_offsets[NumDims - 1] +
+ m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1],
+ m_block_dim_sizes[NumDims - 1]);
+ first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
+
+ StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
+ strides[prev_dim] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ const StorageIndex curr_dim = m_block_stride_order[i];
+ strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
+ prev_dim = curr_dim;
+ }
+ }
+
+ return TensorBlock(first_coeff_index, sizes, strides, m_tensor_strides,
+ data);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
+ return m_total_block_count;
+ }
+
+ private:
+ Dimensions m_tensor_dimensions;
+ Dimensions m_tensor_slice_offsets;
+ Dimensions m_tensor_slice_extents;
+ Dimensions m_tensor_strides;
+ Dimensions m_block_dim_sizes;
+ Dimensions m_block_stride_order;
+ Dimensions m_block_strides;
+ StorageIndex m_total_block_count;
+};
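A hedged construction sketch (hand-picked dimensions; a `stride_order` of (0, 1) keeps the natural ColMajor order):

    typedef TensorSliceBlockMapper<float, Eigen::Index, 2, ColMajor> SliceMapper;
    DSizes<Eigen::Index, 2> tensor_dims(100, 100);
    DSizes<Eigen::Index, 2> offsets(10, 20), extents(40, 60);  // the slice
    DSizes<Eigen::Index, 2> block_sizes(40, 16), stride_order(0, 1);
    SliceMapper mapper(tensor_dims, offsets, extents, block_sizes, stride_order);
    // total_block_count() == 4: one 40x16 (or narrower) block per column panel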
+
+} // namespace internal
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index b6c93aff9..8fecbe657 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -105,10 +105,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ bool isCopy = false, nByOne = false, oneByN = false;
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
@@ -121,10 +123,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// tensor with N >= 1 of 1 element first and then broadcast.
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
const InputDimensions& input_dims = m_impl.dimensions();
- const Broadcast& broadcast = op.broadcast();
+ isCopy = true;
for (int i = 0; i < NumDims; ++i) {
eigen_assert(input_dims[i] > 0);
- m_dimensions[i] = input_dims[i] * broadcast[i];
+ m_dimensions[i] = input_dims[i] * m_broadcast[i];
+ if (m_broadcast[i] != 1) {
+ isCopy = false;
+ }
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -142,6 +147,40 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
}
}
+
+ if (input_dims[0] == 1) {
+ oneByN = true;
+ for (int i = 1; i < NumDims; ++i) {
+ if (m_broadcast[i] != 1) {
+ oneByN = false;
+ break;
+ }
+ }
+ } else if (input_dims[NumDims-1] == 1) {
+ nByOne = true;
+ for (int i = 0; i < NumDims-1; ++i) {
+ if (m_broadcast[i] != 1) {
+ nByOne = false;
+ break;
+ }
+ }
+ }
+
+ // Handle special formats like NCHW: the input shape is '[1, N..., 1]' and
+ // the broadcast shape is '[N, 1..., N]'.
+ if (!oneByN && !nByOne) {
+ if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
+ nByOne = true;
+ oneByN = true;
+ for (int i = 1; i < NumDims-1; ++i) {
+ if (m_broadcast[i] != 1) {
+ nByOne = false;
+ oneByN = false;
+ break;
+ }
+ }
+ }
+ }
}
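For example (illustrative shapes, not taken from the patch):

    // input [1, 4],    broadcast [8, 1]    -> oneByN (replicated along dim 0 only)
    // input [4, 1],    broadcast [1, 8]    -> nByOne (replicated along the last dim only)
    // input [1, 4, 1], broadcast [8, 1, 8] -> oneByN && nByOne (the NCHW-like case)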
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -162,9 +201,17 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- return coeffColMajor(index);
+ if (isCopy) {
+ return m_impl.coeff(index);
+ } else {
+ return coeffColMajor(index);
+ }
} else {
- return coeffRowMajor(index);
+ if (isCopy) {
+ return m_impl.coeff(index);
+ } else {
+ return coeffRowMajor(index);
+ }
}
}
@@ -237,9 +284,145 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- return packetColMajor<LoadMode>(index);
+ if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speedup by enforcing
+ // unaligned loads here. The reason is unclear though.
+ return m_impl.template packet<Unaligned>(index);
+ #else
+ return m_impl.template packet<LoadMode>(index);
+ #endif
+ } else if (oneByN && !nByOne) {
+ return packetNByOne<LoadMode>(index);
+ } else if (!oneByN && nByOne) {
+ return packetOneByN<LoadMode>(index);
+ } else if (oneByN && nByOne) {
+ return packetOneByNByOne<LoadMode>(index);
+ } else {
+ return packetColMajor<LoadMode>(index);
+ }
+ } else {
+ if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See above.
+ return m_impl.template packet<Unaligned>(index);
+ #else
+ return m_impl.template packet<LoadMode>(index);
+ #endif
+ } else if (oneByN && !nByOne) {
+ return packetOneByN<LoadMode>(index);
+ } else if (!oneByN && nByOne) {
+ return packetNByOne<LoadMode>(index);
+ } else if (oneByN && nByOne) {
+ return packetOneByNByOne<LoadMode>(index);
+ } else {
+ return packetRowMajor<LoadMode>(index);
+ }
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
+ (Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ Index startDim, endDim;
+ Index inputIndex, outputOffset, batchedIndex;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ startDim = NumDims - 1;
+ endDim = 1;
+ } else {
+ startDim = 0;
+ endDim = NumDims - 2;
+ }
+
+ batchedIndex = index % m_outputStrides[startDim];
+ inputIndex = batchedIndex / m_outputStrides[endDim];
+ outputOffset = batchedIndex % m_outputStrides[endDim];
+
+ if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
+ values[0] = m_impl.coeff(inputIndex);
+ return internal::pload1<PacketReturnType>(values);
+ } else {
+ for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+ if (outputOffset + cur < m_outputStrides[endDim]) {
+ values[i] = m_impl.coeff(inputIndex);
+ } else {
+ ++inputIndex;
+ inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
+ values[i] = m_impl.coeff(inputIndex);
+ outputOffset = 0;
+ cur = 0;
+ }
+ }
+ return internal::pload<PacketReturnType>(values);
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ Index dim, inputIndex;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ dim = NumDims - 1;
+ } else {
+ dim = 0;
+ }
+
+ inputIndex = index % m_inputStrides[dim];
+ if (inputIndex + PacketSize <= m_inputStrides[dim]) {
+ return m_impl.template packet<Unaligned>(inputIndex);
+ } else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ if (inputIndex > m_inputStrides[dim]-1) {
+ inputIndex = 0;
+ }
+ values[i] = m_impl.coeff(inputIndex++);
+ }
+ return internal::pload<PacketReturnType>(values);
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ Index dim, inputIndex, outputOffset;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ dim = 1;
+ } else {
+ dim = NumDims - 2;
+ }
+
+ inputIndex = index / m_outputStrides[dim];
+ outputOffset = index % m_outputStrides[dim];
+ if (outputOffset + PacketSize <= m_outputStrides[dim]) {
+ values[0] = m_impl.coeff(inputIndex);
+ return internal::pload1<PacketReturnType>(values);
} else {
- return packetRowMajor<LoadMode>(index);
+ for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+ if (outputOffset + cur < m_outputStrides[dim]) {
+ values[i] = m_impl.coeff(inputIndex);
+ } else {
+ values[i] = m_impl.coeff(++inputIndex);
+ outputOffset = 0;
+ cur = 0;
+ }
+ }
+ return internal::pload<PacketReturnType>(values);
}
}
@@ -290,7 +473,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
for (int i = 1; i < PacketSize; ++i) {
- values[i] = coeffColMajor(originalIndex+i);
+ if (innermostLoc + i < m_impl.dimensions()[0]) {
+ values[i] = m_impl.coeff(inputIndex+i);
+ } else {
+ values[i] = coeffColMajor(originalIndex+i);
+ }
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
@@ -342,7 +529,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
for (int i = 1; i < PacketSize; ++i) {
- values[i] = coeffRowMajor(originalIndex+i);
+ if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
+ values[i] = m_impl.coeff(inputIndex+i);
+ } else {
+ values[i] = coeffRowMajor(originalIndex+i);
+ }
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
@@ -352,7 +543,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
double compute_cost = TensorOpCost::AddCost<Index>();
- if (NumDims > 0) {
+ if (!isCopy && NumDims > 0) {
for (int i = NumDims - 1; i > 0; --i) {
compute_cost += TensorOpCost::DivCost<Index>();
if (internal::index_statically_eq<Broadcast>(i, 1)) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 21ffa2872..085c05f3d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -146,6 +146,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
// slice offsets.
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -343,6 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index a7c1380b8..9f0321880 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -122,6 +122,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
@@ -306,6 +307,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index e72ddb4a9..86602c27e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -85,8 +85,8 @@ template<typename LhsScalar, typename RhsScalar, typename Scalar>
#endif
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
@@ -112,23 +112,24 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
};
};
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense>
{
- typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
+ typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type;
};
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >::type>
{
- typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
+ typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type;
};
-template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
-struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
+template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_, typename Device_>
+struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_> > {
typedef Indices_ Indices;
typedef LeftArgType_ LeftArgType;
typedef RightArgType_ RightArgType;
+ typedef OutputKernelType_ OutputKernelType;
typedef Device_ Device;
// From NumDims below.
@@ -137,8 +138,52 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
} // end namespace internal
-template<typename Indices, typename LhsXprType, typename RhsXprType>
-class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+// Tensor contraction params that make it possible to map the output matrix's
+// 2-dimensional coordinates to the output tensor dimensions.
+struct TensorContractionParams {
+ // TensorContraction evaluator assumes that both tensors are in ColMajor
+ // layout; if the tensors are in RowMajor, the evaluator swaps lhs with rhs.
+ bool swapped_arguments;
+};
+
+// An output kernel allows fusing operations into the tensor contraction.
+//
+// Examples:
+// 1. Elementwise Relu transformation following Conv2D.
+// 2. AddBias to the Conv2D output channels dimension.
+//
+// See expected implementation in NoOpOutputKernel.
+struct OutputKernel {
+ template <typename Index, typename Scalar>
+ using OutputMapper = internal::blas_data_mapper<Scalar, Index, ColMajor>;
+};
+
+// Output kernel that does absolutely nothing.
+struct NoOpOutputKernel {
+ /**
+ * The tensor contraction evaluator calls this kernel after finishing each
+ * block of the output matrix. Output blocks belong to the 2-dimensional
+ * output tensor.
+ *
+ * TensorContractionParams contains the contraction dimension information
+ * required to map the 2-d output space into the expected output tensor
+ * space (which is potentially higher dimensional).
+ *
+ * \param[in] output_mapper Access to output tensor memory
+ * \param[in] params Tensor contraction parameters
+ * \param[in] i Index of the first row available through output_mapper
+ * \param[in] j Index of the first column available through output_mapper
+ * \param[in] num_rows Number of available rows
+ * \param[in] num_cols Number of available columns
+ */
+ template <typename Index, typename Scalar>
+ EIGEN_ALWAYS_INLINE void operator()(
+ const OutputKernel::OutputMapper<Index, Scalar>& /*output_mapper*/,
+ const TensorContractionParams& /*params*/, Index /*i*/,
+ Index /*j*/, Index /*num_rows*/, Index /*num_cols*/) const {}
+};
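As a hedged illustration (not part of this patch), a custom kernel that fuses a ReLU over each finished output block could look like:

    struct ReluKernel {
      template <typename Index, typename Scalar>
      EIGEN_ALWAYS_INLINE void operator()(
          const OutputKernel::OutputMapper<Index, Scalar>& output_mapper,
          const TensorContractionParams& /*params*/, Index /*i*/, Index /*j*/,
          Index num_rows, Index num_cols) const {
        for (Index j = 0; j < num_cols; ++j)
          for (Index i = 0; i < num_rows; ++i)
            output_mapper(i, j) = numext::maxi(output_mapper(i, j), Scalar(0));
      }
    };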
+
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel>
+class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
@@ -149,8 +194,10 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
- const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
- : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
+ const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims,
+ const OutputKernelType& output_kernel = OutputKernelType())
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims),
+ m_output_kernel(output_kernel) {}
EIGEN_DEVICE_FUNC
const Indices& indices() const { return m_indices; }
@@ -164,10 +211,14 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
+ EIGEN_DEVICE_FUNC
+ const OutputKernelType& outputKernel() const { return m_output_kernel; }
+
protected:
typename LhsXprType::Nested m_lhs_xpr;
typename RhsXprType::Nested m_rhs_xpr;
const Indices m_indices;
+ const OutputKernelType m_output_kernel;
};
@@ -177,9 +228,10 @@ struct TensorContractionEvaluatorBase
typedef typename internal::traits<Derived>::Indices Indices;
typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
typedef typename internal::traits<Derived>::RightArgType RightArgType;
+ typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType;
typedef typename internal::traits<Derived>::Device Device;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -188,6 +240,7 @@ struct TensorContractionEvaluatorBase
enum {
IsAligned = true,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+ BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
@@ -221,6 +274,7 @@ struct TensorContractionEvaluatorBase
op.lhsExpression(), op.rhsExpression()), device),
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device),
+ m_output_kernel(op.outputKernel()),
m_device(device),
m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
@@ -391,6 +445,12 @@ struct TensorContractionEvaluatorBase
numext::swap(m_dimensions[i], m_dimensions[j]);
}
}
+
+ // A set of parameters that will allow the output kernel to map from output
+ // matrix coordinates (i, j) back to the original output tensor dimensions.
+ // TODO(ezhulenev): Add parameters required to infer output tensor index for
+ // more complex contractions than 2x2 on internal dimension.
+ m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -408,47 +468,66 @@ struct TensorContractionEvaluatorBase
}
}
- EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
- if (this->m_lhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
- }
- else {
- static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
- }
- else {
- static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
- }
- }
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+ if (this->m_lhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ METHOD<true, true, true, ALIGNMENT>ARGS; \
+ } \
+ else { \
+ METHOD<true, true, false, ALIGNMENT>ARGS; \
+ } \
+ } \
+ else { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ METHOD<true, false, true, ALIGNMENT>ARGS; \
+ } \
+ else { \
+ METHOD<true, false, false, ALIGNMENT>ARGS; \
+ } \
+ } \
+ } \
+ else { \
+ if (this->m_rhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ METHOD<false, true, true, ALIGNMENT>ARGS; \
+ } \
+ else { \
+ METHOD<false, true, false, ALIGNMENT>ARGS; \
+ } \
+ } \
+ else { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ METHOD<false, false, true, ALIGNMENT>ARGS; \
+ } \
+ else { \
+ METHOD<false, false, false, ALIGNMENT>ARGS; \
+ } \
+ } \
}
- else {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
- }
- else {
- static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
- }
- else {
- static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
- }
- }
+
+ EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+ static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+ bool rhs_inner_dim_reordered, int Alignment>
+ void evalProductSequential(Scalar* buffer) const {
+ if (this->m_j_size == 1) {
+ this->template evalGemv<lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+ Alignment>(buffer);
+ } else {
+ this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Alignment>(buffer);
}
}
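For reference, one leaf of the TENSOR_CONTRACTION_DISPATCH expansion as used in evalProduct below, with METHOD = this->template evalProductSequential, ALIGNMENT = Alignment and ARGS = (buffer):

    // Branch taken when only the LHS inner dimension is contiguous:
    this->template evalProductSequential<true, false, false, Alignment>(buffer);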
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size;
const Index cols = m_k_size;
@@ -486,10 +565,18 @@ struct TensorContractionEvaluatorBase
internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
rows, cols, lhs, rhs,
buffer, resIncr, alpha);
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+ m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params,
+ static_cast<Index>(0), static_cast<Index>(0), rows,
+ static_cast<Index>(1));
}
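In the GEMV path the output kernel therefore sees the entire m x 1 result as one block anchored at the origin; a sketch of the equivalence:

    // OutputMapper(buffer, rows) at block offset (0, 0), extent (rows, 1):
    //   &output_mapper(r, 0) == buffer + r   for r in [0, rows)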
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ void evalGemm(Scalar* buffer) const {
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
if (m_can_use_xsmm) {
evalGemmXSMM(buffer);
@@ -553,7 +640,7 @@ struct TensorContractionEvaluatorBase
OutputMapper output(buffer, m);
// Sizes of the blocks to load in cache. See the Goto paper for details.
- internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+ internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
const Index kc = blocking.kc();
const Index mc = numext::mini(m, blocking.mc());
const Index nc = numext::mini(n, blocking.nc());
@@ -579,7 +666,15 @@ struct TensorContractionEvaluatorBase
// call gebp (matrix kernel)
// The parameters here are copied from Eigen's GEMM implementation
- gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
+ const auto output_mapper = output.getSubMapper(i2, j2);
+ gebp(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc,
+ Scalar(1), -1, -1, 0, 0);
+
+ // We are done with this [i2, j2] output block.
+ if (k2 + kc >= k) {
+ m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
+ actual_mc, actual_nc);
+ }
}
}
}
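The guard on the final k-slice matters because gebp accumulates into each output block across the k2 iterations; schematically:

    // Each gebp call adds the [k2, k2 + actual_kc) slice's contribution to
    // block (i2, j2). The block holds its final value exactly when
    // k2 + kc >= k, which is the only point where m_output_kernel may
    // observe it.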
@@ -842,23 +937,26 @@ protected:
Index m_j_size;
Index m_k_size;
+ TensorContractionParams m_tensor_contraction_params;
+
TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
+ OutputKernelType m_output_kernel;
Scalar* m_result;
bool m_can_use_xsmm;
};
// evaluator for default device
-template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> :
public TensorContractionEvaluatorBase<
- TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> > {
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
typedef TensorContractionEvaluatorBase<Self> Base;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -895,14 +993,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) { }
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
- if (this->m_j_size == 1) {
- this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
- return;
- }
-
- this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+ template <int Alignment>
+ void evalProduct(Scalar* buffer) const {
+ TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer));
}
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index d34f9caee..cf281192c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -21,14 +21,28 @@ enum {
// Default Blocking Strategy
-template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType=ShardByCol>
+template <typename LhsScalar, typename RhsScalar, typename Index, int ShardingType=ShardByCol>
class TensorContractionBlocking {
public:
- typedef typename LhsMapper::Scalar LhsScalar;
- typedef typename RhsMapper::Scalar RhsScalar;
+ /*
+ Adding EIGEN_DEVICE_FUNC unconditionally to the 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
+ requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`,
+ which in turn requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`,
+ which in turn requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+ (else HIPCC will error out)
- EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+ However, adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+ results in NVCC erroring out with the following error:
+
+ ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
+ dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
+ */
+
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {
@@ -75,7 +89,7 @@ class TensorXsmmContractionBlocking {
outer_n_ = outer_n_ != 0 ? outer_n_ : n;
}
#else
- // Defaults, possibly overriden per-platform.
+ // Defaults, possibly overridden per-platform.
copyA_ = true;
copyB_ = false;
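A sketch of the updated call-site shape after this change (mirroring the edit in TensorContraction.h above; the concrete types are illustrative):

    // The blocking heuristic is now parameterized on scalar types rather
    // than on mapper types.
    internal::TensorContractionBlocking<float, float, Eigen::Index,
                                        internal::ShardByCol>
        blocking(/*k=*/1024, /*m=*/512, /*n=*/512, /*num_threads=*/1);
    const Eigen::Index kc = blocking.kc();  // cache-sized panels
    const Eigen::Index mc = blocking.mc();
    const Eigen::Index nc = blocking.nc();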
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 903bc51cc..3f315fedc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -1,1395 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
-
-namespace Eigen {
-
-template<typename Scalar, typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool needs_edge_check>
-__device__ EIGEN_STRONG_INLINE void
-EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
- const Index m_size, const Index n_size, const Index k_size) {
-
- const Index m_block_idx = blockIdx.x;
- const Index n_block_idx = blockIdx.y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- // declare and initialize 64 registers for output 8x8 block
-
- // prefetch registers
- Scalar lhs_pf0;
- Scalar lhs_pf1;
- Scalar lhs_pf2;
- Scalar lhs_pf3;
- Scalar lhs_pf4;
- Scalar lhs_pf5;
- Scalar lhs_pf6;
- Scalar lhs_pf7;
-
- Scalar rhs_pf0;
- Scalar rhs_pf1;
- Scalar rhs_pf2;
- Scalar rhs_pf3;
- Scalar rhs_pf4;
- Scalar rhs_pf5;
- Scalar rhs_pf6;
- Scalar rhs_pf7;
-
- // shared memory is formatted
- // (contract idx in block, nocontract idx in block, block idx)
- // where block idx is column major. This transposition limits the number of
- // bank conflicts when reading the LHS. The core idea is that since the
- // contracting index is shared by both sides, it should live in threadIdx.x.
-
- // On the LHS, we pad each row inside of each block with an extra element. This makes
- // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
- // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
-
- // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
- // conflicts on writes and also none on reads.
-
- // storage indices
- const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
- const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
-
- const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
- const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
- const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
- const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
- const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
- const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
- const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
- const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
-
- const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
- const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
- const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
- const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
- const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
- const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
- const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
- const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
-
- // in the loading code, the following variables are important:
- // threadIdx.x: the vertical position in an 8x8 block
- // threadIdx.y: the vertical index of the 8x8 block in the grid
- // threadIdx.z: the horizontal position in an 8x8 block
- // k: the horizontal index of the 8x8 block in the grid
- //
- // The k parameter is implicit (it was the loop counter for a loop that went
- // from 0 to <8, but that loop is now unrolled in the code below).
-
- const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
- const Index lhs_vert = base_m + load_idx_vert;
-
-#define prefetchIntoRegisters(base_k) \
- { \
- lhs_pf0 = conv(0); \
- lhs_pf1 = conv(0); \
- lhs_pf2 = conv(0); \
- lhs_pf3 = conv(0); \
- lhs_pf4 = conv(0); \
- lhs_pf5 = conv(0); \
- lhs_pf6 = conv(0); \
- lhs_pf7 = conv(0); \
- \
- rhs_pf0 = conv(0); \
- rhs_pf1 = conv(0); \
- rhs_pf2 = conv(0); \
- rhs_pf3 = conv(0); \
- rhs_pf4 = conv(0); \
- rhs_pf5 = conv(0); \
- rhs_pf6 = conv(0); \
- rhs_pf7 = conv(0); \
- \
- if (!needs_edge_check || lhs_vert < m_size) { \
- const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
- const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
- const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
- const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
- const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
- const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
- const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
- const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
- \
- if (!needs_edge_check || lhs_horiz_7 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
- lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
- } else if (lhs_horiz_6 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
- } else if (lhs_horiz_5 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- } else if (lhs_horiz_4 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- } else if (lhs_horiz_3 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- } else if (lhs_horiz_2 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- } else if (lhs_horiz_1 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- } else if (lhs_horiz_0 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- } \
- } \
- \
- const Index rhs_vert = base_k + load_idx_vert; \
- if (!needs_edge_check || rhs_vert < k_size) { \
- const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
- const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
- const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
- const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
- const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
- const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
- const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
- const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
- \
- if (rhs_horiz_7 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
- rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
- } else if (rhs_horiz_6 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
- } else if (rhs_horiz_5 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- } else if (rhs_horiz_4 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- } else if (rhs_horiz_3 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- } else if (rhs_horiz_2 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- } else if (rhs_horiz_1 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- } else if (rhs_horiz_0 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- } \
- } \
- } \
-
-#define writeRegToShmem(_) \
- lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
- rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
- \
- lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
- rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
- \
- lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
- rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
- \
- lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
- rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
- \
- lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
- rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
- \
- lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
- rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
- \
- lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
- rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
- \
- lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
- rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
-
- // declare and initialize result array
-#define res(i, j) _res_##i##j
-#define initResultRow(i) \
- Scalar res(i, 0) = conv(0); \
- Scalar res(i, 1) = conv(0); \
- Scalar res(i, 2) = conv(0); \
- Scalar res(i, 3) = conv(0); \
- Scalar res(i, 4) = conv(0); \
- Scalar res(i, 5) = conv(0); \
- Scalar res(i, 6) = conv(0); \
- Scalar res(i, 7) = conv(0); \
-
- internal::scalar_cast_op<int, Scalar> conv;
- initResultRow(0);
- initResultRow(1);
- initResultRow(2);
- initResultRow(3);
- initResultRow(4);
- initResultRow(5);
- initResultRow(6);
- initResultRow(7);
-#undef initResultRow
-
- for (Index base_k = 0; base_k < k_size; base_k += 64) {
- // wait for previous iteration to finish with shmem. Despite common sense,
- // the code is a bit faster with this here than at the bottom of the loop.
- __syncthreads();
-
- prefetchIntoRegisters(base_k);
- writeRegToShmem();
-
- #undef prefetchIntoRegisters
- #undef writeRegToShmem
-
- // wait for shared mem packing to be done before starting computation
- __syncthreads();
-
- // compute 8x8 matrix product by outer product. This involves packing one column
- // of LHS and one row of RHS into registers (takes 16 registers).
-
-#define lcol(i) _lcol##i
- Scalar lcol(0);
- Scalar lcol(1);
- Scalar lcol(2);
- Scalar lcol(3);
- Scalar lcol(4);
- Scalar lcol(5);
- Scalar lcol(6);
- Scalar lcol(7);
-
-#define rrow(j) _rrow##j
- Scalar rrow(0);
- Scalar rrow(1);
- Scalar rrow(2);
- Scalar rrow(3);
- Scalar rrow(4);
- Scalar rrow(5);
- Scalar rrow(6);
- Scalar rrow(7);
-
- // Now x corresponds to k, y to m, and z to n
- const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
- const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
-
-#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
-#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
-
-#define loadData(i, j) \
- lcol(0) = lhs_element(0, j); \
- rrow(0) = rhs_element(i, 0); \
- lcol(1) = lhs_element(1, j); \
- rrow(1) = rhs_element(i, 1); \
- lcol(2) = lhs_element(2, j); \
- rrow(2) = rhs_element(i, 2); \
- lcol(3) = lhs_element(3, j); \
- rrow(3) = rhs_element(i, 3); \
- lcol(4) = lhs_element(4, j); \
- rrow(4) = rhs_element(i, 4); \
- lcol(5) = lhs_element(5, j); \
- rrow(5) = rhs_element(i, 5); \
- lcol(6) = lhs_element(6, j); \
- rrow(6) = rhs_element(i, 6); \
- lcol(7) = lhs_element(7, j); \
- rrow(7) = rhs_element(i, 7); \
-
-#define computeCol(j) \
- res(0, j) += lcol(0) * rrow(j); \
- res(1, j) += lcol(1) * rrow(j); \
- res(2, j) += lcol(2) * rrow(j); \
- res(3, j) += lcol(3) * rrow(j); \
- res(4, j) += lcol(4) * rrow(j); \
- res(5, j) += lcol(5) * rrow(j); \
- res(6, j) += lcol(6) * rrow(j); \
- res(7, j) += lcol(7) * rrow(j); \
-
-#define computePass(i) \
- loadData(i, i); \
- \
- computeCol(0); \
- computeCol(1); \
- computeCol(2); \
- computeCol(3); \
- computeCol(4); \
- computeCol(5); \
- computeCol(6); \
- computeCol(7); \
-
- computePass(0);
- computePass(1);
- computePass(2);
- computePass(3);
- computePass(4);
- computePass(5);
- computePass(6);
- computePass(7);
-
-#undef lcol
-#undef rrow
-#undef lhs_element
-#undef rhs_element
-#undef loadData
-#undef computeCol
-#undef computePass
- } // end loop over k
-
- // we've now iterated over all of the large (i.e. width-64) k blocks and
- // accumulated results in registers. At this point thread (x, y, z) contains
- // the sum across all big k blocks of the product of the little k block of
- // index (x, y) with the block of index (y, z). To compute the final output,
- // we need to reduce the 8 threads over y by summation.
-#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
-#else
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
#endif
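The rest of the stub is not shown in this hunk; presumably it simply forwards to the renamed header. Per the warning text, the supported usage is:

    // Preferred: the main header, which pulls in TensorContractionGpu.h.
    #include <unsupported/Eigen/CXX11/Tensor>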
-#define reduceRow(i, mask) \
- shuffleInc(i, 0, mask); \
- shuffleInc(i, 1, mask); \
- shuffleInc(i, 2, mask); \
- shuffleInc(i, 3, mask); \
- shuffleInc(i, 4, mask); \
- shuffleInc(i, 5, mask); \
- shuffleInc(i, 6, mask); \
- shuffleInc(i, 7, mask); \
-
-#define reduceMatrix(mask) \
- reduceRow(0, mask); \
- reduceRow(1, mask); \
- reduceRow(2, mask); \
- reduceRow(3, mask); \
- reduceRow(4, mask); \
- reduceRow(5, mask); \
- reduceRow(6, mask); \
- reduceRow(7, mask); \
-
- // actually perform the reduction, now each thread of index (_, y, z)
- // contains the correct values in its registers that belong in the output
- // block
- reduceMatrix(1);
- reduceMatrix(2);
- reduceMatrix(4);
-
-#undef shuffleInc
-#undef reduceRow
-#undef reduceMatrix
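The three masks implement a butterfly (XOR) reduction across the 8 y-lanes; a worked trace:

    // mask 1: lane 5 sums with lane 5^1 = 4      -> pair {4,5} shares a sum
    // mask 2: {4,5} sum with {5^2, 4^2} = {7,6}  -> quad {4..7} shares a sum
    // mask 4: {4..7} sum with {0..3}             -> all 8 lanes hold the total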
-
- // now we need to copy the 64 values into main memory. We can't split work
- // among threads because all variables are in registers. There are two ways
- // to do this:
- // (1) have 1 thread do 64 writes from registers into global memory
- // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
- // each do 8 writes into global memory. We can just overwrite the shared
- // memory from the problem we just solved.
- // (2) is slightly faster than (1) due to less branching and more ILP
-
- // TODO: won't yield much gain, but could just use currently unused shared mem
- // and then we won't have to sync
- // wait for shared mem to be out of use
- __syncthreads();
-
-#define writeResultShmem(i, j) \
- lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
-
-#define writeRow(i) \
- writeResultShmem(i, 0); \
- writeResultShmem(i, 1); \
- writeResultShmem(i, 2); \
- writeResultShmem(i, 3); \
- writeResultShmem(i, 4); \
- writeResultShmem(i, 5); \
- writeResultShmem(i, 6); \
- writeResultShmem(i, 7); \
-
- if (threadIdx.x == 0) {
- writeRow(0);
- writeRow(1);
- writeRow(2);
- writeRow(3);
- writeRow(4);
- writeRow(5);
- writeRow(6);
- writeRow(7);
- }
-#undef writeResultShmem
-#undef writeRow
-
- const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
- const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
-
- if (threadIdx.x < max_i_write) {
- if (max_j_write == 8) {
- // TODO: can I trade bank conflicts for coalesced writes?
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
- Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
- Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
- Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
- Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
- Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
- Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
- Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
-
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
- } else {
-#pragma unroll 7
- for (int j = 0; j < max_j_write; j++) {
- Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
- }
- }
- }
-#undef res
-}
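The 72-element pitch in the kernel deleted above is an 8x8 block padded to 8 rows of 9 elements; a worked instance of the store index:

    // lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z
    // For thread (x=2, y=3, z=1): 3*72 + 2*9 + 1 = 235.
    // The pitch of 9 rather than 8 staggers consecutive rows across
    // shared-memory banks, which is what avoids conflicts on LHS writes.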
-
-
-template<typename Scalar, typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(512)
-EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ Scalar lhs_shmem[72 * 64];
- __shared__ Scalar rhs_shmem[72 * 64];
-
- const Index m_block_idx = blockIdx.x;
- const Index n_block_idx = blockIdx.y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- if (base_m + 63 < m_size && base_n + 63 < n_size) {
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
- } else {
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
- }
-}
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
- bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, float2 lhs_shmem2[][16],
- float2 rhs_shmem2[][8], const Index m_size,
- const Index n_size, const Index k_size,
- const Index base_m, const Index base_n) {
-
- // prefetch registers
- float4 lhs_pf0, rhs_pf0;
-
- float4 results[4];
- for (int i=0; i < 4; i++) {
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
- }
-
-
-#define prefetch_lhs(reg, row, col) \
- if (!CHECK_LHS_BOUNDARY) { \
- if (col < k_size) { \
- reg =lhs.template loadPacket<Unaligned>(row, col); \
- } \
- } else { \
- if (col < k_size) { \
- if (row + 3 < m_size) { \
- reg =lhs.template loadPacket<Unaligned>(row, col); \
- } else if (row + 2 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- } else if (row + 1 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- } else if (row < m_size) { \
- reg.x =lhs(row + 0, col); \
- } \
- } \
- } \
-
-
- Index lhs_vert = base_m+threadIdx.x*4;
-
- for (Index k = 0; k < k_size; k += 16) {
- lhs_pf0 = internal::pset1<float4>(0);
- rhs_pf0 = internal::pset1<float4>(0);
-
- Index lhs_horiz = threadIdx.y+k;
- prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
-
- Index rhs_vert = k+(threadIdx.x%4)*4;
- Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
-
- if (!CHECK_RHS_BOUNDARY) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- } else if (rhs_vert + 2 < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if (rhs_vert + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- } else {
- if (rhs_horiz0 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- } else if ((rhs_vert + 2) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if ((rhs_vert + 1) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- }
- }
- float x1, x2;
- // the following could be a bitwise operation... some day.
- if((threadIdx.x%8) < 4) {
- x1 = rhs_pf0.y;
- x2 = rhs_pf0.w;
- } else {
- x1 = rhs_pf0.x;
- x2 = rhs_pf0.z;
- }
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
- x1 = __shfl_xor(x1, 4);
- x2 = __shfl_xor(x2, 4);
- #else
- x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
- x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
- #endif
- if((threadIdx.x%8) < 4) {
- rhs_pf0.y = x1;
- rhs_pf0.w = x2;
- } else {
- rhs_pf0.x = x1;
- rhs_pf0.z = x2;
- }
-
- // We have 64 features.
- // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
- // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
- // ...
- // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
- // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
- // ...
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
-
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // ...
- // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
- // ...
-
- lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
- lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
-
-
-#define add_vals(fl1, fl2, fr1, fr2)\
- results[0].x += fl1.x * fr1.x;\
- results[0].y += fl1.y * fr1.x;\
- results[0].z += fl2.x * fr1.x;\
- results[0].w += fl2.y * fr1.x;\
-\
- results[1].x += fl1.x * fr1.y;\
- results[1].y += fl1.y * fr1.y;\
- results[1].z += fl2.x * fr1.y;\
- results[1].w += fl2.y * fr1.y;\
-\
- results[2].x += fl1.x * fr2.x;\
- results[2].y += fl1.y * fr2.x;\
- results[2].z += fl2.x * fr2.x;\
- results[2].w += fl2.y * fr2.x;\
-\
- results[3].x += fl1.x * fr2.y;\
- results[3].y += fl1.y * fr2.y;\
- results[3].z += fl2.x * fr2.y;\
- results[3].w += fl2.y * fr2.y;\
-
- __syncthreads();
-
- // Do the multiplies.
- #pragma unroll
- for (int koff = 0; koff < 16; koff ++) {
- // 32 x threads.
- float2 fl1 = lhs_shmem2[koff][threadIdx.x];
- float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
-
- int start_feature = threadIdx.y * 4;
- float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
- float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-
- add_vals(fl1, fl2, fr1, fr2)
- }
- __syncthreads();
- }
-
-#undef prefetch_lhs
-#undef add_vals
-
- Index horiz_base = threadIdx.y*4+base_n;
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (!CHECK_RHS_BOUNDARY) {
- // CHECK LHS
- if (lhs_vert + 3 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (lhs_vert + 2 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- }
- } else if (lhs_vert + 1 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- }
- } else if (lhs_vert < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- }
- }
- } else if (!CHECK_LHS_BOUNDARY) {
- // CHECK RHS
- /*
- int ncols_rem = fminf(n_size- horiz_base, 4);
- for (int i = 0; i < ncols_rem; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }*/
- for (int i = 0; i < 4; i++) {
- if (horiz_base+i < n_size) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- } else {
- // CHECK both boundaries.
- for (int i = 0; i < 4; i++) {
- if (horiz_base+i < n_size) {
- if (lhs_vert < m_size)
- output(lhs_vert, horiz_base + i) = results[i].x;
- if (lhs_vert + 1 < m_size)
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- if (lhs_vert + 2 < m_size)
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- if (lhs_vert + 3 < m_size)
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- }
-}
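Per-thread accumulator shape in the 16x16 float kernel deleted above:

    // float4 results[4] is a 4 (rows) x 4 (cols) output tile:
    //   output(lhs_vert + 0..3, horiz_base + i) = results[i].{x,y,z,w}
    // with lhs_vert = base_m + threadIdx.x*4 and
    //      horiz_base = threadIdx.y*4 + base_n.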
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
- bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, float2 lhs_shmem2[][32],
- float2 rhs_shmem2[][8], const Index m_size,
- const Index n_size, const Index k_size,
- const Index base_m, const Index base_n) {
-
- // prefetch registers
- float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
- float4 rhs_pf0, rhs_pf1;
-
- float4 results[8];
- for (int i=0; i < 8; i++) {
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
- }
-
-
- Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
- for (Index k = 0; k < k_size; k += 32) {
- lhs_pf0 = internal::pset1<float4>(0);
- lhs_pf1 = internal::pset1<float4>(0);
- lhs_pf2 = internal::pset1<float4>(0);
- lhs_pf3 = internal::pset1<float4>(0);
-
- rhs_pf0 = internal::pset1<float4>(0);
- rhs_pf1 = internal::pset1<float4>(0);
-
- if (!CHECK_LHS_BOUNDARY) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- }
- } else {
- // just CHECK_LHS_BOUNDARY
- if (lhs_vert + 3 < m_size) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- }
- } else if (lhs_vert + 2 < m_size) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
- lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
- }
- } else if (lhs_vert + 1 < m_size) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- }
- } else if (lhs_vert < m_size) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- }
- }
- }
- __syncthreads();
- Index rhs_vert = k+threadIdx.x*4;
- Index rhs_horiz0 = threadIdx.y*2+base_n;
- Index rhs_horiz1 = threadIdx.y*2+1+base_n;
- if (!CHECK_RHS_BOUNDARY) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
- } else if (rhs_vert + 2 < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
- } else if (rhs_vert + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- }
- } else {
- if (rhs_horiz1 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
- } else if (rhs_vert + 2 < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
- } else if (k+threadIdx.x*4 + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- } else if (k+threadIdx.x*4 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- }
- } else if (rhs_horiz0 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- } else if ((rhs_vert + 2) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if ((rhs_vert + 1) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- }
- }
- __syncthreads();
- // Loaded. Do computation
- // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
- // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
- // ..
- // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
- rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
- // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
- // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
- // ..
- rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
- // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
- // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
- rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
- // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
- // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
- rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
-
- // LHS.
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
- // ...
- // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
- // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
-
-
-#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
- results[0].x += a_feat1.x * f1.x;\
- results[1].x += a_feat1.x * f1.y;\
- results[2].x += a_feat1.x * f2.x;\
- results[3].x += a_feat1.x * f2.y;\
- results[4].x += a_feat1.x * f3.x;\
- results[5].x += a_feat1.x * f3.y;\
- results[6].x += a_feat1.x * f4.x;\
- results[7].x += a_feat1.x * f4.y;\
-\
- results[0].y += a_feat1.y * f1.x;\
- results[1].y += a_feat1.y * f1.y;\
- results[2].y += a_feat1.y * f2.x;\
- results[3].y += a_feat1.y * f2.y;\
- results[4].y += a_feat1.y * f3.x;\
- results[5].y += a_feat1.y * f3.y;\
- results[6].y += a_feat1.y * f4.x;\
- results[7].y += a_feat1.y * f4.y;\
-\
- results[0].z += a_feat2.x * f1.x;\
- results[1].z += a_feat2.x * f1.y;\
- results[2].z += a_feat2.x * f2.x;\
- results[3].z += a_feat2.x * f2.y;\
- results[4].z += a_feat2.x * f3.x;\
- results[5].z += a_feat2.x * f3.y;\
- results[6].z += a_feat2.x * f4.x;\
- results[7].z += a_feat2.x * f4.y;\
-\
- results[0].w += a_feat2.y * f1.x;\
- results[1].w += a_feat2.y * f1.y;\
- results[2].w += a_feat2.y * f2.x;\
- results[3].w += a_feat2.y * f2.y;\
- results[4].w += a_feat2.y * f3.x;\
- results[5].w += a_feat2.y * f3.y;\
- results[6].w += a_feat2.y * f4.x;\
- results[7].w += a_feat2.y * f4.y;\
-
- lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
- lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
- lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
- lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
-
- lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
- lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
- lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
- lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
-
- __syncthreads();
-
- // Do the multiplies.
- #pragma unroll
- for (int koff = 0; koff < 32; koff ++) {
- float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
- float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
-
- // the first feature is at (threadIdx.y/4) * 8; the last is at start + 8.
- int start_feature = (threadIdx.y / 4) * 8;
-
- float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
- float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
- float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
- float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
-
- add_vals(a3, a4, br1, br2, br3, br4)
- }
- __syncthreads();
- } // end loop over k
-
-
- __syncthreads();
- Index horiz_base = (threadIdx.y/4)*8+base_n;
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (!CHECK_RHS_BOUNDARY) {
- if (lhs_vert + 3 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (lhs_vert + 2 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- }
- } else if (lhs_vert + 1 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- }
- } else if (lhs_vert < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- }
- }
- } else if (!CHECK_LHS_BOUNDARY) {
- // CHECK BOUNDARY_B
- for (int i = 0; i < 8; i++) {
- if (horiz_base + i < n_size) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- } else {
- // CHECK both boundaries.
- for (int i = 0; i < 8; i++) {
- if (horiz_base + i < n_size) {
- if (lhs_vert < m_size)
- output(lhs_vert, horiz_base + i) = results[i].x;
- if (lhs_vert + 1 < m_size)
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- if (lhs_vert + 2 < m_size)
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- if (lhs_vert + 3 < m_size)
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- }
-}
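And the per-thread shape in the larger float kernel deleted above:

    // float4 results[8] is a 4 (rows) x 8 (cols) output tile, i in [0, 8):
    //   output(lhs_vert + 0..3, horiz_base + i) = results[i].{x,y,z,w}
    // with lhs_vert   = base_m + threadIdx.x*4 + (threadIdx.y%4)*32 and
    //      horiz_base = (threadIdx.y/4)*8 + base_n.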
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256)
-EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ float2 lhs_shmem[64*32];
- __shared__ float2 rhs_shmem[128*8];
-
- typedef float2 LHS_MEM[64][32];
- typedef float2 RHS_MEM[128][8];
-
- const Index m_block_idx = blockIdx.x;
- const Index n_block_idx = blockIdx.y;
-
- const Index base_m = 128 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- bool check_rhs = (base_n + 63) >= n_size;
- bool check_lhs128 = (base_m + 127) >= m_size;
-
- if (!check_rhs) {
- if (!check_lhs128) {
- // >= 128 rows left
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- }
- } else {
- if (!check_lhs128) {
- // >= 128 rows left
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- }
- }
-}
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256)
-EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ float2 lhs_shmem[32][16];
- __shared__ float2 rhs_shmem[64][8];
-
- const Index m_block_idx = blockIdx.x;
- const Index n_block_idx = blockIdx.y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- if (base_m + 63 < m_size) {
- if (base_n + 63 < n_size) {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- }
- } else {
- if (base_n + 63 < n_size) {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- }
- }
-}
-
-
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
-
- typedef GpuDevice Device;
-
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
-
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
-
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
-
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef DSizes<Index, NumDims> Dimensions;
-
- // typedefs needed in evalTo
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
- typedef typename RightEvaluator::Dimensions RightDimensions;
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) {}
-
- // We need to redefine this method to make nvcc happy
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalTo(data);
- return false;
- } else {
- this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
- evalTo(this->m_result);
- return true;
- }
- }
-
- void evalTo(Scalar* buffer) const {
- if (this->m_lhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, false, false, Unaligned>(buffer);
- }
- }
- }
- else {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, false, false, Unaligned>(buffer);
- }
- }
- }
- }
-
- template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
- const Index m_blocks = (m + 63) / 64;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(8, 8, 8);
- LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
- }
- };
-
- template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
- if (m < 768 || n < 768) {
- const Index m_blocks = (m + 63) / 64;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(16, 16, 1);
- LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
- } else {
- const Index m_blocks = (m + 127) / 128;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(8, 32, 1);
- LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
- }
- }
- };
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalTyped(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
- EIGEN_UNUSED_VARIABLE(k)
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, left_nocontract_t,
- contract_t, 4,
- lhs_inner_dim_contiguous,
- false, Unaligned> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, right_nocontract_t,
- contract_t, 4,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-
- // initialize data mappers
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
- this->m_left_contracting_strides, this->m_k_strides);
-
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
- this->m_right_contracting_strides, this->m_k_strides);
-
- OutputMapper output(buffer, m);
-
- setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
- LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
- }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_USE_GPU and EIGEN_CUDACC
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#include "TensorContractionGpu.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
new file mode 100644
index 000000000..b5e186d21
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -0,0 +1,1412 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+ const Index m_size, const Index n_size, const Index k_size) {
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ // (The 64 registers for the output 8x8 block are declared and initialized
+ // further down via initResultRow.)
+
+ // prefetch registers
+ Scalar lhs_pf0;
+ Scalar lhs_pf1;
+ Scalar lhs_pf2;
+ Scalar lhs_pf3;
+ Scalar lhs_pf4;
+ Scalar lhs_pf5;
+ Scalar lhs_pf6;
+ Scalar lhs_pf7;
+
+ Scalar rhs_pf0;
+ Scalar rhs_pf1;
+ Scalar rhs_pf2;
+ Scalar rhs_pf3;
+ Scalar rhs_pf4;
+ Scalar rhs_pf5;
+ Scalar rhs_pf6;
+ Scalar rhs_pf7;
+
+ // shared memory is formatted
+ // (contract idx in block, nocontract idx in block, block idx)
+ // where block idx is column major. This transposition limits the number of
+ // bank conflicts when reading the LHS. The core idea is that, since the
+ // contracting index is shared by both sides, it should live in threadIdx.x.
+
+ // On the LHS, we pad each row inside of each block with an extra element. This makes
+ // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+ // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+ // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+ // conflicts on writes and also none on reads.
+
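+ // A minimal index sketch for the LHS layout above (illustrative only):
+ // element (c, r) of the block in grid position (by, bk) is stored at
+ //   c + 9 * r + 72 * by + 576 * bk
+ // where c is the contract index within the block, r the nocontract row,
+ // and 576 = 8 blocks * 72 elements is the stride between block columns.
+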
+ // storage indices
+ const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+ const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+ const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+ const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+ const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+ const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+ const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+ const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+ const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+ const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+ const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+ const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+ const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+ const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+ const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+ const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+ const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+ const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+ // in the loading code, the following variables are important:
+ // threadIdx.x: the vertical position in an 8x8 block
+ // threadIdx.y: the vertical index of the 8x8 block in the grid
+ // threadIdx.z: the horizontal position in an 8x8 block
+ // k: the horizontal index of the 8x8 block in the grid
+ //
+ // The k parameter is implicit (it was the loop counter for a loop that went
+ // from 0 to 7, but that loop is now unrolled in the code below).
+
+ const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+ const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k) \
+ { \
+ lhs_pf0 = conv(0); \
+ lhs_pf1 = conv(0); \
+ lhs_pf2 = conv(0); \
+ lhs_pf3 = conv(0); \
+ lhs_pf4 = conv(0); \
+ lhs_pf5 = conv(0); \
+ lhs_pf6 = conv(0); \
+ lhs_pf7 = conv(0); \
+ \
+ rhs_pf0 = conv(0); \
+ rhs_pf1 = conv(0); \
+ rhs_pf2 = conv(0); \
+ rhs_pf3 = conv(0); \
+ rhs_pf4 = conv(0); \
+ rhs_pf5 = conv(0); \
+ rhs_pf6 = conv(0); \
+ rhs_pf7 = conv(0); \
+ \
+ if (!needs_edge_check || lhs_vert < m_size) { \
+ const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
+ const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
+ const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
+ const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
+ const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
+ const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
+ const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
+ const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
+ \
+ if (!needs_edge_check || lhs_horiz_7 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
+ } else if (lhs_horiz_6 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ } else if (lhs_horiz_5 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ } else if (lhs_horiz_4 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ } else if (lhs_horiz_3 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ } else if (lhs_horiz_2 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ } else if (lhs_horiz_1 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ } else if (lhs_horiz_0 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ } \
+ } \
+ \
+ const Index rhs_vert = base_k + load_idx_vert; \
+ if (!needs_edge_check || rhs_vert < k_size) { \
+ const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
+ const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
+ const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
+ const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
+ const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
+ const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
+ const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
+ const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
+ \
+ if (rhs_horiz_7 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
+ } else if (rhs_horiz_6 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ } else if (rhs_horiz_5 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ } else if (rhs_horiz_4 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ } else if (rhs_horiz_3 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ } else if (rhs_horiz_2 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ } else if (rhs_horiz_1 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ } else if (rhs_horiz_0 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ } \
+ } \
+ } \
+
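+ // For reference, the guarded cascades above are a fully unrolled form of
+ // (sketch, not compiled):
+ //   for (int i = 0; i < 8; ++i) {
+ //     const Index h = base_k + threadIdx.z + i * 8;
+ //     if (!needs_edge_check || h < k_size) lhs_pf[i] = lhs(lhs_vert, h);
+ //   }
+ // Unrolling keeps each lhs_pf_i in its own register instead of forcing an
+ // indexed array into local memory.
+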
+#define writeRegToShmem(_) \
+ lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
+ rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
+ \
+ lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
+ rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
+ \
+ lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
+ rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
+ \
+ lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
+ rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
+ \
+ lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
+ rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
+ \
+ lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
+ rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
+ \
+ lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
+ rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
+ \
+ lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
+ rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
+
+ // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i) \
+ Scalar res(i, 0) = conv(0); \
+ Scalar res(i, 1) = conv(0); \
+ Scalar res(i, 2) = conv(0); \
+ Scalar res(i, 3) = conv(0); \
+ Scalar res(i, 4) = conv(0); \
+ Scalar res(i, 5) = conv(0); \
+ Scalar res(i, 6) = conv(0); \
+ Scalar res(i, 7) = conv(0); \
+
+ internal::scalar_cast_op<int, Scalar> conv;
+ initResultRow(0);
+ initResultRow(1);
+ initResultRow(2);
+ initResultRow(3);
+ initResultRow(4);
+ initResultRow(5);
+ initResultRow(6);
+ initResultRow(7);
+#undef initResultRow
+
+ for (Index base_k = 0; base_k < k_size; base_k += 64) {
+ // wait for the previous iteration to finish with shmem. Despite common sense,
+ // the code is a bit faster with this here than at the bottom of the loop
+ __syncthreads();
+
+ prefetchIntoRegisters(base_k);
+ writeRegToShmem();
+
+ #undef prefetchIntoRegisters
+ #undef writeRegToShmem
+
+ // wait for shared mem packing to be done before starting computation
+ __syncthreads();
+
+ // compute 8x8 matrix product by outer product. This involves packing one column
+ // of LHS and one row of RHS into registers (takes 16 registers).
+
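+ // Conceptually (sketch): each computePass(p) below performs one rank-1
+ // update, res(i, j) += lcol(i) * rrow(j) for all 8x8 pairs (i, j), i.e.
+ // res += lcol * rrow^T; eight passes accumulate the full 8x8 product.
+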
+#define lcol(i) _lcol##i
+ Scalar lcol(0);
+ Scalar lcol(1);
+ Scalar lcol(2);
+ Scalar lcol(3);
+ Scalar lcol(4);
+ Scalar lcol(5);
+ Scalar lcol(6);
+ Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+ Scalar rrow(0);
+ Scalar rrow(1);
+ Scalar rrow(2);
+ Scalar rrow(3);
+ Scalar rrow(4);
+ Scalar rrow(5);
+ Scalar rrow(6);
+ Scalar rrow(7);
+
+ // Now x corresponds to k, y to m, and z to n
+ const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+ const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j) \
+ lcol(0) = lhs_element(0, j); \
+ rrow(0) = rhs_element(i, 0); \
+ lcol(1) = lhs_element(1, j); \
+ rrow(1) = rhs_element(i, 1); \
+ lcol(2) = lhs_element(2, j); \
+ rrow(2) = rhs_element(i, 2); \
+ lcol(3) = lhs_element(3, j); \
+ rrow(3) = rhs_element(i, 3); \
+ lcol(4) = lhs_element(4, j); \
+ rrow(4) = rhs_element(i, 4); \
+ lcol(5) = lhs_element(5, j); \
+ rrow(5) = rhs_element(i, 5); \
+ lcol(6) = lhs_element(6, j); \
+ rrow(6) = rhs_element(i, 6); \
+ lcol(7) = lhs_element(7, j); \
+ rrow(7) = rhs_element(i, 7); \
+
+#define computeCol(j) \
+ res(0, j) += lcol(0) * rrow(j); \
+ res(1, j) += lcol(1) * rrow(j); \
+ res(2, j) += lcol(2) * rrow(j); \
+ res(3, j) += lcol(3) * rrow(j); \
+ res(4, j) += lcol(4) * rrow(j); \
+ res(5, j) += lcol(5) * rrow(j); \
+ res(6, j) += lcol(6) * rrow(j); \
+ res(7, j) += lcol(7) * rrow(j); \
+
+#define computePass(i) \
+ loadData(i, i); \
+ \
+ computeCol(0); \
+ computeCol(1); \
+ computeCol(2); \
+ computeCol(3); \
+ computeCol(4); \
+ computeCol(5); \
+ computeCol(6); \
+ computeCol(7); \
+
+ computePass(0);
+ computePass(1);
+ computePass(2);
+ computePass(3);
+ computePass(4);
+ computePass(5);
+ computePass(6);
+ computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+ } // end loop over k
+
+ // We've now iterated over all of the large (i.e. width-64) k blocks and
+ // accumulated results in registers. At this point thread (x, y, z) holds the
+ // sum, across all big k blocks, of the partial product contributed by k-slice
+ // x to the output block (y, z). To compute the final output, we need to
+ // reduce the 8 threads along x by summation.
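+ //
+ // A minimal sketch of the xor-butterfly used below, assuming one partial sum
+ // v per lane within each group of 8 consecutive lanes (threadIdx.x occupies
+ // lane bits 0-2 here):
+ //   v += __shfl_xor_sync(0xFFFFFFFF, v, 1);
+ //   v += __shfl_xor_sync(0xFFFFFFFF, v, 2);
+ //   v += __shfl_xor_sync(0xFFFFFFFF, v, 4);
+ // after which all 8 lanes hold the full sum. (On HIP and pre-9.0 CUDA the
+ // unsynchronized __shfl_xor is used instead, as below.)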
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
+
+#define reduceRow(i, mask) \
+ shuffleInc(i, 0, mask); \
+ shuffleInc(i, 1, mask); \
+ shuffleInc(i, 2, mask); \
+ shuffleInc(i, 3, mask); \
+ shuffleInc(i, 4, mask); \
+ shuffleInc(i, 5, mask); \
+ shuffleInc(i, 6, mask); \
+ shuffleInc(i, 7, mask); \
+
+#define reduceMatrix(mask) \
+ reduceRow(0, mask); \
+ reduceRow(1, mask); \
+ reduceRow(2, mask); \
+ reduceRow(3, mask); \
+ reduceRow(4, mask); \
+ reduceRow(5, mask); \
+ reduceRow(6, mask); \
+ reduceRow(7, mask); \
+
+ // Actually perform the reduction; afterwards each thread of index (_, y, z)
+ // holds in its registers the values that belong in the output block.
+ reduceMatrix(1);
+ reduceMatrix(2);
+ reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+ // Now we need to copy the 64 values into main memory. We can't split the work
+ // among threads because all the values are in registers. There are two ways
+ // to do this:
+ // (1) have 1 thread do 64 writes from registers into global memory
+ // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+ //     each do 8 writes into global memory. We can just overwrite the shared
+ //     memory from the problem we just solved.
+ // Option (2) is slightly faster than (1) due to less branching and more ILP.
+
+ // TODO: won't yield much gain, but could just use currently unused shared mem
+ // and then we won't have to sync
+ // wait for shared mem to be out of use
+ __syncthreads();
+
+#define writeResultShmem(i, j) \
+ lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
+
+#define writeRow(i) \
+ writeResultShmem(i, 0); \
+ writeResultShmem(i, 1); \
+ writeResultShmem(i, 2); \
+ writeResultShmem(i, 3); \
+ writeResultShmem(i, 4); \
+ writeResultShmem(i, 5); \
+ writeResultShmem(i, 6); \
+ writeResultShmem(i, 7); \
+
+ if (threadIdx.x == 0) {
+ writeRow(0);
+ writeRow(1);
+ writeRow(2);
+ writeRow(3);
+ writeRow(4);
+ writeRow(5);
+ writeRow(6);
+ writeRow(7);
+ }
+#undef writeResultShmem
+#undef writeRow
+
+ const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+ const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
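+ // max_i_write / max_j_write count how many of this thread's 8 candidate
+ // rows / columns (spaced 8 apart) still fall inside the m_size x n_size
+ // output; the ceil-divide is capped at 8 for interior blocks.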
+
+ if (threadIdx.x < max_i_write) {
+ if (max_j_write == 8) {
+ // TODO: can I trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+ Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+ Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+ Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+ Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+ Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+ Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+ Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+ } else {
+#pragma unroll 7
+ for (int j = 0; j < max_j_write; j++) {
+ Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+ }
+ }
+ }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(512)
+#endif
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ Scalar lhs_shmem[72 * 64];
+ __shared__ Scalar rhs_shmem[72 * 64];
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size && base_n + 63 < n_size) {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ } else {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][16],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+
+ // prefetch registers
+ float4 lhs_pf0, rhs_pf0;
+
+ float4 results[4];
+ for (int i=0; i < 4; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ reg =lhs.template loadPacket<Unaligned>(row, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ reg =lhs.template loadPacket<Unaligned>(row, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
+
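+ // (prefetch_lhs mirrors the scalar-path prefetch of the kernel above: one
+ // float4 packet load when all four rows fit, otherwise scalar loads of the
+ // valid prefix of rows.)
+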
+ Index lhs_vert = base_m+threadIdx.x*4;
+
+ for (Index k = 0; k < k_size; k += 16) {
+
+ lhs_pf0 = internal::pset1<float4>(0);
+ rhs_pf0 = internal::pset1<float4>(0);
+
+ Index lhs_horiz = threadIdx.y+k;
+ prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+ Index rhs_vert = k+(threadIdx.x%4)*4;
+ Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
+
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // n is known to be in range; only the k boundary needs checking
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ } else if (rhs_vert + 2 < k_size) {
+ // n is known to be in range; only the k boundary needs checking
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ } else {
+ if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ float x1, x2;
+ // The following swap could perhaps be done with bitwise operations some day.
+ if((threadIdx.x%8) < 4) {
+ x1 = rhs_pf0.y;
+ x2 = rhs_pf0.w;
+ } else {
+ x1 = rhs_pf0.x;
+ x2 = rhs_pf0.z;
+ }
+ #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+ x1 = __shfl_xor(x1, 4);
+ x2 = __shfl_xor(x2, 4);
+ #else
+ x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+ x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+ #endif
+ if((threadIdx.x%8) < 4) {
+ rhs_pf0.y = x1;
+ rhs_pf0.w = x2;
+ } else {
+ rhs_pf0.x = x1;
+ rhs_pf0.z = x2;
+ }
+
+ // We have 64 features.
+ // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+ // ...
+ // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+ // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+ // ...
+ rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+ rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // ...
+ // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
+ // ...
+
+ lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+ results[0].x += fl1.x * fr1.x;\
+ results[0].y += fl1.y * fr1.x;\
+ results[0].z += fl2.x * fr1.x;\
+ results[0].w += fl2.y * fr1.x;\
+\
+ results[1].x += fl1.x * fr1.y;\
+ results[1].y += fl1.y * fr1.y;\
+ results[1].z += fl2.x * fr1.y;\
+ results[1].w += fl2.y * fr1.y;\
+\
+ results[2].x += fl1.x * fr2.x;\
+ results[2].y += fl1.y * fr2.x;\
+ results[2].z += fl2.x * fr2.x;\
+ results[2].w += fl2.y * fr2.x;\
+\
+ results[3].x += fl1.x * fr2.y;\
+ results[3].y += fl1.y * fr2.y;\
+ results[3].z += fl2.x * fr2.y;\
+ results[3].w += fl2.y * fr2.y;\
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 16; koff ++) {
+ // 32 x threads.
+ float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+ float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+ int start_feature = threadIdx.y * 4;
+ float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+ float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+ add_vals(fl1, fl2, fr1, fr2)
+ }
+ __syncthreads();
+ }
+
+#undef prefetch_lhs
+#undef add_vals
+
+ Index horiz_base = threadIdx.y*4+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ // CHECK LHS
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK RHS
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][32],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+
+ // prefetch registers
+ float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+ float4 rhs_pf0, rhs_pf1;
+
+ float4 results[8];
+ for (int i=0; i < 8; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+ Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
+ for (Index k = 0; k < k_size; k += 32) {
+ lhs_pf0 = internal::pset1<float4>(0);
+ lhs_pf1 = internal::pset1<float4>(0);
+ lhs_pf2 = internal::pset1<float4>(0);
+ lhs_pf3 = internal::pset1<float4>(0);
+
+ rhs_pf0 = internal::pset1<float4>(0);
+ rhs_pf1 = internal::pset1<float4>(0);
+
+ if (!CHECK_LHS_BOUNDARY) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ }
+ } else {
+ // just CHECK_LHS_BOUNDARY
+ if (lhs_vert + 3 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ }
+ }
+ }
+ __syncthreads();
+ Index rhs_vert = k+threadIdx.x*4;
+ Index rhs_horiz0 = threadIdx.y*2+base_n;
+ Index rhs_horiz1 = threadIdx.y*2+1+base_n;
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // n is known to be in range; only the k boundary needs checking
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // n is known to be in range; only the k boundary needs checking
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else {
+ if (rhs_horiz1 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // both columns are in range; only the k boundary still needs checking
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // both columns are in range; only the k boundary still needs checking
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // column 0 is in range; only the k boundary still needs checking
+ rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ // column 0 is in range; only the k boundary still needs checking
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ __syncthreads();
+ // Loaded. Do computation
+ // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+ // ..
+ // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+ rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+ // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+ // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+ // ..
+ rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+ // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+ // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+ rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+ // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+ // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+ rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+ // LHS.
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // ...
+ // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+ // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+ results[0].x += a_feat1.x * f1.x;\
+ results[1].x += a_feat1.x * f1.y;\
+ results[2].x += a_feat1.x * f2.x;\
+ results[3].x += a_feat1.x * f2.y;\
+ results[4].x += a_feat1.x * f3.x;\
+ results[5].x += a_feat1.x * f3.y;\
+ results[6].x += a_feat1.x * f4.x;\
+ results[7].x += a_feat1.x * f4.y;\
+\
+ results[0].y += a_feat1.y * f1.x;\
+ results[1].y += a_feat1.y * f1.y;\
+ results[2].y += a_feat1.y * f2.x;\
+ results[3].y += a_feat1.y * f2.y;\
+ results[4].y += a_feat1.y * f3.x;\
+ results[5].y += a_feat1.y * f3.y;\
+ results[6].y += a_feat1.y * f4.x;\
+ results[7].y += a_feat1.y * f4.y;\
+\
+ results[0].z += a_feat2.x * f1.x;\
+ results[1].z += a_feat2.x * f1.y;\
+ results[2].z += a_feat2.x * f2.x;\
+ results[3].z += a_feat2.x * f2.y;\
+ results[4].z += a_feat2.x * f3.x;\
+ results[5].z += a_feat2.x * f3.y;\
+ results[6].z += a_feat2.x * f4.x;\
+ results[7].z += a_feat2.x * f4.y;\
+\
+ results[0].w += a_feat2.y * f1.x;\
+ results[1].w += a_feat2.y * f1.y;\
+ results[2].w += a_feat2.y * f2.x;\
+ results[3].w += a_feat2.y * f2.y;\
+ results[4].w += a_feat2.y * f3.x;\
+ results[5].w += a_feat2.y * f3.y;\
+ results[6].w += a_feat2.y * f4.x;\
+ results[7].w += a_feat2.y * f4.y;\
+
+ lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+ lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+ lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+ lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+ lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+ lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+ lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 32; koff ++) {
+ float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+ float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+ // The first feature is at (threadIdx.y/4) * 8; the last is at start_feature + 7.
+ int start_feature = (threadIdx.y / 4) * 8;
+
+ float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
+ float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+ float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+ float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+ add_vals(a3, a4, br1, br2, br3, br4)
+ }
+ __syncthreads();
+ } // end loop over k
+
+ __syncthreads();
+ Index horiz_base = (threadIdx.y/4)*8+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK RHS
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[64*32];
+ __shared__ float2 rhs_shmem[128*8];
+
+ typedef float2 LHS_MEM[64][32];
+ typedef float2 RHS_MEM[128][8];
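+ // (The casts below, e.g. *((LHS_MEM *) lhs_shmem), simply reinterpret the
+ // flat shared buffers as the 2D array types that the internal kernel indexes.)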
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 128 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ bool check_rhs = (base_n + 63) >= n_size;
+ bool check_lhs128 = (base_m + 127) >= m_size;
+
+ if (!check_rhs) {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[32][16];
+ __shared__ float2 rhs_shmem[64][8];
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size) {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
+
+ static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
+ "GPU tensor contraction does not support output kernels.");
+
+ typedef GpuDevice Device;
+
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ // typedefs needed in evalTo
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ typedef typename LeftEvaluator::Dimensions LeftDimensions;
+ typedef typename RightEvaluator::Dimensions RightDimensions;
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device) {}
+
+ // We need to redefine this method to make nvcc happy
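+ // (It returns true iff a temporary result buffer was allocated here; when the
+ // caller supplies data, the contraction is evaluated directly into it.)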
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+ this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+ evalTo(this->m_result);
+ return true;
+ }
+ }
+
+ void evalTo(Scalar* buffer) const {
+ if (this->m_lhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ }
+
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+ static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 8, 8);
+ LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+ }
+ };
+
+ template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+ static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+ if (m < 768 || n < 768) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(16, 16, 1);
+ LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+ } else {
+ const Index m_blocks = (m + 127) / 128;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 32, 1);
+ LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+ }
+ }
+ };
+
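+ // A worked example of the dispatch above (hypothetical sizes): m = 1000,
+ // n = 500 takes the small-size path (n < 768) and launches ceil(1000/64) = 16
+ // by ceil(500/64) = 8 blocks of 16x16 threads, while m = n = 2048 takes the
+ // 128x64-tile path with ceil(2048/128) = 16 by ceil(2048/64) = 32 blocks of
+ // 8x32 threads.
+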
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalTyped(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+ EIGEN_UNUSED_VARIABLE(k)
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, 4,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, 4,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
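+ // Configure 8-byte shared-memory banks so that 64-bit shared-memory
+ // accesses inside the contraction kernels avoid bank conflicts.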
+#if defined(EIGEN_USE_HIP)
+ setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
+#else
+ setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
+#endif
+
+ LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and EIGEN_GPUCC
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index e6840bc87..35f931c53 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -23,15 +23,18 @@
namespace Eigen {
template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, const Eigen::SyclDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, const Eigen::SyclDevice> > {
+
+ static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
+ "SYCL tensor contraction does not support output kernels.");
typedef const Eigen::SyclDevice Device;
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
typedef TensorContractionEvaluatorBase<Self> Base;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 6fb69910e..1d145c4b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,57 +15,16 @@
namespace Eigen {
-#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
-namespace internal {
-
-template<typename LhsScalar, typename LhsMapper, typename Index>
-struct packLhsArg {
- LhsScalar* blockA;
- const LhsMapper& lhs;
- const Index m_start;
- const Index k_start;
- const Index mc;
- const Index kc;
-};
-
-template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
-struct packRhsAndKernelArg {
- const MaxSizeVector<LhsScalar*>* blockAs;
- RhsScalar* blockB;
- const RhsMapper& rhs;
- OutputMapper& output;
- const Index m;
- const Index k;
- const Index n;
- const Index mc;
- const Index kc;
- const Index nc;
- const Index num_threads;
- const Index num_blockAs;
- const Index max_m;
- const Index k_block_idx;
- const Index m_block_idx;
- const Index n_block_idx;
- const Index m_blocks;
- const Index n_blocks;
- MaxSizeVector<Notification*>* kernel_notifications;
- const MaxSizeVector<Notification*>* lhs_notifications;
- const bool need_to_pack;
-};
-
-} // end namespace internal
-#endif // EIGEN_USE_SIMPLE_THREAD_POOL
-
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
typedef ThreadPoolDevice Device;
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
typedef TensorContractionEvaluatorBase<Self> Base;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -112,9 +71,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) {}
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
- bool rhs_inner_dim_reordered, int Alignment>
+ template <int Alignment>
void evalProduct(Scalar* buffer) const {
const Index m = this->m_i_size;
const Index n = this->m_j_size;
@@ -138,39 +95,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
#endif
- typedef
- typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
- LhsScalar;
- typedef
- typename internal::remove_const<typename EvalRightArgType::Scalar>::type
- RhsScalar;
- typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
- typedef internal::TensorContractionInputMapper<
- LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
- contract_t, internal::packet_traits<LhsScalar>::size,
- lhs_inner_dim_contiguous, false, Unaligned>
- LhsMapper;
- typedef internal::TensorContractionInputMapper<
- RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
- contract_t, internal::packet_traits<RhsScalar>::size,
- rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
- RhsMapper;
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- typedef internal::gemm_pack_lhs<LhsScalar, Index,
- typename LhsMapper::SubMapper, Traits::mr,
- Traits::LhsProgress, ColMajor>
- LhsPacker;
- typedef internal::gemm_pack_rhs<
- RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
- RhsPacker;
- typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
- Traits::mr, Traits::nr, false, false>
- GebpKernel;
-
-
-
// Compute a set of algorithm parameters:
// - kernel block sizes (bm, bn, bk)
// - task grain sizes (number of kernels executed per task: gm, gn)
@@ -200,14 +124,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// Again, we don't know number of threads yet, so we use 2.
Index bm, bn, bk;
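+ // Blocking depends only on the scalar types: the concrete mapper
+ // types are not known here because they are chosen when Context is
+ // instantiated below.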
if (shard_by_col) {
- internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
internal::ShardByCol>
blocking(k, m, n, 2);
bm = blocking.mc();
bn = blocking.nc();
bk = blocking.kc();
} else {
- internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
internal::ShardByRow>
blocking(k, m, n, 2);
bm = blocking.mc();
@@ -229,29 +153,22 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
if (n == 1) num_threads = 1;
if (num_threads == 1) {
- // The single-threaded algorithm should be faster in this case.
- if (n == 1)
- this->template evalGemv<lhs_inner_dim_contiguous,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Alignment>(buffer);
- else
- this->template evalGemm<lhs_inner_dim_contiguous,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Alignment>(buffer);
+ TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
+ Unaligned, (buffer));
return;
}
// Now that we know number of threads, recalculate sharding and blocking.
shard_by_col = shardByCol(m, n, num_threads);
if (shard_by_col) {
- internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
internal::ShardByCol>
blocking(k, m, n, num_threads);
bm = blocking.mc();
bn = blocking.nc();
bk = blocking.kc();
} else {
- internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
internal::ShardByRow>
blocking(k, m, n, num_threads);
bm = blocking.mc();
@@ -299,36 +216,59 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// more important in this case.
if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
- this->m_i_strides, this->m_left_contracting_strides,
- this->m_k_strides);
+ #define CONTEXT_ARGS \
+ (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+ nn0, shard_by_col, parallel_pack) \
+ .run()
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
- this->m_j_strides, this->m_right_contracting_strides,
- this->m_k_strides);
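+ // TENSOR_CONTRACTION_DISPATCH switches on the three inner-dimension
+ // flags and instantiates the matching Context specialization;
+ // CONTEXT_ARGS supplies its constructor arguments and calls run().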
+ TENSOR_CONTRACTION_DISPATCH(Context, Alignment, CONTEXT_ARGS);
+
+#undef CONTEXT_ARGS
- Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
- OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
- k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
- shard_by_col, parallel_pack)
- .run();
}
// Context coordinates a single parallel gemm operation.
- template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
- typename LhsMapper, typename RhsMapper, typename OutputMapper>
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+ bool rhs_inner_dim_reordered, int Alignment>
class Context {
public:
- Context(const Device& device, int num_threads, LhsMapper& lhs,
- RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
- Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
- Index gn, Index nm0, Index nn0, bool shard_by_col,
+ typedef internal::TensorContractionInputMapper<
+ LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+ contract_t, internal::packet_traits<LhsScalar>::size,
+ lhs_inner_dim_contiguous, false, Unaligned>
+ LhsMapper;
+ typedef internal::TensorContractionInputMapper<
+ RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+ contract_t, internal::packet_traits<RhsScalar>::size,
+ rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+ RhsMapper;
+ typedef internal::gemm_pack_lhs<LhsScalar, Index,
+ typename LhsMapper::SubMapper, Traits::mr,
+ Traits::LhsProgress, ColMajor>
+ LhsPacker;
+ typedef internal::gemm_pack_rhs<
+ RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
+ RhsPacker;
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+ typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+ Traits::mr, Traits::nr, false, false>
+ GebpKernel;
+
+ Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
+ Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
+ Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col,
bool parallel_pack)
- : device_(device),
- lhs_(lhs),
- rhs_(rhs),
+ : device_(self->m_device),
+ lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
+ self->m_i_strides, self->m_left_contracting_strides,
+ self->m_k_strides),
+ rhs_(self->m_rightImpl, self->m_right_nocontract_strides,
+ self->m_j_strides, self->m_right_contracting_strides,
+ self->m_k_strides),
buffer_(buffer),
output_(buffer, tm),
+ output_kernel_(self->m_output_kernel),
+ tensor_contraction_params_(self->m_tensor_contraction_params),
num_threads_(num_threads),
shard_by_col_(shard_by_col),
parallel_pack_(parallel_pack),
@@ -350,7 +290,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// Normal number of notifications for k slice switch is
// nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
// nm_ + nn_ notifications, because they will not receive notifications
- // from preceeding kernels.
+ // from preceding kernels.
state_switch_[x] =
x == 0
? 1
@@ -416,10 +356,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
private:
Notification done_;
const Device& device_;
- LhsMapper& lhs_;
- RhsMapper& rhs_;
+ LhsMapper lhs_;
+ RhsMapper rhs_;
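+ // The mapper types depend on Context's template parameters, so the
+ // mappers are constructed and owned here instead of being passed in
+ // by reference from evalProduct.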
Scalar* const buffer_;
OutputMapper output_;
+ OutputKernelType output_kernel_;
+ TensorContractionParams tensor_contraction_params_;
const int num_threads_;
const bool shard_by_col_;
const bool parallel_pack_;
@@ -530,25 +472,38 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
void kernel(Index m, Index n, Index k) {
// Note: order of iteration matters here. Iteration over m is innermost
- // because we want to reuse the same packed rhs in consequetive tasks
+ // because we want to reuse the same packed rhs in consecutive tasks
// (rhs fits into L2$ while lhs only into L3$).
const Index nend = n * gn_ + gn(n);
const Index mend = m * gm_ + gm(m);
if (shard_by_col_) {
for (Index n1 = n * gn_; n1 < nend; n1++) {
- for (Index m1 = m * gm_; m1 < mend; m1++)
- GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
- packed_lhs_[k % (P - 1)][m1],
+ for (Index m1 = m * gm_; m1 < mend; m1++) {
+ const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+ GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
Scalar(1), -1, -1, 0, 0);
+
+ // We are done with the last task for the [m1, n1] block.
+ if (k + 1 == nk_) {
+ output_kernel_(output_mapper, tensor_contraction_params_,
+ m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+ }
+ }
}
} else {
for (Index m1 = m * gm_; m1 < mend; m1++)
for (Index n1 = n * gn_; n1 < nend; n1++) {
- GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
- packed_lhs_[k % (P - 1)][m1],
+ const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+ GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
Scalar(1), -1, -1, 0, 0);
+
+ // We are done with the last task for the [m1, n1] block.
+ if (k + 1 == nk_) {
+ output_kernel_(output_mapper, tensor_contraction_params_,
+ m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+ }
}
}
signal_kernel(m, n, k + 1, false);
@@ -623,11 +578,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
else
pack_lhs(start, k);
} else {
- Index mid = (start + end) / 2;
- device_.enqueueNoNotification(
- [=]() { enqueue_packing_helper(mid, end, k, rhs); });
- device_.enqueueNoNotification(
- [=]() { enqueue_packing_helper(start, mid, k, rhs); });
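+ // Binary-split the range: offload upper halves to the pool and keep
+ // halving locally, so this thread packs exactly one block itself
+ // instead of re-enqueueing both halves.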
+ while (end - start > 1) {
+ Index mid = (start + end) / 2;
+ device_.enqueueNoNotification(
+ [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+ end = mid;
+ }
+ enqueue_packing_helper(start, end, k, rhs);
}
}
@@ -746,284 +703,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
return 0;
}
-#else // EIGEN_USE_SIMPLE_THREAD_POOL
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalProduct(Scalar* buffer) const {
- if (this->m_j_size == 1) {
- this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
- return;
- }
-
- evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalGemm(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-
- const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
- const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
-
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, left_nocontract_t,
- contract_t, lhs_packet_size,
- lhs_inner_dim_contiguous,
- false, Unaligned> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, right_nocontract_t,
- contract_t, rhs_packet_size,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
- // TODO: packing could be faster sometimes if we supported row major tensor mappers
- typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
- Traits::LhsProgress, ColMajor> LhsPacker;
- typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
-
- // TODO: replace false, false with conjugate values?
- typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
- Traits::mr, Traits::nr, false, false> GebpKernel;
-
- typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
- typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
-
- // initialize data mappers
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
- this->m_left_contracting_strides, this->m_k_strides);
-
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
- this->m_right_contracting_strides, this->m_k_strides);
-
- OutputMapper output(buffer, m);
-
- // compute block sizes (which depend on number of threads)
- const Index num_threads = this->m_device.numThreads();
- internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
- Index mc = blocking.mc();
- Index nc = blocking.nc();
- Index kc = blocking.kc();
- eigen_assert(mc <= m);
- eigen_assert(nc <= n);
- eigen_assert(kc <= k);
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
- const Index k_blocks = CEIL_DIV(k, kc);
- const Index n_blocks = CEIL_DIV(n, nc);
- const Index m_blocks = CEIL_DIV(m, mc);
- const Index sizeA = mc * kc;
- const Index sizeB = kc * nc;
-
- /* cout << "m: " << m << " n: " << n << " k: " << k << endl;
- cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
- cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
- cout << "num threads: " << num_threads << endl;
- */
-
- // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
- // aren't 16 byte aligned segfaults will happen due to SIMD instructions
- // note: You can get away with allocating just a single blockA and offsets and meet the
- // the alignment requirements with the assumption that
- // (Traits::mr * sizeof(ResScalar)) % 16 == 0
- const Index numBlockAs = numext::mini(num_threads, m_blocks);
- MaxSizeVector<LhsScalar *> blockAs(num_threads);
- for (int i = 0; i < num_threads; i++) {
- blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
- }
-
- // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
- // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
- // Other options: (1) reuse memory when a thread finishes. con: tricky
- // (2) allocate block B memory in each thread. con: overhead
- MaxSizeVector<RhsScalar *> blockBs(n_blocks);
- for (int i = 0; i < n_blocks; i++) {
- blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
- }
-
- // lhs_notifications starts with all null Notifications
- MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
-
- // this should really be numBlockAs * n_blocks;
- const Index num_kernel_notifications = num_threads * n_blocks;
- MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
- nullptr);
-
- for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
- const Index k_start = k_block_idx * kc;
- // make sure we don't overshoot right edge of left matrix
- const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
-
- for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
- const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
-
- for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
- const Index m_start = mt_block_idx * mc;
- const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
- eigen_assert(actual_mc > 0);
-
- Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
-
- for (int i = 0; i < n_blocks; ++i) {
- Index notification_id = (blockAId * n_blocks + i);
- // Wait for any current kernels using this slot to complete
- // before using it.
- if (kernel_notifications[notification_id]) {
- wait_until_ready(kernel_notifications[notification_id]);
- delete kernel_notifications[notification_id];
- }
- kernel_notifications[notification_id] = new Notification();
- }
- const packLArg arg = {
- blockAs[blockAId], // blockA
- lhs, // lhs
- m_start, // m
- k_start, // k
- actual_mc, // mc
- actual_kc, // kc
- };
-
- // Delete any existing notification since we may be
- // replacing it. The algorithm should ensure that there are
- // no existing waiters on this notification.
- delete lhs_notifications[blockAId];
- lhs_notifications[blockAId] =
- this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
- }
-
- // now start kernels.
- const Index m_base_start = m_block_idx * mc;
- const bool need_to_pack = m_block_idx == 0;
-
- for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
- const Index n_start = n_block_idx * nc;
- const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
-
- // first make sure the previous kernels are all done before overwriting rhs. Also wait if
- // we're going to start new k. In both cases need_to_pack is true.
- if (need_to_pack) {
- for (Index i = num_blocks; i < num_threads; ++i) {
- Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
- Index future_id = (blockAId * n_blocks + n_block_idx);
- wait_until_ready(kernel_notifications[future_id]);
- }
- }
-
- packRKArg arg = {
- &blockAs, // blockA
- blockBs[n_block_idx], // blockB
- rhs, // rhs
- output, // output
- m_base_start, // m
- k_start, // k
- n_start, // n
- mc, // mc
- actual_kc, // kc
- actual_nc, // nc
- num_threads,
- numBlockAs,
- m,
- k_block_idx,
- m_block_idx,
- n_block_idx, // n_block_idx
- m_blocks, // m_blocks
- n_blocks, // n_blocks
- &kernel_notifications, // kernel notifications
- &lhs_notifications, // lhs notifications
- need_to_pack, // need_to_pack
- };
-
- // We asynchronously kick off this function, which ends up
- // notifying the appropriate kernel_notifications objects,
- // which this thread waits on before exiting.
- this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
- }
- }
- }
-
- // Make sure all the kernels are done.
- for (size_t i = 0; i < kernel_notifications.size(); ++i) {
- wait_until_ready(kernel_notifications[i]);
- delete kernel_notifications[i];
- }
-
- // No need to wait for lhs notifications since they should have
- // already been waited on. Just clean them up.
- for (size_t i = 0; i < lhs_notifications.size(); ++i) {
- delete lhs_notifications[i];
- }
-
- // deallocate all of the memory for both A and B's
- for (size_t i = 0; i < blockAs.size(); i++) {
- this->m_device.deallocate(blockAs[i]);
- }
- for (size_t i = 0; i < blockBs.size(); i++) {
- this->m_device.deallocate(blockBs[i]);
- }
-
-#undef CEIL_DIV
- }
-
- /*
- * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
- * the LHS block, check that all of the kernels that worked on the same
- * mt_block_idx in the previous m_block are done.
- */
- template <typename packLArg, typename LhsPacker>
- static void packLhs(const packLArg arg) {
- // perform actual packing
- LhsPacker pack_lhs;
- pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
- }
-
- /*
- * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
- * all kernels in the previous block are done.
- * Then for each LHS future, we wait on the future and then call GEBP
- * on the area packed by the future (which starts at
- * blockA + future_idx * mt * kc) on the LHS and with the full packed
- * RHS block.
- * The output of this GEBP is written to output(m + i * mt, n).
- */
- template <typename packRKArg, typename RhsPacker, typename GebpKernel>
- static void packRhsAndKernel(packRKArg arg) {
- if (arg.need_to_pack) {
- RhsPacker pack_rhs;
- pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
- }
-
- GebpKernel gebp;
- for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
- const Index m_base_start = arg.m + arg.mc*mt_block_idx;
- if (m_base_start < arg.max_m) {
- Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
- wait_until_ready((*arg.lhs_notifications)[blockAId]);
- const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
- gebp(arg.output.getSubMapper(m_base_start, arg.n),
- (*arg.blockAs)[blockAId], arg.blockB,
- actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
-
- // Notify that the kernel is done.
- const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
- (*arg.kernel_notifications)[set_idx]->Notify();
- }
- }
- }
-#endif // EIGEN_USE_SIMPLE_THREAD_POOL
-
TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
bool shard_by_col, bool prepacked) const {
const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
@@ -1065,6 +744,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+ // TODO(ezhulenev): Add support for output kernels and LIBXSMM.
+ static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
+ "XSMM does not support contraction output kernels.");
+
template<int Alignment>
class ContextXsmm {
public:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 182bef918..e0cbbb315 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -195,6 +195,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = true,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 84d5be173..1ec5819a7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -54,8 +54,8 @@ class IndexMapper {
}
}
- array<Index, NumDims> cudaInputDimensions;
- array<Index, NumDims> cudaOutputDimensions;
+ array<Index, NumDims> gpuInputDimensions;
+ array<Index, NumDims> gpuOutputDimensions;
array<Index, NumDims> tmp = dimensions;
array<Index, NumDims> ordering;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -65,8 +65,8 @@ class IndexMapper {
const Index index = i + offset;
ordering[index] = indices[i];
tmp[indices[i]] = -1;
- cudaInputDimensions[index] = input_dims[indices[i]];
- cudaOutputDimensions[index] = dimensions[indices[i]];
+ gpuInputDimensions[index] = input_dims[indices[i]];
+ gpuOutputDimensions[index] = dimensions[indices[i]];
}
int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -75,8 +75,8 @@ class IndexMapper {
for (int i = 0; i < NumDims; ++i) {
if (tmp[i] >= 0) {
ordering[written] = i;
- cudaInputDimensions[written] = input_dims[i];
- cudaOutputDimensions[written] = dimensions[i];
+ gpuInputDimensions[written] = input_dims[i];
+ gpuOutputDimensions[written] = dimensions[i];
++written;
}
}
@@ -89,37 +89,37 @@ class IndexMapper {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) {
- m_cudaInputStrides[i] =
- m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
- m_cudaOutputStrides[i] =
- m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
+ m_gpuInputStrides[i] =
+ m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
+ m_gpuOutputStrides[i] =
+ m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
} else {
- m_cudaInputStrides[i] = 1;
- m_cudaOutputStrides[i] = 1;
+ m_gpuInputStrides[i] = 1;
+ m_gpuOutputStrides[i] = 1;
}
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (static_cast<size_t>(i + 1) < offset) {
- m_cudaInputStrides[i] =
- m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
- m_cudaOutputStrides[i] =
- m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
+ m_gpuInputStrides[i] =
+ m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
+ m_gpuOutputStrides[i] =
+ m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
} else {
- m_cudaInputStrides[i] = 1;
- m_cudaOutputStrides[i] = 1;
+ m_gpuInputStrides[i] = 1;
+ m_gpuOutputStrides[i] = 1;
}
}
}
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) {
- const Index idx = p / m_cudaInputStrides[d];
+ const Index idx = p / m_gpuInputStrides[d];
inputIndex += idx * m_inputStrides[d];
- p -= idx * m_cudaInputStrides[d];
+ p -= idx * m_gpuInputStrides[d];
}
inputIndex += p * m_inputStrides[NumKernelDims];
} else {
@@ -128,22 +128,22 @@ class IndexMapper {
limit = NumDims - NumKernelDims - 1;
}
for (int d = 0; d < limit; ++d) {
- const Index idx = p / m_cudaInputStrides[d];
+ const Index idx = p / m_gpuInputStrides[d];
inputIndex += idx * m_inputStrides[d];
- p -= idx * m_cudaInputStrides[d];
+ p -= idx * m_gpuInputStrides[d];
}
inputIndex += p * m_inputStrides[limit];
}
return inputIndex;
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
Index outputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) {
- const Index idx = p / m_cudaOutputStrides[d];
+ const Index idx = p / m_gpuOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
- p -= idx * m_cudaOutputStrides[d];
+ p -= idx * m_gpuOutputStrides[d];
}
outputIndex += p * m_outputStrides[NumKernelDims];
} else {
@@ -152,44 +152,44 @@ class IndexMapper {
limit = NumDims - NumKernelDims - 1;
}
for (int d = 0; d < limit; ++d) {
- const Index idx = p / m_cudaOutputStrides[d];
+ const Index idx = p / m_gpuOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
- p -= idx * m_cudaOutputStrides[d];
+ p -= idx * m_gpuOutputStrides[d];
}
outputIndex += p * m_outputStrides[limit];
}
return outputIndex;
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
@@ -197,7 +197,7 @@ class IndexMapper {
k * m_inputStrides[offset + 2];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
@@ -209,8 +209,8 @@ class IndexMapper {
static const int NumDims = internal::array_size<InputDims>::value;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_outputStrides;
- array<Index, NumDims> m_cudaInputStrides;
- array<Index, NumDims> m_cudaOutputStrides;
+ array<Index, NumDims> m_gpuInputStrides;
+ array<Index, NumDims> m_gpuOutputStrides;
};
@@ -307,6 +307,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -553,7 +554,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
template <int StaticKernelSize>
struct GetKernelSize {
@@ -576,7 +577,11 @@ __global__ void EigenConvolutionKernel1D(
indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) {
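+// On HIP, dynamic shared memory is declared via the HIP_DYNAMIC_SHARED
+// macro instead of the CUDA-style extern __shared__ array.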
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
extern __shared__ float s[];
+#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -588,18 +593,18 @@ __global__ void EigenConvolutionKernel1D(
for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
// Load inputs to shared memory
- const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.y * num_x_input;
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
}
__syncthreads();
// Compute the convolution
- const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
@@ -609,7 +614,7 @@ __global__ void EigenConvolutionKernel1D(
for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
result += s[k + kernel_offset] * kernel[k];
}
- const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
buffer[tensor_index] = result;
}
__syncthreads();
@@ -625,7 +630,11 @@ __global__ void EigenConvolutionKernel2D(
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) {
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
extern __shared__ float s[];
+#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -642,7 +651,7 @@ __global__ void EigenConvolutionKernel2D(
for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
- const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.z * num_y_input;
// Load inputs to shared memory
@@ -651,7 +660,7 @@ __global__ void EigenConvolutionKernel2D(
const int input_offset = num_x_input * (j + plane_kernel_offset);
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
s[i + input_offset] = eval.coeff(tensor_index);
}
}
@@ -659,7 +668,7 @@ __global__ void EigenConvolutionKernel2D(
__syncthreads();
// Convolution
- const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -675,7 +684,7 @@ __global__ void EigenConvolutionKernel2D(
result += s[k + input_offset] * kernel[k + kernel_offset];
}
}
- const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
buffer[tensor_index] = result;
}
}
@@ -693,7 +702,11 @@ __global__ void EigenConvolutionKernel3D(
const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) {
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
extern __shared__ float s[];
+#endif
// Load inputs to shared memory
const int first_x = blockIdx.x * maxX;
@@ -710,13 +723,13 @@ __global__ void EigenConvolutionKernel3D(
for (int p = 0; p < numPlanes; ++p) {
- const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0;
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
}
}
@@ -728,7 +741,7 @@ __global__ void EigenConvolutionKernel3D(
const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1;
- const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -741,7 +754,7 @@ __global__ void EigenConvolutionKernel3D(
}
}
}
- const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
buffer[tensor_index] = result;
}
}
@@ -766,6 +779,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
+ BlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -854,9 +868,9 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
const int maxSharedMem = m_device.sharedMemPerBlock();
- const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
- const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
- const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
+ const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
+ const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
+ const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
const int warpSize = 32;
switch (NumKernelDims) {
@@ -891,7 +905,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
+ gpu_assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
@@ -908,15 +922,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_inputImpl.dimensions(), kernel_dims, indices);
switch(kernel_size) {
case 4: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
break;
}
case 7: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
break;
}
default: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
}
}
break;
@@ -948,7 +962,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
+ gpu_assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int num_y_blocks = ceil(numY, maxY);
@@ -969,11 +983,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
case 4: {
switch (kernel_size_y) {
case 7: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
break;
}
default: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
break;
}
}
@@ -982,18 +996,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
case 7: {
switch (kernel_size_y) {
case 4: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
break;
}
default: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
break;
}
}
break;
}
default: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
break;
}
}
@@ -1028,7 +1042,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
+ gpu_assert(shared_mem <= maxSharedMem);
//cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
@@ -1039,7 +1053,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices);
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
break;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index da88bcb3b..d301d0c01 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -242,6 +242,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
+ BlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -352,7 +353,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+ gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
@@ -377,7 +378,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+ gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
@@ -408,7 +409,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1);
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+ gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index b148dae39..bb63baee2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -195,7 +195,7 @@ class TensorCostModel {
// 11 is L2 cache latency on Haswell.
// We don't know whether data is in L1, L2 or L3. But we are most interested
// in single-threaded computational time around 100us-10ms (smaller time
- // is too small for parallelization, larger time is not intersting
+ // is too small for parallelization, larger time is not interesting
// either because we are probably using all available threads already).
// And for the target time range, L2 seems to be what matters. Data set
// fitting into L1 is too small to take noticeable time. Data set fitting
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index ded7129da..f77923933 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -1,340 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
-
-namespace Eigen {
-
-static const int kCudaScratchSize = 1024;
-
-// This defines an interface that GPUDevice can take to use
-// CUDA streams underneath.
-class StreamInterface {
- public:
- virtual ~StreamInterface() {}
-
- virtual const cudaStream_t& stream() const = 0;
- virtual const cudaDeviceProp& deviceProperties() const = 0;
-
- // Allocate memory on the actual device where the computation will run
- virtual void* allocate(size_t num_bytes) const = 0;
- virtual void deallocate(void* buffer) const = 0;
-
- // Return a scratchpad buffer of size 1k
- virtual void* scratchpad() const = 0;
-
- // Return a semaphore. The semaphore is initially initialized to 0, and
- // each kernel using it is responsible for resetting to 0 upon completion
- // to maintain the invariant that the semaphore is always equal to 0 upon
- // each kernel start.
- virtual unsigned int* semaphore() const = 0;
-};
-
-static cudaDeviceProp* m_deviceProperties;
-static bool m_devicePropInitialized = false;
-
-static void initializeDeviceProp() {
- if (!m_devicePropInitialized) {
- // Attempts to ensure proper behavior in the case of multiple threads
- // calling this function simultaneously. This would be trivial to
- // implement if we could use std::mutex, but unfortunately mutex don't
- // compile with nvcc, so we resort to atomics and thread fences instead.
- // Note that if the caller uses a compiler that doesn't support c++11 we
- // can't ensure that the initialization is thread safe.
-#if __cplusplus >= 201103L
- static std::atomic<bool> first(true);
- if (first.exchange(false)) {
-#else
- static bool first = true;
- if (first) {
- first = false;
-#endif
- // We're the first thread to reach this point.
- int num_devices;
- cudaError_t status = cudaGetDeviceCount(&num_devices);
- if (status != cudaSuccess) {
- std::cerr << "Failed to get the number of CUDA devices: "
- << cudaGetErrorString(status)
- << std::endl;
- assert(status == cudaSuccess);
- }
- m_deviceProperties = new cudaDeviceProp[num_devices];
- for (int i = 0; i < num_devices; ++i) {
- status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
- if (status != cudaSuccess) {
- std::cerr << "Failed to initialize CUDA device #"
- << i
- << ": "
- << cudaGetErrorString(status)
- << std::endl;
- assert(status == cudaSuccess);
- }
- }
-
-#if __cplusplus >= 201103L
- std::atomic_thread_fence(std::memory_order_release);
-#endif
- m_devicePropInitialized = true;
- } else {
- // Wait for the other thread to inititialize the properties.
- while (!m_devicePropInitialized) {
-#if __cplusplus >= 201103L
- std::atomic_thread_fence(std::memory_order_acquire);
-#endif
- EIGEN_SLEEP(1000);
- }
- }
- }
-}
-
-static const cudaStream_t default_stream = cudaStreamDefault;
-
-class CudaStreamDevice : public StreamInterface {
- public:
- // Use the default stream on the current device
- CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
- cudaGetDevice(&device_);
- initializeDeviceProp();
- }
- // Use the default stream on the specified device
- CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
- initializeDeviceProp();
- }
- // Use the specified stream. Note that it's the
- // caller responsibility to ensure that the stream can run on
- // the specified device. If no device is specified the code
- // assumes that the stream is associated to the current gpu device.
- CudaStreamDevice(const cudaStream_t* stream, int device = -1)
- : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
- if (device < 0) {
- cudaGetDevice(&device_);
- } else {
- int num_devices;
- cudaError_t err = cudaGetDeviceCount(&num_devices);
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
- assert(device < num_devices);
- device_ = device;
- }
- initializeDeviceProp();
- }
-
- virtual ~CudaStreamDevice() {
- if (scratch_) {
- deallocate(scratch_);
- }
- }
-
- const cudaStream_t& stream() const { return *stream_; }
- const cudaDeviceProp& deviceProperties() const {
- return m_deviceProperties[device_];
- }
- virtual void* allocate(size_t num_bytes) const {
- cudaError_t err = cudaSetDevice(device_);
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
- void* result;
- err = cudaMalloc(&result, num_bytes);
- assert(err == cudaSuccess);
- assert(result != NULL);
- return result;
- }
- virtual void deallocate(void* buffer) const {
- cudaError_t err = cudaSetDevice(device_);
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
- assert(buffer != NULL);
- err = cudaFree(buffer);
- assert(err == cudaSuccess);
- }
-
- virtual void* scratchpad() const {
- if (scratch_ == NULL) {
- scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
- }
- return scratch_;
- }
-
- virtual unsigned int* semaphore() const {
- if (semaphore_ == NULL) {
- char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
- semaphore_ = reinterpret_cast<unsigned int*>(scratch);
- cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
- }
- return semaphore_;
- }
-
- private:
- const cudaStream_t* stream_;
- int device_;
- mutable void* scratch_;
- mutable unsigned int* semaphore_;
-};
-
-struct GpuDevice {
- // The StreamInterface is not owned: the caller is
- // responsible for its initialization and eventual destruction.
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
- eigen_assert(stream);
- }
- explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
- eigen_assert(stream);
- }
- // TODO(bsteiner): This is an internal API, we should not expose it.
- EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
- return stream_->stream();
- }
-
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- return stream_->allocate(num_bytes);
- }
-
- EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
- stream_->deallocate(buffer);
- }
-
- EIGEN_STRONG_INLINE void* scratchpad() const {
- return stream_->scratchpad();
- }
-
- EIGEN_STRONG_INLINE unsigned int* semaphore() const {
- return stream_->semaphore();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifndef EIGEN_CUDA_ARCH
- cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
- stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
-#else
- EIGEN_UNUSED_VARIABLE(dst);
- EIGEN_UNUSED_VARIABLE(src);
- EIGEN_UNUSED_VARIABLE(n);
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
- }
-
- EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
- cudaError_t err =
- cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
- }
-
- EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
- cudaError_t err =
- cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-#ifndef EIGEN_CUDA_ARCH
- cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- assert(err == cudaSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
- }
-
- EIGEN_STRONG_INLINE size_t numThreads() const {
- // FIXME
- return 32;
- }
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
- // FIXME
- return 48*1024;
- }
-
- EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
- // We won't try to take advantage of the l2 cache for the time being, and
- // there is no l3 cache on cuda devices.
- return firstLevelCacheSize();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH)
- cudaError_t err = cudaStreamSynchronize(stream_->stream());
- if (err != cudaSuccess) {
- std::cerr << "Error detected in CUDA stream: "
- << cudaGetErrorString(err)
- << std::endl;
- assert(err == cudaSuccess);
- }
-#else
- assert(false && "The default device should be used instead to generate kernel code");
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
#endif
- }
-
- EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
- return stream_->deviceProperties().multiProcessorCount;
- }
- EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
- return stream_->deviceProperties().maxThreadsPerBlock;
- }
- EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
- return stream_->deviceProperties().maxThreadsPerMultiProcessor;
- }
- EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
- return stream_->deviceProperties().sharedMemPerBlock;
- }
- EIGEN_STRONG_INLINE int majorDeviceVersion() const {
- return stream_->deviceProperties().major;
- }
- EIGEN_STRONG_INLINE int minorDeviceVersion() const {
- return stream_->deviceProperties().minor;
- }
-
- EIGEN_STRONG_INLINE int maxBlocks() const {
- return max_blocks_;
- }
-
- // This function checks if the CUDA runtime recorded an error for the
- // underlying stream device.
- inline bool ok() const {
-#ifdef EIGEN_CUDACC
- cudaError_t error = cudaStreamQuery(stream_->stream());
- return (error == cudaSuccess) || (error == cudaErrorNotReady);
-#else
- return false;
-#endif
- }
-
- private:
- const StreamInterface* stream_;
- int max_blocks_;
-};
-
-#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
- (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
- assert(cudaGetLastError() == cudaSuccess);
-
-
-// FIXME: Should be device and kernel specific.
-#ifdef EIGEN_CUDACC
-static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
-#ifndef EIGEN_CUDA_ARCH
- cudaError_t status = cudaDeviceSetSharedMemConfig(config);
- EIGEN_UNUSED_VARIABLE(status)
- assert(status == cudaSuccess);
-#else
- EIGEN_UNUSED_VARIABLE(config)
-#endif
-}
-#endif
-
-} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
+#include "TensorDeviceGpu.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 341889e88..5c1c68912 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -35,9 +35,12 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#ifndef EIGEN_CUDA_ARCH
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
// Running on the host CPU
return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ return 64;
#else
// Running on a CUDA device
return 32;
@@ -45,9 +48,12 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
// Running on the host CPU
return l1CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ return 48*1024; // FIXME : update this number for HIP
#else
// Running on a CUDA device, return the amount of shared memory available.
return 48*1024;
@@ -55,9 +61,12 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
// Running single threaded on the host CPU
return l3CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ return firstLevelCacheSize(); // FIXME : update this number for HIP
#else
// Running on a CUDA device
return firstLevelCacheSize();
@@ -65,10 +74,14 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef EIGEN_CUDA_ARCH
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ // return 1 as major for HIP
+ return 1;
#else
// Running on a CUDA device
return EIGEN_CUDA_ARCH / 100;
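The hunks above all follow one dispatch pattern: EIGEN_GPU_COMPILE_PHASE marks device-side compilation for either backend, and EIGEN_HIP_DEVICE_COMPILE distinguishes the HIP device pass. A minimal sketch of that three-way split, reusing the warp/wavefront widths that numThreads() returns above:

int device_warp_width() {
#if !defined(EIGEN_GPU_COMPILE_PHASE)
  return 1;   // host CPU: scalar execution
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
  return 64;  // AMD wavefront width
#else
  return 32;  // NVIDIA warp width
#endif
}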
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
new file mode 100644
index 000000000..0c036833f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
+
+// This header file contains the definitions of the gpu* macros, which will
+// resolve to their equivalent hip* or cuda* versions depending on the
+// compiler in use. A separate header (included at the end of this file)
+// will undefine them all.
+#include "TensorGpuHipCudaDefines.h"
+
+namespace Eigen {
+
+static const int kGpuScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP / CUDA streams underneath.
+class StreamInterface {
+ public:
+ virtual ~StreamInterface() {}
+
+ virtual const gpuStream_t& stream() const = 0;
+ virtual const gpuDeviceProp_t& deviceProperties() const = 0;
+
+ // Allocate memory on the actual device where the computation will run
+ virtual void* allocate(size_t num_bytes) const = 0;
+ virtual void deallocate(void* buffer) const = 0;
+
+ // Return a scratchpad buffer of size 1k
+ virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initialized to 0, and each kernel
+  // using it is responsible for resetting it to 0 upon completion, to
+  // maintain the invariant that the semaphore always equals 0 at each
+  // kernel start.
+ virtual unsigned int* semaphore() const = 0;
+};
+
+static gpuDeviceProp_t* m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+ if (!m_devicePropInitialized) {
+    // Attempts to ensure proper behavior in the case of multiple threads
+    // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately std::mutex
+    // doesn't compile with nvcc, so we resort to atomics and thread fences
+    // instead. Note that if the caller uses a compiler that doesn't support
+    // C++11 we can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+ static std::atomic<bool> first(true);
+ if (first.exchange(false)) {
+#else
+ static bool first = true;
+ if (first) {
+ first = false;
+#endif
+ // We're the first thread to reach this point.
+ int num_devices;
+ gpuError_t status = gpuGetDeviceCount(&num_devices);
+ if (status != gpuSuccess) {
+ std::cerr << "Failed to get the number of GPU devices: "
+ << gpuGetErrorString(status)
+ << std::endl;
+ gpu_assert(status == gpuSuccess);
+ }
+ m_deviceProperties = new gpuDeviceProp_t[num_devices];
+ for (int i = 0; i < num_devices; ++i) {
+ status = gpuGetDeviceProperties(&m_deviceProperties[i], i);
+ if (status != gpuSuccess) {
+ std::cerr << "Failed to initialize GPU device #"
+ << i
+ << ": "
+ << gpuGetErrorString(status)
+ << std::endl;
+ gpu_assert(status == gpuSuccess);
+ }
+ }
+
+#if __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_release);
+#endif
+ m_devicePropInitialized = true;
+ } else {
+      // Wait for the other thread to initialize the properties.
+ while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+ EIGEN_SLEEP(1000);
+ }
+ }
+ }
+}
+
+static const gpuStream_t default_stream = gpuStreamDefault;
+
+class GpuStreamDevice : public StreamInterface {
+ public:
+ // Use the default stream on the current device
+ GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+ gpuGetDevice(&device_);
+ initializeDeviceProp();
+ }
+ // Use the default stream on the specified device
+ GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+ initializeDeviceProp();
+ }
+  // Use the specified stream. Note that it is the caller's
+  // responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified, the code
+  // assumes that the stream is associated with the current GPU device.
+ GpuStreamDevice(const gpuStream_t* stream, int device = -1)
+ : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+ if (device < 0) {
+ gpuGetDevice(&device_);
+ } else {
+ int num_devices;
+ gpuError_t err = gpuGetDeviceCount(&num_devices);
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ gpu_assert(device < num_devices);
+ device_ = device;
+ }
+ initializeDeviceProp();
+ }
+
+ virtual ~GpuStreamDevice() {
+ if (scratch_) {
+ deallocate(scratch_);
+ }
+ }
+
+ const gpuStream_t& stream() const { return *stream_; }
+ const gpuDeviceProp_t& deviceProperties() const {
+ return m_deviceProperties[device_];
+ }
+ virtual void* allocate(size_t num_bytes) const {
+ gpuError_t err = gpuSetDevice(device_);
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ void* result;
+ err = gpuMalloc(&result, num_bytes);
+ gpu_assert(err == gpuSuccess);
+ gpu_assert(result != NULL);
+ return result;
+ }
+ virtual void deallocate(void* buffer) const {
+ gpuError_t err = gpuSetDevice(device_);
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ gpu_assert(buffer != NULL);
+ err = gpuFree(buffer);
+ gpu_assert(err == gpuSuccess);
+ }
+
+ virtual void* scratchpad() const {
+ if (scratch_ == NULL) {
+ scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
+ }
+ return scratch_;
+ }
+
+ virtual unsigned int* semaphore() const {
+ if (semaphore_ == NULL) {
+ char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
+ semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+ gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ }
+ return semaphore_;
+ }
+
+ private:
+ const gpuStream_t* stream_;
+ int device_;
+ mutable void* scratch_;
+ mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+ // The StreamInterface is not owned: the caller is
+ // responsible for its initialization and eventual destruction.
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+ eigen_assert(stream);
+ }
+ explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+ eigen_assert(stream);
+ }
+ // TODO(bsteiner): This is an internal API, we should not expose it.
+ EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
+ return stream_->stream();
+ }
+
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return stream_->allocate(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ stream_->deallocate(buffer);
+ }
+
+ EIGEN_STRONG_INLINE void* scratchpad() const {
+ return stream_->scratchpad();
+ }
+
+ EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+ return stream_->semaphore();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+ gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
+ stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(dst);
+ EIGEN_UNUSED_VARIABLE(src);
+ EIGEN_UNUSED_VARIABLE(n);
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+ gpuError_t err =
+ gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ }
+
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+ gpuError_t err =
+ gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+ gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ // FIXME
+ return 32;
+ }
+
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ // FIXME
+ return 48*1024;
+ }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the L2 cache for the time being, and
+    // there is no L3 cache on HIP/CUDA devices.
+ return firstLevelCacheSize();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+ gpuError_t err = gpuStreamSynchronize(stream_->stream());
+ if (err != gpuSuccess) {
+ std::cerr << "Error detected in GPU stream: "
+ << gpuGetErrorString(err)
+ << std::endl;
+ gpu_assert(err == gpuSuccess);
+ }
+#else
+ gpu_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
+ return stream_->deviceProperties().multiProcessorCount;
+ }
+ EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
+ return stream_->deviceProperties().maxThreadsPerBlock;
+ }
+ EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
+ return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+ }
+ EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+ return stream_->deviceProperties().sharedMemPerBlock;
+ }
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return stream_->deviceProperties().major;
+ }
+ EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+ return stream_->deviceProperties().minor;
+ }
+
+ EIGEN_STRONG_INLINE int maxBlocks() const {
+ return max_blocks_;
+ }
+
+ // This function checks if the GPU runtime recorded an error for the
+ // underlying stream device.
+ inline bool ok() const {
+#ifdef EIGEN_GPUCC
+ gpuError_t error = gpuStreamQuery(stream_->stream());
+ return (error == gpuSuccess) || (error == gpuErrorNotReady);
+#else
+ return false;
+#endif
+ }
+
+ private:
+ const StreamInterface* stream_;
+ int max_blocks_;
+};
+
+#if defined(EIGEN_HIPCC)
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
+ gpu_assert(hipGetLastError() == hipSuccess);
+
+#else
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+ gpu_assert(cudaGetLastError() == cudaSuccess);
+
+#endif
+
+// FIXME: Should be device and kernel specific.
+#ifdef EIGEN_GPUCC
+static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+ gpuError_t status = gpuDeviceSetSharedMemConfig(config);
+ EIGEN_UNUSED_VARIABLE(status)
+ gpu_assert(status == gpuSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+} // end namespace Eigen
+
+// undefine all the gpu* macros we defined at the beginning of the file
+#include "TensorGpuHipCudaUndefines.h"
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
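initializeDeviceProp() above implements a lock-free call-once because std::mutex has historically not compiled under nvcc. A self-contained sketch of the same release/acquire protocol; unlike the original, which polls a plain static bool, this sketch makes the published flag a std::atomic<bool>, and std::this_thread::yield() stands in for EIGEN_SLEEP(1000):

#include <atomic>
#include <thread>

static int* g_table = nullptr;            // lazily built shared data
static std::atomic<bool> g_ready(false);  // publication flag

void init_once() {
  static std::atomic<bool> first(true);
  if (first.exchange(false)) {
    // Sole winner of the exchange: build the data, then publish it with a
    // release fence before flipping the flag the other threads poll.
    g_table = new int[256]();
    std::atomic_thread_fence(std::memory_order_release);
    g_ready.store(true, std::memory_order_relaxed);
  } else {
    // Everyone else waits until the winner publishes; the acquire fence
    // makes the writes to g_table visible once g_ready reads true.
    while (!g_ready.load(std::memory_order_relaxed)) {
      std::atomic_thread_fence(std::memory_order_acquire);
      std::this_thread::yield();
    }
  }
}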
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 6158acbd9..e7beb2c82 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -286,7 +286,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
tileSize =static_cast<Index>(m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
auto s= m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+ if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
}
rng = n;
@@ -303,7 +303,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+ if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
}
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
@@ -331,7 +331,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
template<typename Index>
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+ if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
}
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
@@ -377,7 +377,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL doesnot have such concept
+    // OpenCL doesn't have such a concept
return 2;
}
@@ -519,7 +519,7 @@ struct SyclDevice {
return m_queue_stream->maxSyclThreadsPerBlock();
}
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL doesnot have such concept
+    // OpenCL doesn't have such a concept
return m_queue_stream->maxSyclThreadsPerMultiProcessor();
// return stream_->deviceProperties().maxThreadsPerMultiProcessor;
}
@@ -544,7 +544,7 @@ struct SyclDevice {
};
// This is used as a distinguishable device inside the kernel as the sycl device class is not standard layout.
// This is internal and must not be used by users. This dummy device allows us to specialise the tensor evaluator
-// inside the kenrel. So we can have two types of eval for host and device. This is required for TensorArgMax operation
+// inside the kernel. So we can have two types of eval for host and device. This is required for TensorArgMax operation
struct SyclKernelDevice:DefaultDevice{};
} // end namespace Eigen
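The work-group sizing logic repeated throughout this file reduces to: cap at 256 on CPU OpenCL devices (Intel's runtime rejects the maximum work-group size there), then round down to a power of two so the size can be split evenly across dimensions. A minimal sketch under those assumptions:

#include <algorithm>
#include <cmath>
#include <cstdint>

std::int64_t pick_workgroup_size(std::int64_t max_wg_size, bool is_cpu_device) {
  if (is_cpu_device) {
    // Intel's CPU runtime does not allow using the maximum work-group size.
    max_wg_size = std::min<std::int64_t>(256, max_wg_size);
  }
  // Round down to a power of two, as the hunks above do via std::log2.
  const std::int64_t pow_of_2 =
      static_cast<std::int64_t>(std::log2(static_cast<double>(max_wg_size)));
  return std::int64_t(1) << pow_of_2;  // e.g. 1000 -> 512, 256 -> 256
}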
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index ec6802e85..90fd99027 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -169,7 +169,7 @@ struct ThreadPoolDevice {
// parallelFor executes f with [0, n) arguments in parallel and waits for
// completion. F accepts a half-open interval [first, last).
- // Block size is choosen based on the iteration cost and resulting parallel
+ // Block size is chosen based on the iteration cost and resulting parallel
// efficiency. If block_align is not nullptr, it is called to round up the
// block size.
void parallelFor(Index n, const TensorOpCost& cost,
@@ -189,9 +189,11 @@ struct ThreadPoolDevice {
// of blocks to be evenly dividable across threads.
double block_size_f = 1.0 / CostModel::taskSize(1, cost);
- Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
- const Index max_block_size =
- numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+ const Index max_oversharding_factor = 4;
+ Index block_size = numext::mini(
+ n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
+ block_size_f));
+ const Index max_block_size = numext::mini(n, 2 * block_size);
if (block_align) {
Index new_block_size = block_align(block_size);
eigen_assert(new_block_size >= block_size);
@@ -205,7 +207,8 @@ struct ThreadPoolDevice {
(divup<int>(block_count, numThreads()) * numThreads());
// Now try to increase block size up to max_block_size as long as it
// doesn't decrease parallel efficiency.
- for (Index prev_block_count = block_count; prev_block_count > 1;) {
+ for (Index prev_block_count = block_count;
+ max_efficiency < 1.0 && prev_block_count > 1;) {
// This is the next block size that divides size into a smaller number
// of blocks than the current block_size.
Index coarser_block_size = divup(n, prev_block_count - 1);
@@ -261,6 +264,9 @@ struct ThreadPoolDevice {
parallelFor(n, cost, nullptr, std::move(f));
}
+ // Thread pool accessor.
+ ThreadPoolInterface* getPool() const { return pool_; }
+
private:
ThreadPoolInterface* pool_;
int num_threads_;
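The parallelFor change above adds a floor on the block count, so a mispredicted per-coefficient cost can no longer collapse the work into too few blocks. A sketch of the new clamp, treating divup and the cost model's taskSize as given:

#include <algorithm>
#include <cstdint>

using Index = std::int64_t;

Index divup(Index x, Index y) { return (x + y - 1) / y; }

// block_size_f is 1 / taskSize(1, cost), i.e. the cost model's preferred
// block size; the patch adds the divup(...) floor so that at least
// max_oversharding_factor * num_threads blocks always remain.
Index initial_block_size(Index n, double block_size_f, Index num_threads) {
  const Index max_oversharding_factor = 4;
  return std::min(n, std::max<Index>(
                         divup(n, max_oversharding_factor * num_threads),
                         static_cast<Index>(block_size_f)));
}
// The upper bound then becomes max_block_size = min(n, 2 * block_size).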
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 86405e69b..5ca47cca7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -284,6 +284,28 @@ struct DSizes : array<DenseIndex, NumDims> {
(*this)[0] = i0;
}
+ EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = a[i];
+ }
+ }
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+ template <typename std::ptrdiff_t... Indices>
+ EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = a[i];
+ }
+ }
+#else
+ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+ EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = a[i];
+ }
+ }
+#endif
+
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index d0c027890..af39daa91 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -107,6 +107,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 2264be391..f9a1bd68c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -41,11 +41,22 @@ struct TensorEvaluator
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+ BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
+ typedef typename internal::TensorBlock<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlock;
+ typedef typename internal::TensorBlockReader<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlockReader;
+ typedef typename internal::TensorBlockWriter<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlockWriter;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
{ }
@@ -55,7 +66,7 @@ struct TensorEvaluator
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
- if (dest) {
+ if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
return false;
}
@@ -113,6 +124,20 @@ struct TensorEvaluator
internal::unpacket_traits<PacketReturnType>::size);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+ assert(m_data != NULL);
+ TensorBlockReader::Run(block, m_data);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlock& block) {
+ assert(m_data != NULL);
+ TensorBlockWriter::Run(block, m_data);
+ }
+
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
/// required by sycl in order to construct sycl buffer from raw pointer
@@ -167,11 +192,19 @@ struct TensorEvaluator<const Derived, Device>
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+ BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
+ typedef typename internal::TensorBlock<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlock;
+ typedef typename internal::TensorBlockReader<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlockReader;
+
// Used for accessor extraction in SYCL Managed TensorMap:
const Derived& derived() const { return m_impl; }
@@ -219,6 +252,14 @@ struct TensorEvaluator<const Derived, Device>
internal::unpacket_traits<PacketReturnType>::size);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+ assert(m_data != NULL);
+ TensorBlockReader::Run(block, m_data);
+ }
+
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
/// added for sycl in order to construct the buffer from the sycl device
@@ -244,6 +285,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -308,7 +350,9 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::functor_traits<UnaryOp>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -375,16 +419,21 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
enum {
- IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
+ IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
+ TensorEvaluator<RightArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
+ TensorEvaluator<RightArgType, Device>::PacketAccess &
internal::functor_traits<BinaryOp>::PacketAccess,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
+ TensorEvaluator<RightArgType, Device>::BlockAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
- : m_functor(op.functor()),
+ : m_device(device),
+ m_functor(op.functor()),
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
@@ -399,6 +448,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+ static const int NumDims = internal::array_size<
+ typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+
+ typedef internal::TensorBlock<
+ typename internal::remove_const<Scalar>::type, Index, NumDims,
+ TensorEvaluator<LeftArgType, Device>::Layout>
+ TensorBlock;
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use right impl instead if right impl dimensions are known at compile time.
@@ -433,6 +490,30 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_leftImpl.getResourceRequirements(resources);
+ m_rightImpl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ TensorBlock* output_block) const {
+ if (NumDims <= 0) {
+ output_block->data()[0] = coeff(0);
+ return;
+ }
+ internal::TensorBlockView<LeftArgType, Device> left_block(
+ m_device, m_leftImpl, *output_block);
+ internal::TensorBlockView<RightArgType, Device> right_block(
+ m_device, m_rightImpl, *output_block);
+ internal::TensorBlockCwiseBinaryIO<
+ BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
+ Layout>::Run(m_functor, output_block->block_sizes(),
+ output_block->block_strides(), output_block->data(),
+ left_block.block_strides(), left_block.data(),
+ right_block.block_strides(), right_block.data());
+ }
+
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
@@ -442,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
BinaryOp functor() const { return m_functor; }
private:
+ const Device& m_device;
const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -458,6 +540,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
internal::functor_traits<TernaryOp>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -562,6 +645,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
internal::packet_traits<Scalar>::HasBlend,
+ BlockAccess = false,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
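The evalSubExprsIfNeeded change above gates the bulk memcpy on NumTraits<...>::RequireInitialization, since raw byte copies are only valid for types that need no construction. A standalone sketch of the same guard, using trivial copyability as a stand-in predicate:

#include <cstddef>
#include <cstring>
#include <type_traits>

// Returns true when the destination was filled by a bulk copy, so the caller
// can skip per-coefficient evaluation -- mirroring evalSubExprsIfNeeded
// returning false.
template <typename Scalar>
bool try_bulk_copy(Scalar* dest, const Scalar* src, std::size_t count) {
  if (std::is_trivially_copyable<Scalar>::value && dest != nullptr) {
    std::memcpy(dest, src, count * sizeof(Scalar));
    return true;
  }
  return false;  // fall back to element-wise assignment
}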
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 0ffe68ab3..ac5afd891 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -12,31 +12,40 @@
namespace Eigen {
-/** \class TensorExecutor
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor executor class.
- *
- * This class is responsible for launch the evaluation of the expression on
- * the specified computing device.
- */
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launching the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc. registers and
+ *                      instructions)
+ * @tparam Tileable can use block-based tensor evaluation
+ *                      (see TensorBlock.h)
+ */
namespace internal {
-// Default strategy: the expression is evaluated with a single cpu thread.
-template<typename Expression, typename Device, bool Vectorizable>
-class TensorExecutor
-{
+/**
+ * Default strategy: the expression is evaluated sequentially with a single CPU
+ * thread, without vectorization or block evaluation.
+ */
+template <typename Expression, typename Device, bool Vectorizable,
+ bool Tileable>
+class TensorExecutor {
public:
- typedef typename Expression::Index Index;
+ using StorageIndex = typename Expression::Index;
+
EIGEN_DEVICE_FUNC
- static inline void run(const Expression& expr, const Device& device = Device())
- {
+ static inline void run(const Expression& expr,
+ const Device& device = Device()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const Index size = array_prod(evaluator.dimensions());
- for (Index i = 0; i < size; ++i) {
+ if (needs_assign) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ for (StorageIndex i = 0; i < size; ++i) {
evaluator.evalScalar(i);
}
}
@@ -44,35 +53,40 @@ class TensorExecutor
}
};
-
-template<typename Expression>
-class TensorExecutor<Expression, DefaultDevice, true>
-{
+/**
+ * Process all the data with a single CPU thread, using vectorized instructions.
+ */
+template <typename Expression>
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
+ /*Tileable*/ false> {
public:
- typedef typename Expression::Index Index;
+ using StorageIndex = typename Expression::Index;
+
EIGEN_DEVICE_FUNC
- static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
- {
+ static inline void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const Index size = array_prod(evaluator.dimensions());
- const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
- // Give the compiler a strong hint to unroll the loop. But don't insist
- // on unrolling, because if the function is expensive the compiler should not
+ if (needs_assign) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ const int PacketSize = unpacket_traits<typename TensorEvaluator<
+ Expression, DefaultDevice>::PacketReturnType>::size;
+
+      // Give the compiler a strong hint to unroll the loop, but don't insist
+      // on unrolling: if the function is expensive, the compiler should not
// unroll the loop at the expense of inlining.
- const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
- for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
- for (Index j = 0; j < 4; j++) {
+ const StorageIndex UnrolledSize =
+ (size / (4 * PacketSize)) * 4 * PacketSize;
+ for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
- const Index VectorizedSize = (size / PacketSize) * PacketSize;
- for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+ const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+ for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
- for (Index i = VectorizedSize; i < size; ++i) {
+ for (StorageIndex i = VectorizedSize; i < size; ++i) {
evaluator.evalScalar(i);
}
}
@@ -80,41 +94,107 @@ class TensorExecutor<Expression, DefaultDevice, true>
}
};
+/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ /*Tileable*/ true> {
+ public:
+ using Scalar = typename traits<Expression>::Scalar;
+ using ScalarNoConst = typename remove_const<Scalar>::type;
+
+ using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+ using StorageIndex = typename traits<Expression>::Index;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
+ using TensorBlock =
+ TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+ using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
+ NumDims, Evaluator::Layout>;
+ Evaluator evaluator(expr, device);
+ Index total_size = array_prod(evaluator.dimensions());
+ Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+ if (total_size < cache_size) {
+ // TODO(andydavis) Reduce block management overhead for small tensors.
+ // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
+ internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ /*Tileable*/ false>::run(expr, device);
+ return;
+ }
-// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign) {
+ // Size tensor blocks to fit in cache (or requested target block size).
+ Index block_total_size = numext::mini(cache_size, total_size);
+ TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
+ // Query expression tree for desired block size/shape.
+ std::vector<TensorOpResourceRequirements> resources;
+ evaluator.getResourceRequirements(&resources);
+ MergeResourceRequirements(resources, &block_shape, &block_total_size);
+
+ TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
+ block_total_size);
+ block_total_size = block_mapper.block_dims_total_size();
+
+ Scalar* data = static_cast<Scalar*>(
+ device.allocate(block_total_size * sizeof(Scalar)));
+
+ const StorageIndex total_block_count = block_mapper.total_block_count();
+ for (StorageIndex i = 0; i < total_block_count; ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, data);
+ evaluator.evalBlock(&block);
+ }
+ device.deallocate(data);
+ }
+ evaluator.cleanup();
+ }
+};
+
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ */
#ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
- static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ static void run(Evaluator* evaluator_in, const StorageIndex first,
+ const StorageIndex last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
- for (Index i = first; i < last; ++i) {
+ for (StorageIndex i = first; i < last; ++i) {
evaluator.evalScalar(i);
}
}
- static Index alignBlockSize(Index size) {
- return size;
- }
+ static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, true> {
- static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+ static const int PacketSize =
+ unpacket_traits<typename Evaluator::PacketReturnType>::size;
- static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ static void run(Evaluator* evaluator_in, const StorageIndex first,
+ const StorageIndex last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
- Index i = first;
+ StorageIndex i = first;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
- Index last_chunk_offset = last - 4 * PacketSize;
- // Give the compiler a strong hint to unroll the loop. But don't insist
- // on unrolling, because if the function is expensive the compiler should not
+ StorageIndex last_chunk_offset = last - 4 * PacketSize;
+      // Give the compiler a strong hint to unroll the loop, but don't insist
+      // on unrolling: if the function is expensive, the compiler should not
// unroll the loop at the expense of inlining.
- for (; i <= last_chunk_offset; i += 4*PacketSize) {
- for (Index j = 0; j < 4; j++) {
+ for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
@@ -128,7 +208,7 @@ struct EvalRange<Evaluator, Index, true> {
}
}
- static Index alignBlockSize(Index size) {
+ static StorageIndex alignBlockSize(StorageIndex size) {
// Align block size to packet size and account for unrolling in run above.
if (size >= 16 * PacketSize) {
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
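A worked instance of the bit mask above, which rounds a size up to the next multiple of 4 * PacketSize (a power of two):

constexpr long align_up(long size, long quantum /* power of two */) {
  return (size + quantum - 1) & ~(quantum - 1);
}
// With PacketSize = 4 (e.g. floats in SSE), quantum = 16: 100 -> 112.
static_assert(align_up(100, 16) == 112, "115 & ~15 == 112");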
@@ -138,133 +218,185 @@ struct EvalRange<Evaluator, Index, true> {
}
};
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, bool Tileable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
public:
- typedef typename Expression::Index Index;
- static inline void run(const Expression& expr, const ThreadPoolDevice& device)
- {
+ using StorageIndex = typename Expression::Index;
+
+ static inline void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+
Evaluator evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ const StorageIndex PacketSize =
+ Vectorizable
+ ? unpacket_traits<typename Evaluator::PacketReturnType>::size
+ : 1;
+ const StorageIndex size = array_prod(evaluator.dimensions());
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
- EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
- [&evaluator](Index first, Index last) {
- EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+ EvalRange::alignBlockSize,
+ [&evaluator](StorageIndex first, StorageIndex last) {
+ EvalRange::run(&evaluator, first, last);
});
-#else
- size_t num_threads = device.numThreads();
- if (num_threads > 1) {
- num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
- size, evaluator.costPerCoeff(Vectorizable), num_threads);
- }
- if (num_threads == 1) {
- EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
- } else {
- const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
- Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
- const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
- const Index numblocks = size / blocksize;
-
- Barrier barrier(numblocks);
- for (int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(
- &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
- &evaluator, i * blocksize, (i + 1) * blocksize);
- }
- if (numblocks * blocksize < size) {
- EvalRange<Evaluator, Index, Vectorizable>::run(
- &evaluator, numblocks * blocksize, size);
- }
- barrier.Wait();
- }
-#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
}
evaluator.cleanup();
}
};
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
+ public:
+ using Scalar = typename traits<Expression>::Scalar;
+ using ScalarNoConst = typename remove_const<Scalar>::type;
+
+ using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
+ using StorageIndex = typename traits<Expression>::Index;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ static inline void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
+ using TensorBlock =
+ TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+ using TensorBlockMapper =
+ TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+
+ Evaluator evaluator(expr, device);
+ StorageIndex total_size = array_prod(evaluator.dimensions());
+ StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+ if (total_size < cache_size) {
+ // TODO(andydavis) Reduce block management overhead for small tensors.
+ internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ false>::run(expr, device);
+ evaluator.cleanup();
+ return;
+ }
+
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
+ Index block_total_size = 0;
+ // Query expression tree for desired block size/shape.
+ std::vector<internal::TensorOpResourceRequirements> resources;
+ evaluator.getResourceRequirements(&resources);
+ MergeResourceRequirements(resources, &block_shape, &block_total_size);
+ int num_threads = device.numThreads();
+
+ // Estimate minimum block size based on cost.
+ TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
+ double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
+ size_t block_size = static_cast<size_t>(1.0 / taskSize);
+ TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
+ block_size);
+ block_size = block_mapper.block_dims_total_size();
+ const size_t aligned_blocksize =
+ EIGEN_MAX_ALIGN_BYTES *
+ divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
+ void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
+ device.parallelFor(
+ block_mapper.total_block_count(), cost * block_size,
+ [=, &device, &evaluator, &block_mapper](StorageIndex first,
+ StorageIndex last) {
+ // currentThreadId() returns -1 if called from a thread not in the
+ // thread pool, such as the main thread dispatching Eigen
+ // expressions.
+ const int thread_idx = device.currentThreadId();
+ eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
+ Scalar* thread_buf = reinterpret_cast<Scalar*>(
+ static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
+ for (StorageIndex i = first; i < last; ++i) {
+ auto block = block_mapper.GetBlockForIndex(i, thread_buf);
+ evaluator.evalBlock(&block);
+ }
+ });
+ device.deallocate(buf);
+ }
+ evaluator.cleanup();
+ }
+};
+
#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, bool Tileable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
public:
- typedef typename Expression::Index Index;
+ typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
};
-#if defined(EIGEN_CUDACC)
-template <typename Evaluator, typename Index, bool Vectorizable>
+#if defined(EIGEN_GPUCC)
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
- void run(Evaluator& eval, Index first, Index last, Index step_size) {
- for (Index i = first; i < last; i += step_size) {
+ void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+ for (StorageIndex i = first; i < last; i += step_size) {
eval.evalScalar(i);
}
}
};
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
static __device__ EIGEN_ALWAYS_INLINE
- void run(Evaluator& eval, Index first, Index last, Index step_size) {
- const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
- const Index vectorized_size = (last / PacketSize) * PacketSize;
- const Index vectorized_step_size = step_size * PacketSize;
+ void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+ const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
+ const StorageIndex vectorized_step_size = step_size * PacketSize;
// Use the vector path
- for (Index i = first * PacketSize; i < vectorized_size;
+ for (StorageIndex i = first * PacketSize; i < vectorized_size;
i += vectorized_step_size) {
eval.evalPacket(i);
}
- for (Index i = vectorized_size + first; i < last; i += step_size) {
+ for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
eval.evalScalar(i);
}
}
};
-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
- const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
- const Index step_size = blockDim.x * gridDim.x;
+ const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const StorageIndex step_size = blockDim.x * gridDim.x;
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
- EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+ EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
/*static*/
-template <typename Expression, bool Vectorizable>
-inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+template <typename Expression, bool Vectorizable, bool Tileable>
+inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() / block_size;
- const Index size = array_prod(evaluator.dimensions());
+
+ const int block_size = device.maxGpuThreadsPerBlock();
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
- LAUNCH_CUDA_KERNEL(
- (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+ LAUNCH_GPU_KERNEL(
+ (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
-#endif // EIGEN_CUDACC
+#endif // EIGEN_GPUCC
#endif // EIGEN_USE_GPU
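[Editor's note: EigenMetaKernel above is a grid-stride loop, which is what makes the clamped num_blocks computation safe: any grid size covers the whole index range. A minimal CUDA sketch of the same pattern (hypothetical kernel, not Eigen code):]

__global__ void grid_stride_fill(float* out, int size, float value) {
  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int step = blockDim.x * gridDim.x;
  // Each thread strides over the whole range, so correctness does not depend
  // on the grid size; the host side only clamps num_blocks into
  // [1, max_blocks] for occupancy.
  for (int i = first; i < size; i += step) {
    out[i] = value;
  }
}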
// SYCL Executor policy
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index f81da318c..d6ab4d997 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -274,7 +274,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
}
}
- // processs the line
+ // process the line
if (is_power_of_two) {
processDataLineCooleyTukey(line_buf, line_len, log_len);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index e943757ad..1342e47a6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
enum {
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ BlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c015ce196..fdb31928f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = (PacketSize > 1),
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};
@@ -109,7 +110,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 354bbe8d1..8ed1796df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -12,7 +12,7 @@
namespace Eigen {
-// MakePointer class is used as a container of the adress space of the pointer
+// MakePointer class is used as a container of the address space of the pointer
// on the host and on the device. From the host side it generates the T* pointer
 // and when EIGEN_USE_SYCL is used it constructs a buffer with a map_allocator to
// T* m_data on the host. It is always called on the device.
@@ -65,7 +65,7 @@ template<typename Op, typename Dims, typename XprType, template <class> class Ma
template<typename XprType> class TensorIndexTupleOp;
template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
-template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
+template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp;
template<typename TargetType, typename XprType> class TensorConversionOp;
template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
@@ -97,6 +97,8 @@ template<typename XprType> class TensorForcedEvalOp;
template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
+class NoOpOutputKernel;
+
struct DefaultDevice;
struct ThreadPoolDevice;
struct GpuDevice;
@@ -127,8 +129,14 @@ struct IsVectorizable<GpuDevice, Expression> {
TensorEvaluator<Expression, GpuDevice>::IsAligned;
};
+template <typename Device, typename Expression>
+struct IsTileable {
+ static const bool value = TensorEvaluator<Expression, Device>::BlockAccess;
+};
+
template <typename Expression, typename Device,
- bool Vectorizable = IsVectorizable<Device, Expression>::value>
+ bool Vectorizable = IsVectorizable<Device, Expression>::value,
+ bool Tileable = IsTileable<Device, Expression>::value>
class TensorExecutor;
} // end namespace internal
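[Editor's note: a toy sketch (simplified names, not Eigen's definitions) of how the defaulted Tileable parameter steers dispatch: an evaluator that advertises BlockAccess selects the tiled specialization without callers ever naming it.]

#include <iostream>

template <typename Expr, bool Tileable>
struct Executor {
  static void run() { std::cout << "coefficient/packet path\n"; }
};

template <typename Expr>
struct Executor<Expr, /*Tileable=*/true> {
  static void run() { std::cout << "block (tiled) path\n"; }
};

struct BlockFriendlyExpr { static const bool BlockAccess = true; };
struct PlainExpr         { static const bool BlockAccess = false; };

// Mirrors the defaulted template argument added above.
template <typename Expr, bool Tileable = Expr::BlockAccess>
struct Dispatch : Executor<Expr, Tileable> {};

int main() {
  Dispatch<PlainExpr>::run();          // coefficient/packet path
  Dispatch<BlockFriendlyExpr>::run();  // block (tiled) path
}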
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 5dcc3794c..3d0e4035a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -140,7 +140,7 @@ struct reducer_traits<SumReducer<T>, Device> {
template <typename T> struct MeanReducer
{
- static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
+ static const bool PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv && !NumTraits<T>::IsInteger;
static const bool IsStateful = true;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -171,7 +171,7 @@ template <typename T> struct MeanReducer
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
- return pdiv(vaccum, pset1<Packet>(packetCount_));
+ return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
@@ -487,6 +487,25 @@ struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
};
};
+template <typename Scalar>
+struct scalar_clamp_op {
+ EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+ operator()(const Scalar& x) const {
+ return numext::mini(numext::maxi(x, m_min), m_max);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+ packetOp(const Packet& x) const {
+ return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
+ }
+ const Scalar m_min;
+ const Scalar m_max;
+};
+template<typename Scalar>
+struct functor_traits<scalar_clamp_op<Scalar> >
+{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; };
+
} // end namespace internal
} // end namespace Eigen
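[Editor's note: a minimal usage sketch for the new scalar_clamp_op, applied through the generic TensorBase::unaryExpr() hook; this is illustrative, not an API added by this diff.]

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> t(5);
  t.setValues({-2.f, -0.5f, 0.f, 0.5f, 2.f});
  // Clamp every coefficient into [-1, 1]; this vectorizes when the scalar
  // type has pmin/pmax support, per the functor_traits above.
  Eigen::Tensor<float, 1> clamped =
      t.unaryExpr(Eigen::internal::scalar_clamp_op<float>(-1.f, 1.f));
  return 0;  // clamped == {-1, -0.5, 0, 0.5, 1}
}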
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
new file mode 100644
index 000000000..5438ebe71
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC; this is by design.
+// There is code in the TensorFlow codebase that defines EIGEN_USE_GPU but, for some
+// reason, gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler.
+// When compiling such files, gcc will end up trying to pick up the CUDA headers by
+// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU).
+// This will obviously not work when trying to compile TensorFlow on a system with no CUDA.
+// To work around this issue for HIP systems (and leave the default behaviour intact), the
+// HIP TensorFlow build defines EIGEN_USE_HIP when compiling all source files, and
+// "unsupported/Eigen/CXX11/Tensor" has been updated to use the HIP headers when EIGEN_USE_HIP
+// is defined. Accordingly, the guard here needs to be EIGEN_USE_HIP as well.
+
+#if defined(EIGEN_USE_HIP)
+
+#define gpuStream_t hipStream_t
+#define gpuDeviceProp_t hipDeviceProp_t
+#define gpuError_t hipError_t
+#define gpuSuccess hipSuccess
+#define gpuErrorNotReady hipErrorNotReady
+#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetErrorString hipGetErrorString
+#define gpuGetDeviceProperties hipGetDeviceProperties
+#define gpuStreamDefault hipStreamDefault
+#define gpuGetDevice hipGetDevice
+#define gpuSetDevice hipSetDevice
+#define gpuMalloc hipMalloc
+#define gpuFree hipFree
+#define gpuMemsetAsync hipMemsetAsync
+#define gpuMemcpyAsync hipMemcpyAsync
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuStreamQuery hipStreamQuery
+#define gpuSharedMemConfig hipSharedMemConfig
+#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
+#define gpuStreamSynchronize hipStreamSynchronize
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+
+#else
+
+#define gpuStream_t cudaStream_t
+#define gpuDeviceProp_t cudaDeviceProp
+#define gpuError_t cudaError_t
+#define gpuSuccess cudaSuccess
+#define gpuErrorNotReady cudaErrorNotReady
+#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetErrorString cudaGetErrorString
+#define gpuGetDeviceProperties cudaGetDeviceProperties
+#define gpuStreamDefault cudaStreamDefault
+#define gpuGetDevice cudaGetDevice
+#define gpuSetDevice cudaSetDevice
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
+#define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemcpyAsync cudaMemcpyAsync
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuStreamQuery cudaStreamQuery
+#define gpuSharedMemConfig cudaSharedMemConfig
+#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
+#define gpuStreamSynchronize cudaStreamSynchronize
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER==0))
+// clang-cuda and HIPCC do not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) assert(COND)
+#endif
+
+#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
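[Editor's note: what the mapping above buys: device-management code can be written once against the gpu* names and compile for either backend. A sketch under that assumption (hypothetical helper, not part of this header):]

#include <cstddef>

void copy_to_device_async(void* dst, const void* src, std::size_t bytes,
                          gpuStream_t stream) {
  // Expands to cudaMemcpyAsync(...)/cudaSuccess on a CUDA build and to
  // hipMemcpyAsync(...)/hipSuccess when EIGEN_USE_HIP is defined, so the
  // error handling is likewise portable.
  gpuError_t err = gpuMemcpyAsync(dst, src, bytes, gpuMemcpyHostToDevice, stream);
  gpu_assert(err == gpuSuccess);
}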
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
new file mode 100644
index 000000000..db394bcbb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+
+#undef gpuStream_t
+#undef gpuDeviceProp_t
+#undef gpuError_t
+#undef gpuSuccess
+#undef gpuErrorNotReady
+#undef gpuGetDeviceCount
+#undef gpuGetErrorString
+#undef gpuGetDeviceProperties
+#undef gpuStreamDefault
+#undef gpuGetDevice
+#undef gpuSetDevice
+#undef gpuMalloc
+#undef gpuFree
+#undef gpuMemsetAsync
+#undef gpuMemcpyAsync
+#undef gpuMemcpyDeviceToDevice
+#undef gpuMemcpyDeviceToHost
+#undef gpuMemcpyHostToDevice
+#undef gpuStreamQuery
+#undef gpuSharedMemConfig
+#undef gpuDeviceSetSharedMemConfig
+#undef gpuStreamSynchronize
+#undef gpuDeviceSynchronize
+#undef gpuMemcpy
+
+#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 91d4ead28..72cb2d15f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -186,6 +186,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
@@ -272,8 +273,8 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
break;
default:
eigen_assert(false && "unexpected padding");
- m_outputCols=0; // silence the uninitialised warnig;
- m_outputRows=0; //// silence the uninitialised warnig;
+ m_outputCols=0; // silence the uninitialised warning;
+ m_outputRows=0; //// silence the uninitialised warning;
}
}
eigen_assert(m_outputRows > 0);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 3209fecd3..8810d78cf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -350,7 +350,8 @@ struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
namespace internal {
-template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+template<typename FirstType, typename... OtherTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
size_t result = 1;
for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
result *= sizes[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index fb6454623..b6d445c50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -21,7 +21,7 @@ namespace Eigen {
* \brief Fast integer division by a constant.
*
* See the paper from Granlund and Montgomery for explanation.
- * (at http://dx.doi.org/10.1145/773473.178249)
+ * (at https://doi.org/10.1145/773473.178249)
*
* \sa Tensor
*/
@@ -35,7 +35,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{
-#ifdef EIGEN_CUDA_ARCH
+#ifdef EIGEN_GPU_COMPILE_PHASE
return __clz(val);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val);
@@ -53,7 +53,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
{
-#ifdef EIGEN_CUDA_ARCH
+#ifdef EIGEN_GPU_COMPILE_PHASE
return __clzll(val);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val);
@@ -90,7 +90,7 @@ namespace {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
return __umulhi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
@@ -101,7 +101,7 @@ namespace {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
return __umul64hi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
@@ -124,7 +124,7 @@ namespace {
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
@@ -167,7 +167,7 @@ struct TensorIntDivisor {
shift2 = log_div > 1 ? log_div-1 : 0;
}
- // Must have 0 <= numerator. On platforms that dont support the __uint128_t
+ // Must have 0 <= numerator. On platforms that don't support the __uint128_t
// type numerator should also be less than 2^32-1.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
@@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef EIGEN_CUDA_ARCH
+#ifdef EIGEN_GPU_COMPILE_PHASE
return (__umulhi(magic, n) >> shift);
#elif defined(__SYCL_DEVICE_ONLY__)
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
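[Editor's note: a small usage sketch for the fast divider, using the divide() entry point visible above; construction details are hedged, see TensorIntDiv.h for the exact interface.]

// Precompute the magic multiplier and shift for a fixed divisor, then divide
// with a multiply-high and a shift instead of a hardware divide
// (Granlund & Montgomery).
Eigen::internal::TensorIntDivisor<int> divider(7);
const int q = divider.divide(123);  // 123 / 7 == 17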
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 4e384f9b9..e3165fa10 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -119,6 +119,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -181,6 +182,7 @@ template<typename ArgType, typename Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index c9e61f359..c6ca396a3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
*/
// SFINAE requires variadic templates
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_GPUCC)
#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 5431eb740..87be090f9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
template <>
struct PacketType<half, GpuDevice> {
typedef half2 type;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 329655817..498488649 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -105,6 +105,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -325,6 +327,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
// slice offsets and sizes.
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
@@ -398,7 +401,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
if (trigger(contiguous_values)) {
Scalar* src = (Scalar*)m_impl.data();
- for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+ for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
Index offset = srcCoeff(i);
m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
}
@@ -557,9 +560,10 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
- RawAccess = false
+ RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -716,7 +720,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
static const int NumDims = internal::array_size<Strides>::value;
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;
@@ -859,7 +862,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
return inputIndex;
}
- static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
#else
@@ -907,7 +910,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 5956e513d..ffa22f31e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -96,6 +96,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 9e0a20abf..950ac32af 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 230915db2..787cbd031 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -16,10 +16,10 @@ namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  // We don't support 3d kernels since we currently only use 1d and
  // 2d kernels.
- assert(threadIdx.z == 0);
+ gpu_assert(threadIdx.z == 0);
return clock64() +
blockIdx.x * blockDim.x + threadIdx.x +
gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index da0ffe728..375fc0802 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -334,12 +334,12 @@ struct OuterReducer {
};
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
+#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
@@ -412,6 +412,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
enum {
IsAligned = false,
PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -495,7 +496,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
+ EIGEN_STRONG_INLINE
+ #if !defined(EIGEN_HIPCC)
+  // Marking this as EIGEN_DEVICE_FUNC for HIPCC requires doing the same for all the functions
+  // called from here, which then leads to a proliferation of EIGEN_DEVICE_FUNC markings, one
+  // of which eventually results in an NVCC error.
+ EIGEN_DEVICE_FUNC
+ #endif
+ bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
@@ -694,9 +702,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
+#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
@@ -781,7 +789,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
Op m_reducer;
// For full reductions
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index ebcbd6f41..68780cd3c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -1,772 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file"
+#endif
-namespace Eigen {
-namespace internal {
-
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
-// Full reducers for GPU, don't vectorize for now
-
-// Reducer function that enables multiple cuda thread to safely accumulate at the same
-// output address. It basically reads the current value of the output variable, and
-// attempts to update it with the new value. If in the meantime another cuda thread
-// updated the content of the output address it will try again.
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if EIGEN_CUDA_ARCH >= 300
- if (sizeof(T) == 4)
- {
- unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
- unsigned int newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned int readback;
- while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
- }
- else if (sizeof(T) == 8) {
- unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
- unsigned long long newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned long long readback;
- while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
- }
- else {
- assert(0 && "Wordsize not supported");
- }
-#else // EIGEN_CUDA_ARCH >= 300
- assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-// We extend atomicExch to support extra data types
-template <typename Type>
-__device__ inline Type atomicExchCustom(Type* address, Type val) {
- return atomicExch(address, val);
-}
-
-template <>
-__device__ inline double atomicExchCustom(double* address, double val) {
- unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
- return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
-}
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <template <typename T> class R>
-__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
- unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
- unsigned int newval = oldval;
- reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned int readback;
- while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
-}
-#endif // EIGEN_HAS_CUDA_FP16
-
-template <>
-__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if EIGEN_CUDA_ARCH >= 300
- atomicAdd(output, accum);
-#else // EIGEN_CUDA_ARCH >= 300
- assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-
-template <typename CoeffType, typename Index>
-__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- const Index num_threads = blockDim.x * gridDim.x;
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = val;
- }
-}
-
-
-template <int BlockSize, int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
- typename Self::CoeffReturnType* output, unsigned int* semaphore) {
-#if EIGEN_CUDA_ARCH >= 300
- // Initialize the output value
- const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
- if (gridDim.x == 1) {
- if (first_index == 0) {
- *output = reducer.initialize();
- }
- }
- else {
- if (threadIdx.x == 0) {
- unsigned int block = atomicCAS(semaphore, 0u, 1u);
- if (block == 0) {
- // We're the first block to run, initialize the output value
- atomicExchCustom(output, reducer.initialize());
- __threadfence();
- atomicExch(semaphore, 2u);
- }
- else {
- // Wait for the first block to initialize the output value.
- // Use atomicCAS here to ensure that the reads aren't cached
- unsigned int val;
- do {
- val = atomicCAS(semaphore, 2u, 2u);
- }
- while (val < 2u);
- }
- }
- }
-
- __syncthreads();
-
- eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
-
- typename Self::CoeffReturnType accum = reducer.initialize();
- Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
- for (Index i = 0; i < max_iter; i+=BlockSize) {
- const Index index = first_index + i;
- eigen_assert(index < num_coeffs);
- typename Self::CoeffReturnType val = input.m_impl.coeff(index);
- reducer.reduce(val, &accum);
- }
-
-#pragma unroll
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
- reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
- #else
- reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
- #endif
- }
-
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- atomicReduce(output, accum, reducer);
- }
-
- if (gridDim.x > 1 && threadIdx.x == 0) {
- // Let the last block reset the semaphore
- atomicInc(semaphore, gridDim.x + 1);
- }
-#else // EIGEN_CUDA_ARCH >= 300
- assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self,
- typename Reducer, typename Index>
-__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
- eigen_assert(blockDim.x == 1);
- eigen_assert(gridDim.x == 1);
- if (num_coeffs % 2 != 0) {
- half last = input.m_impl.coeff(num_coeffs-1);
- *scratch = __halves2half2(last, reducer.initialize());
- } else {
- *scratch = reducer.template initializePacket<half2>();
- }
-}
-
-template <typename Self,
- typename Reducer, typename Index>
-__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- const Index num_threads = blockDim.x * gridDim.x;
- const Index num_packets = num_coeffs / 2;
- for (Index i = thread_id; i < num_packets; i += num_threads) {
- ((half2*)output)[i] = reducer.template initializePacket<half2>();
- }
-
- if (thread_id == 0 && num_coeffs % 2 != 0) {
- output[num_coeffs-1] = reducer.initialize();
- }
-}
-
-template <int BlockSize, int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
- half* output, half2* scratch) {
- eigen_assert(NumPerThread % 2 == 0);
-
- const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
-
- // Initialize the output value if it wasn't initialized by the ReductionInitKernel
- if (gridDim.x == 1) {
- if (first_index == 0) {
- if (num_coeffs % 2 != 0) {
- half last = input.m_impl.coeff(num_coeffs-1);
- *scratch = __halves2half2(last, reducer.initialize());
- } else {
- *scratch = reducer.template initializePacket<half2>();
- }
- }
- __syncthreads();
- }
-
- half2 accum = reducer.template initializePacket<half2>();
- const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
- for (Index i = 0; i < max_iter; i += BlockSize) {
- const Index index = first_index + 2*i;
- eigen_assert(index + 1 < num_coeffs);
- half2 val = input.m_impl.template packet<Unaligned>(index);
- reducer.reducePacket(val, &accum);
- }
-
-#pragma unroll
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
- reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
- #else
- int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
- reducer.reducePacket(*(half2*)(&temp), &accum);
- #endif
- }
-
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- atomicReduce(scratch, accum, reducer);
- }
-
- if (gridDim.x == 1) {
- __syncthreads();
- if (first_index == 0) {
- half tmp = __low2half(*scratch);
- reducer.reduce(__high2half(*scratch), &tmp);
- *output = tmp;
- }
- }
-}
-
-template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
- eigen_assert(threadIdx.x == 1);
- half tmp = __low2half(*scratch);
- reducer.reduce(__high2half(*scratch), &tmp);
- *output = tmp;
-}
-
-#endif // EIGEN_HAS_CUDA_FP16
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct FullReductionLauncher {
- static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
- assert(false && "Should only be called on doubles, floats and half floats");
- }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct FullReductionLauncher<
- Self, Op, OutputType, PacketAccess,
- typename internal::enable_if<
- internal::is_same<float, OutputType>::value ||
- internal::is_same<double, OutputType>::value,
- void>::type> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
- typedef typename Self::Index Index;
- const int block_size = 256;
- const int num_per_thread = 128;
- const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-
- unsigned int* semaphore = NULL;
- if (num_blocks > 1) {
- semaphore = device.semaphore();
- }
-
- LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
- }
-};
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, false> {
- static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
- assert(false && "Should not be called since there is no packet accessor");
- }
-};
-
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, true> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
- typedef typename Self::Index Index;
-
- const int block_size = 256;
- const int num_per_thread = 128;
- const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- half2* scratch = static_cast<half2*>(device.scratchpad());
-
- if (num_blocks > 1) {
- // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
- // won't be a race conditions between multiple thread blocks.
- LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
- 1, 1, 0, device, reducer, self, num_coeffs, scratch);
- }
-
- LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
-
- if (num_blocks > 1) {
- LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
- 1, 1, 0, device, reducer, output, scratch);
- }
- }
-};
-#endif // EIGEN_HAS_CUDA_FP16
-
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple cases
- // of doubles, floats and half floats
-#ifdef EIGEN_HAS_CUDA_FP16
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else // EIGEN_HAS_CUDA_FP16
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif // EIGEN_HAS_CUDA_FP16
-
- template <typename OutputType>
- static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
- assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return;
- }
-
- FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
- }
-};
-
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- typename Self::CoeffReturnType* output) {
-#if EIGEN_CUDA_ARCH >= 300
- typedef typename Self::CoeffReturnType Type;
- eigen_assert(blockDim.y == 1);
- eigen_assert(blockDim.z == 1);
- eigen_assert(gridDim.y == 1);
- eigen_assert(gridDim.z == 1);
-
- const int unroll_times = 16;
- eigen_assert(NumPerThread % unroll_times == 0);
-
- const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
- const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
-
- const Index num_threads = blockDim.x * gridDim.x;
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (gridDim.x == 1) {
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
- const Index row = i / input_col_blocks;
-
- if (row < num_preserved_coeffs) {
- const Index col_block = i % input_col_blocks;
- const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
-
- Type reduced_val = reducer.initialize();
-
- for (Index j = 0; j < NumPerThread; j += unroll_times) {
- const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
- if (last_col >= num_coeffs_to_reduce) {
- for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
- const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
- reducer.reduce(val, &reduced_val);
- }
- break;
- } else {
- // Faster version of the loop with no branches after unrolling.
-#pragma unroll
- for (int k = 0; k < unroll_times; ++k) {
- const Index col = col_begin + blockDim.x * (j + k);
- reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
- }
- }
- }
-
-#pragma unroll
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
- reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
- #else
- reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
- #endif
- }
-
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- atomicReduce(&(output[row]), reduced_val, reducer);
- }
- }
- }
-#else // EIGEN_CUDA_ARCH >= 300
- assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-#ifdef EIGEN_HAS_CUDA_FP16
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- half* output) {
- eigen_assert(blockDim.y == 1);
- eigen_assert(blockDim.z == 1);
- eigen_assert(gridDim.y == 1);
- eigen_assert(gridDim.z == 1);
-
- const int unroll_times = 16;
- eigen_assert(NumPerThread % unroll_times == 0);
- eigen_assert(unroll_times % 2 == 0);
-
- const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
- const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
-
- const Index num_threads = blockDim.x * gridDim.x;
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (gridDim.x == 1) {
- Index i = 2*thread_id;
- for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
- half* loc = output + i;
- *((half2*)loc) = reducer.template initializePacket<half2>();
- }
- if (i < num_preserved_coeffs) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
- const Index row = 2 * (i / input_col_blocks);
-
- if (row + 1 < num_preserved_coeffs) {
- const Index col_block = i % input_col_blocks;
- const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
-
- half2 reduced_val1 = reducer.template initializePacket<half2>();
- half2 reduced_val2 = reducer.template initializePacket<half2>();
-
- for (Index j = 0; j < NumPerThread; j += unroll_times) {
- const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
- if (last_col >= num_coeffs_to_reduce) {
- Index col = col_begin + blockDim.x * j;
- for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
- const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
- reducer.reducePacket(val1, &reduced_val1);
- const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
- reducer.reducePacket(val2, &reduced_val2);
- }
- if (col < num_coeffs_to_reduce) {
- // Peel;
- const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
- const half2 val1 = __halves2half2(last1, reducer.initialize());
- reducer.reducePacket(val1, &reduced_val1);
- const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
- const half2 val2 = __halves2half2(last2, reducer.initialize());
- reducer.reducePacket(val2, &reduced_val2);
- }
- break;
- } else {
- // Faster version of the loop with no branches after unrolling.
-#pragma unroll
- for (int k = 0; k < unroll_times; ++k) {
- const Index col = col_begin + blockDim.x * (j + k) * 2;
- reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
- reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
- }
- }
- }
-
-#pragma unroll
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
- reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
- reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
- #else
- int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize);
- int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize);
- reducer.reducePacket(*(half2*)(&temp1), &reduced_val1);
- reducer.reducePacket(*(half2*)(&temp2), &reduced_val2);
- #endif
- }
-
- half val1 = __low2half(reduced_val1);
- reducer.reduce(__high2half(reduced_val1), &val1);
- half val2 = __low2half(reduced_val2);
- reducer.reduce(__high2half(reduced_val2), &val2);
- half2 val = __halves2half2(val1, val2);
-
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- half* loc = output + row;
- atomicReduce((half2*)loc, val, reducer);
- }
- }
- }
-}
-
-#endif // EIGEN_HAS_CUDA_FP16
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct InnerReductionLauncher {
- static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
- return true;
- }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct InnerReductionLauncher<
- Self, Op, OutputType, PacketAccess,
- typename internal::enable_if<
- internal::is_same<float, OutputType>::value ||
- internal::is_same<double, OutputType>::value,
- void>::type> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = 256;
- const int num_per_thread = 128;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
- // We initialize the outputs outside the reduction kernel when we can't be sure that there
- // won't be a race conditions between multiple thread blocks.
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
- const int max_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() / 1024;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
- num_blocks, 1024, 0, device, reducer.initialize(),
- num_preserved_vals, output);
- }
-
- LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
- static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
- assert(false && "Should not be called since there is no packet accessor");
- return true;
- }
-};
-
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- if (num_preserved_vals % 2 != 0) {
- // Not supported yet, revert to the slower code path
- return true;
- }
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = /*256*/128;
- const int num_per_thread = /*128*/64;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
- // We initialize the outputs outside the reduction kernel when we can't be sure that there
- // won't be a race conditions between multiple thread blocks.
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
- const int max_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() / 1024;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
- 1, 1, 0, device, reducer, self, num_preserved_vals, output);
- }
-
- LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-#endif // EIGEN_HAS_CUDA_FP16
-
-
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats and half floats.
-#ifdef EIGEN_HAS_CUDA_FP16
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else // EIGEN_HAS_CUDA_FP16
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif // EIGEN_HAS_CUDA_FP16
-
- template <typename OutputType>
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return true;
- }
- // It's faster to use the usual code.
- if (num_coeffs_to_reduce <= 128) {
- return true;
- }
-
- return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
- }
-};
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- typename Self::CoeffReturnType* output) {
- const Index num_threads = blockDim.x * gridDim.x;
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (gridDim.x == 1) {
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- // Do the reduction.
- const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
- for (Index i = thread_id; i < max_iter; i += num_threads) {
- const Index input_col = i % num_preserved_coeffs;
- const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
- typename Self::CoeffReturnType reduced_val = reducer.initialize();
- const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
- for (Index j = input_row; j < max_row; j++) {
- typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
- reducer.reduce(val, &reduced_val);
- }
- atomicReduce(&(output[input_col]), reduced_val, reducer);
- }
-}
-
-
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, GpuDevice> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats.
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
- template <typename Device, typename OutputType>
- static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce doubles or floats on a gpu device");
- return true;
- }
-
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- // It's faster to use the usual code.
- if (num_coeffs_to_reduce <= 32) {
- return true;
- }
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = 256;
- const int num_per_thread = 16;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
- // We initialize the outputs in the reduction kernel itself when we don't have to worry
- // about race conditions between multiple thread blocks.
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
- const int max_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() / 1024;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
- num_blocks, 1024, 0, device, reducer.initialize(),
- num_preserved_vals, output);
- }
-
- LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-
-#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#include "TensorReductionGpu.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
new file mode 100644
index 000000000..cd20df505
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -0,0 +1,825 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple GPU threads to safely accumulate at the same
+// output address. It reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another GPU thread
+// updated the content of the output address, it tries again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+ if (sizeof(T) == 4)
+ {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else if (sizeof(T) == 8) {
+ unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+ unsigned long long newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned long long readback;
+ while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else {
+ gpu_assert(0 && "Wordsize not supported");
+ }
+#else // EIGEN_CUDA_ARCH >= 300
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
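+// A sketch for illustration (not part of the file): for a plain float sum
+// the retry loop above reduces to the canonical CUDA CAS idiom:
+//   unsigned int* addr = (unsigned int*)output;
+//   unsigned int old = *addr, assumed;
+//   do {
+//     assumed = old;
+//     old = atomicCAS(addr, assumed,
+//                     __float_as_uint(accum + __uint_as_float(assumed)));
+//   } while (assumed != old);
+// The reduce() call is recomputed from the freshly observed value after every
+// failed exchange, so no concurrent update is ever lost.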
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
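+// CUDA provides atomicExch overloads only for int, unsigned int,
+// unsigned long long and float, so the double specialization above
+// round-trips the value through its 64-bit pattern; __double_as_longlong
+// and __longlong_as_double are bit-exact reinterpretations, not conversions.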
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+}
+#endif // EIGEN_HAS_GPU_FP16
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+ atomicAdd(output, accum);
+#else // EIGEN_CUDA_ARCH >= 300
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index num_threads = blockDim.x * gridDim.x;
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = val;
+ }
+}
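+// The loop above is a standard grid-stride loop: thread t writes outputs
+// t, t + num_threads, t + 2*num_threads, ... For example, with 2 blocks of
+// 1024 threads and num_preserved_coeffs == 5000, thread 5 initializes
+// outputs 5, 2053 and 4101, then stops.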
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+ typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+ // Initialize the output value
+ const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ *output = reducer.initialize();
+ }
+ }
+ else {
+ if (threadIdx.x == 0) {
+ unsigned int block = atomicCAS(semaphore, 0u, 1u);
+ if (block == 0) {
+ // We're the first block to run, initialize the output value
+ atomicExchCustom(output, reducer.initialize());
+ __threadfence();
+ atomicExch(semaphore, 2u);
+ }
+ else {
+ // Wait for the first block to initialize the output value.
+ // Use atomicCAS here to ensure that the reads aren't cached
+ unsigned int val;
+ do {
+ val = atomicCAS(semaphore, 2u, 2u);
+ }
+ while (val < 2u);
+ }
+ }
+ }
+
+ __syncthreads();
+
+ eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+ for (Index i = 0; i < max_iter; i+=BlockSize) {
+ const Index index = first_index + i;
+ eigen_assert(index < num_coeffs);
+ typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+ reducer.reduce(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_HIPCC)
+    // use std::is_floating_point to determine the type of accum
+    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
+ // and list the float and int versions of __shfl_down as the candidate functions.
+ if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
+ reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
+ } else {
+ reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
+ }
+ #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+ #else
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+ #endif
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(output, accum, reducer);
+ }
+
+ if (gridDim.x > 1 && threadIdx.x == 0) {
+ // Let the last block reset the semaphore
+ atomicInc(semaphore, gridDim.x + 1);
+#if defined(EIGEN_HIPCC)
+ __threadfence_system();
+#endif
+ }
+#else // EIGEN_CUDA_ARCH >= 300
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
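+// A worked view of the shuffle loop above, assuming a 32-lane warp: each
+// __shfl_down(_sync) with offset k lets lane i read lane i+k's accumulator,
+// so offsets 16, 8, 4, 2, 1 fold the 32 partial values into lane 0 in
+// log2(32) == 5 steps. Lane 0 of each warp then merges its result into
+// *output through atomicReduce.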
+
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+ eigen_assert(blockDim.x == 1);
+ eigen_assert(gridDim.x == 1);
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
+}
+
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index num_packets = num_coeffs / 2;
+ for (Index i = thread_id; i < num_packets; i += num_threads) {
+ ((half2*)output)[i] = reducer.template initializePacket<half2>();
+ }
+
+ if (thread_id == 0 && num_coeffs % 2 != 0) {
+ output[num_coeffs-1] = reducer.initialize();
+ }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+ half* output, half2* scratch) {
+ eigen_assert(NumPerThread % 2 == 0);
+
+ const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
+
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
+ }
+ __syncthreads();
+ }
+
+ half2 accum = reducer.template initializePacket<half2>();
+ const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+ for (Index i = 0; i < max_iter; i += BlockSize) {
+ const Index index = first_index + 2*i;
+ eigen_assert(index + 1 < num_coeffs);
+ half2 val = input.m_impl.template packet<Unaligned>(index);
+ reducer.reducePacket(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_HIPCC)
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in, wka_out;
+ wka_in.h = accum;
+ wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+ reducer.reducePacket(wka_out.h, &accum);
+ #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+ #else
+ int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
+ reducer.reducePacket(*(half2*)(&temp), &accum);
+ #endif
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(scratch, accum, reducer);
+ }
+
+ if (gridDim.x == 1) {
+ __syncthreads();
+ if (first_index == 0) {
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+ }
+ }
+}
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+  eigen_assert(threadIdx.x == 0);
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+ static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+ gpu_assert(false && "Should only be called on doubles, floats and half floats");
+ }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+
+ typedef typename Self::Index Index;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+ unsigned int* semaphore = NULL;
+ if (num_blocks > 1) {
+ semaphore = device.semaphore();
+ }
+
+ LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+ }
+};
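+// Launch arithmetic for the launcher above: each block covers
+// block_size * num_per_thread == 256 * 128 == 32768 coefficients, so e.g.
+// num_coeffs == 1<<20 yields divup(1048576, 32768) == 32 blocks. The
+// semaphore is needed only in that multi-block case, to let the first block
+// initialize *output exactly once before any block starts accumulating.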
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+ static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+ gpu_assert(false && "Should not be called since there is no packet accessor");
+ }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+ typedef typename Self::Index Index;
+
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ half2* scratch = static_cast<half2*>(device.scratchpad());
+
+ if (num_blocks > 1) {
+      // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+ LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+ 1, 1, 0, device, reducer, self, num_coeffs, scratch);
+ }
+
+ LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+ if (num_blocks > 1) {
+ LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+ 1, 1, 0, device, reducer, output, scratch);
+ }
+ }
+};
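+// When num_blocks > 1 the half-float full reduction is a three-stage
+// pipeline: (1) seed the half2 scratch cell (folding in a trailing odd
+// coefficient), (2) let every block accumulate into the scratch cell through
+// atomicReduce, (3) collapse the two halves of the scratch cell into the
+// final half output. With a single block the main kernel does all of this
+// itself.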
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't provide good support for exotic types such as complex,
+  // so we reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+ template <typename OutputType>
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+ }
+};
+
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+ typedef typename Self::CoeffReturnType Type;
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.y == 1);
+ eigen_assert(gridDim.z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+ const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+ const Index row = i / input_col_blocks;
+
+ if (row < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+ Type reduced_val = reducer.initialize();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+ if (last_col >= num_coeffs_to_reduce) {
+ for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ reducer.reduce(val, &reduced_val);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + blockDim.x * (j + k);
+ reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_HIPCC)
+ // use std::is_floating_point to determine the type of reduced_val
+      // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
+ // and list the float and int versions of __shfl_down as the candidate functions.
+ if (std::is_floating_point<Type>::value) {
+ reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
+ } else {
+ reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
+ }
+ #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+ #else
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+ #endif
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(&(output[row]), reduced_val, reducer);
+ }
+ }
+ }
+#else // EIGEN_CUDA_ARCH >= 300
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
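+// Block-to-work mapping for the kernel above, using the usual launch
+// parameters blockDim.x == 256 and NumPerThread == 128: each column block
+// spans 256 * 128 == 32768 coefficients, so a row of 100000 coefficients is
+// cut into divup(100000, 32768) == 4 column blocks, and block i processes
+// row i / 4, column block i % 4.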
+
+#ifdef EIGEN_HAS_GPU_FP16
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ half* output) {
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.y == 1);
+ eigen_assert(gridDim.z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+ eigen_assert(unroll_times % 2 == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+ const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ Index i = 2*thread_id;
+ for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
+ half* loc = output + i;
+ *((half2*)loc) = reducer.template initializePacket<half2>();
+ }
+ if (i < num_preserved_coeffs) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+ const Index row = 2 * (i / input_col_blocks);
+
+ if (row + 1 < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+ half2 reduced_val1 = reducer.template initializePacket<half2>();
+ half2 reduced_val2 = reducer.template initializePacket<half2>();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
+ if (last_col >= num_coeffs_to_reduce) {
+ Index col = col_begin + blockDim.x * j;
+ for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+ const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val1, &reduced_val1);
+ const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ if (col < num_coeffs_to_reduce) {
+            // Peel off the last, unpaired coefficient.
+ const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const half2 val1 = __halves2half2(last1, reducer.initialize());
+ reducer.reducePacket(val1, &reduced_val1);
+ const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
+ const half2 val2 = __halves2half2(last2, reducer.initialize());
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + blockDim.x * (j + k) * 2;
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_HIPCC)
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in, wka_out;
+
+ wka_in.h = reduced_val1;
+ wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+ reducer.reducePacket(wka_out.h, &reduced_val1);
+
+ wka_in.h = reduced_val2;
+ wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+ reducer.reducePacket(wka_out.h, &reduced_val2);
+ #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+ reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+ #else
+ int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize);
+ int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize);
+ reducer.reducePacket(*(half2*)(&temp1), &reduced_val1);
+ reducer.reducePacket(*(half2*)(&temp2), &reduced_val2);
+ #endif
+ }
+
+ half val1 = __low2half(reduced_val1);
+ reducer.reduce(__high2half(reduced_val1), &val1);
+ half val2 = __low2half(reduced_val2);
+ reducer.reduce(__high2half(reduced_val2), &val2);
+ half2 val = __halves2half2(val1, val2);
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ half* loc = output + row;
+ atomicReduce((half2*)loc, val, reducer);
+ }
+ }
+ }
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+ static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+ gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+ return true;
+ }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
+ num_blocks, 1024, 0, device, reducer.initialize(),
+ num_preserved_vals, output);
+ }
+
+ LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
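+// The separate ReductionInitKernel launch above is needed because
+// atomicReduce merges into outputs that must already hold
+// reducer.initialize(), and with several blocks in flight there is no
+// grid-wide barrier to order an in-kernel initialization. Its grid is sized
+// independently of the main kernel: divup(num_preserved_vals, 1024) blocks
+// of 1024 threads, capped by the device's occupancy bound.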
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+ static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+ gpu_assert(false && "Should not be called since there is no packet accessor");
+ return true;
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ if (num_preserved_vals % 2 != 0) {
+ // Not supported yet, revert to the slower code path
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = /*256*/128;
+ const int num_per_thread = /*128*/64;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+ 1, 1, 0, device, reducer, self, num_preserved_vals, output);
+ }
+
+ LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
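+// The num_preserved_vals % 2 == 0 requirement above stems from the output
+// format of the half-float kernel: results are written two rows at a time as
+// a single half2, and atomicReduce operates on the containing 32-bit word.
+// An odd count would leave one unpaired output, so the launcher returns true
+// and the generic, non-optimized path handles that shape instead.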
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't provide good support for exotic types such as complex,
+  // so we reduce the scope of the optimized version of the code to the simple case
+  // of floats, doubles and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+ template <typename OutputType>
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return true;
+ }
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 128) {
+ return true;
+ }
+
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ }
+};
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ // Do the reduction.
+ const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+ for (Index i = thread_id; i < max_iter; i += num_threads) {
+ const Index input_col = i % num_preserved_coeffs;
+ const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+ typename Self::CoeffReturnType reduced_val = reducer.initialize();
+ const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+ for (Index j = input_row; j < max_row; j++) {
+ typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+ reducer.reduce(val, &reduced_val);
+ }
+ atomicReduce(&(output[input_col]), reduced_val, reducer);
+ }
+}
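+// Memory-access note for the kernel above: consecutive thread ids get
+// consecutive input_col values, so for a fixed j a warp reads the contiguous
+// addresses j * num_preserved_coeffs + input_col, i.e. the loads coalesce
+// across the preserved dimension. As a worked example, NumPerThread == 16,
+// num_preserved_coeffs == 1000 and num_coeffs_to_reduce == 64 give
+// max_iter == 1000 * divup(64, 16) == 4000 work items.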
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't provide good support for exotic types such as complex,
+  // so we reduce the scope of the optimized version of the code to the simple case
+  // of floats and doubles.
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+ template <typename Device, typename OutputType>
+ static
+ #if !defined(EIGEN_HIPCC)
+  // FIXME : leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
+ // (in the cxx11_tensor_reduction_gpu test)
+ //
+ // terminate called after throwing an instance of 'std::runtime_error'
+ // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
+ //
+  // don't know why this happens (or why it is a runtime error instead of a compile-time error)
+ //
+ // this will be fixed by HIP PR#457
+ EIGEN_DEVICE_FUNC
+ #endif
+ bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+ gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+ return true;
+ }
+
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 32) {
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 16;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
+ num_blocks, 1024, 0, device, reducer.initialize(),
+ num_preserved_vals, output);
+ }
+
+ LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+
+#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 94899252b..a379f5a94 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -106,7 +106,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
/// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
if (GRange < outTileSize) outTileSize=GRange;
/// creating the shared memory for calculating reduction.
- /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+ /// This one is used to collect all the reduced values from shared memory, as we don't have a global barrier on the GPU. Once it is saved we can
/// recursively apply reduction on it in order to reduce the whole.
auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
@@ -150,7 +150,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
// getting final out buffer at the moment the created buffer is true because there is no need for assign
/// creating the shared memory for calculating reduction.
- /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+ /// This one is used to collect all the reduced values from shared memory, as we don't have a global barrier on the GPU. Once it is saved we can
/// recursively apply reduction on it in order to reduce the whole.
dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index 99245f778..a6cade50f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -31,7 +31,7 @@ class TensorLazyBaseEvaluator {
int refCount() const { return m_refcount; }
private:
- // No copy, no assigment;
+ // No copy, no assignment;
TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
@@ -136,6 +136,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
enum {
IsAligned = false,
PacketAccess = false,
+ BlockAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -364,6 +365,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
enum {
IsAligned = false,
PacketAccess = false,
+ BlockAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -411,6 +413,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
enum {
IsAligned = false,
PacketAccess = false,
+ BlockAccess = false,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 14a50a029..bb2768ab1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -253,6 +254,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 1f545ef1a..39717efaa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -242,7 +242,7 @@ struct ScanLauncher {
}
};
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
@@ -278,10 +278,11 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64;
Index block_size = 64;
- LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+
+ LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
}
};
-#endif // EIGEN_USE_GPU && EIGEN_CUDACC
+#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 0697fd1ce..6b54f40ad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -240,6 +241,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ BlockAccess = false,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index a7eea99b6..c09513c10 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -273,6 +274,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index a7905706d..a248e303b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -117,7 +117,7 @@ SYCLEXTRFUNCTERNARY()
-//TensorCustomOp must be specialised otherewise it will be captured by UnaryCategory while its action is different
+//TensorCustomOp must be specialised otherwise it will be captured by UnaryCategory while its action is different
//from the UnaryCategory and it is similar to the general FunctorExtractor.
/// specialisation of TensorCustomOp
#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index e5b892f2e..a447c3f88 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
@@ -80,7 +80,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+ /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leafnode to pass the
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
@@ -121,7 +121,7 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+ /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leafnode to pass the
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
@@ -168,7 +168,7 @@ public:
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+ /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leafnode to pass the
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
@@ -215,7 +215,7 @@ public:
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+ /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leafnode to pass the
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
index 58ab0f0d5..9e6c3e4fa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
@@ -143,7 +143,7 @@ struct IndexList {};
/// \brief Collects internal details for generating index ranges [MIN, MAX)
/// Declare primary template for index range builder
/// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
+/// \tparam N represents sizeof...(elements) - sizeof...(Is)
/// \tparam Is... are the list of generated index so far
template <size_t MIN, size_t N, size_t... Is>
struct RangeBuilder;
@@ -161,7 +161,7 @@ struct RangeBuilder<MIN, MIN, Is...> {
/// in this case we are recursively subtracting N by one and adding one
/// index to Is... list until MIN==N
/// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
+/// \tparam N represents sizeof...(elements) - sizeof...(Is)
/// \tparam Is... are the list of generated index so far
template <size_t MIN, size_t N, size_t... Is>
struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index 2b1968de1..c8b2fad1e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
@@ -110,7 +111,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
for (int i = 0; i < NumInputDims; ++i) {
m_reduced[i] = false;
}
-
+
const Dims& op_dims = op.dims();
for (int i = 0; i < NumReducedDims; ++i) {
eigen_assert(op_dims[i] >= 0);
@@ -128,7 +129,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
eigen_assert(num_distinct_reduce_dims == NumReducedDims);
- // Compute the dimensions of the result.
+ // Compute the dimensions of the result.
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
int output_index = 0;
@@ -229,7 +230,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
result += m_impl.coeff(cur_index);
cur_index += index_stride;
}
-
+
return result;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 51c099591..ef199bfb6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -568,7 +568,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
Dimensions m_dimensions;
- // Parameters passed to the costructor.
+ // Parameters passed to the constructor.
Index m_plane_strides;
Index m_row_strides;
Index m_col_strides;
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
index 0fe0b7c46..04d6d6b23 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -241,7 +241,7 @@ struct dimino_first_step_elements
* multiplying all elements in the given subgroup with the new
* coset representative. Note that the first element of the
* subgroup is always the identity element, so the first element of
- * ther result of this template is going to be the coset
+ * the result of this template is going to be the coset
* representative itself.
*
* Note that this template accepts an additional boolean parameter
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 71d55552d..0a7181102 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -33,10 +33,10 @@ namespace Eigen {
// ec.Notify(true);
//
// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
-// cheap, but they are executed only if the preceeding predicate check has
+// cheap, but they are executed only if the preceding predicate check has
// failed.
//
-// Algorihtm outline:
+// Algorithm outline:
// There are two main variables: predicate (managed by user) and state_.
// Operation closely resembles Dekker mutual algorithm:
// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
@@ -79,7 +79,7 @@ class EventCount {
uint64_t state = state_.load(std::memory_order_seq_cst);
for (;;) {
if (int64_t((state & kEpochMask) - epoch) < 0) {
- // The preceeding waiter has not decided on its fate. Wait until it
+ // The preceding waiter has not decided on its fate. Wait until it
// calls either CancelWait or CommitWait, or is notified.
EIGEN_THREAD_YIELD();
state = state_.load(std::memory_order_seq_cst);
@@ -110,7 +110,7 @@ class EventCount {
uint64_t state = state_.load(std::memory_order_relaxed);
for (;;) {
if (int64_t((state & kEpochMask) - epoch) < 0) {
- // The preceeding waiter has not decided on its fate. Wait until it
+ // The preceding waiter has not decided on its fate. Wait until it
// calls either CancelWait or CommitWait, or is notified.
EIGEN_THREAD_YIELD();
state = state_.load(std::memory_order_relaxed);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 1264a0270..ecd49f382 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -14,15 +14,15 @@
namespace Eigen {
template <typename Environment>
-class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
+class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
public:
typedef typename Environment::Task Task;
typedef RunQueue<Task, 1024> Queue;
- NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
- : NonBlockingThreadPoolTempl(num_threads, true, env) {}
+ ThreadPoolTempl(int num_threads, Environment env = Environment())
+ : ThreadPoolTempl(num_threads, true, env) {}
- NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
+ ThreadPoolTempl(int num_threads, bool allow_spinning,
Environment env = Environment())
: env_(env),
num_threads_(num_threads),
@@ -66,7 +66,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
}
}
- ~NonBlockingThreadPoolTempl() {
+ ~ThreadPoolTempl() {
done_ = true;
// Now if all threads block without work, they will start exiting.
@@ -136,7 +136,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
int CurrentThreadId() const final {
const PerThread* pt =
- const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
+ const_cast<ThreadPoolTempl*>(this)->GetPerThread();
if (pt->pool == this) {
return pt->thread_id;
} else {
@@ -149,7 +149,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
struct PerThread {
constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
- NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads.
+ ThreadPoolTempl* pool; // Parent pool, or null for normal threads.
uint64_t rand; // Random generator state.
int thread_id; // Worker thread index in pool.
};
@@ -337,7 +337,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
}
};
-typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
+typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
} // namespace Eigen
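The rename above is interface-compatible apart from the type names; a minimal usage sketch, assuming the unsupported/Eigen/CXX11/ThreadPool umbrella header:

#include <unsupported/Eigen/CXX11/ThreadPool>
#include <atomic>

int main() {
  Eigen::ThreadPool pool(4);  // was Eigen::NonBlockingThreadPool before this patch
  std::atomic<int> counter(0);
  for (int i = 0; i < 100; ++i)
    pool.Schedule([&counter] { counter.fetch_add(1); });
  // The destructor sets done_ and joins the workers, as shown in the hunk above.
  return 0;
}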
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 49d0cdc36..cb3690a2e 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -198,7 +198,7 @@ class RunQueue {
};
std::mutex mutex_;
// The low log(kSize) + 1 bits in front_ and back_ contain the rolling index of
- // front/back, repsectively. The remaining bits contain modification counters
+ // front/back, respectively. The remaining bits contain modification counters
// that are incremented on Push operations. This allows us to (1) distinguish
// between empty and full conditions (if we would use log(kSize) bits for
// position, these conditions would be indistinguishable); (2) obtain
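A small numeric illustration of the encoding this comment describes, assuming kSize = 1024 as in the Queue typedef above; the constant and function names are hypothetical:

#include <cstdint>

// log2(1024) = 10, so the low 11 bits carry the rolling position and the
// remaining upper bits carry the modification counter incremented on Push.
constexpr unsigned kIndexBits = 10 + 1;
constexpr uint64_t kIndexMask = (uint64_t(1) << kIndexBits) - 1;

uint64_t position(uint64_t frontOrBack) { return frontOrBack & kIndexMask; }
uint64_t modificationCount(uint64_t frontOrBack) { return frontOrBack >> kIndexBits; }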
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
deleted file mode 100644
index 335728665..000000000
--- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ /dev/null
@@ -1,162 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
-#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
-
-namespace Eigen {
-
-// The implementation of the ThreadPool type ensures that the Schedule method
-// runs the functions it is provided in FIFO order when the scheduling is done
-// by a single thread.
-// Environment provides a way to create threads and also allows to intercept
-// task submission and execution.
-template <typename Environment>
-class SimpleThreadPoolTempl : public ThreadPoolInterface {
- public:
- // Construct a pool that contains "num_threads" threads.
- explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
- : env_(env), threads_(num_threads), waiters_(num_threads) {
- for (int i = 0; i < num_threads; i++) {
- threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));
- }
- }
-
- // Wait until all scheduled work has finished and then destroy the
- // set of threads.
- ~SimpleThreadPoolTempl() {
- {
- // Wait for all work to get done.
- std::unique_lock<std::mutex> l(mu_);
- while (!pending_.empty()) {
- empty_.wait(l);
- }
- exiting_ = true;
-
- // Wakeup all waiters.
- for (auto w : waiters_) {
- w->ready = true;
- w->task.f = nullptr;
- w->cv.notify_one();
- }
- }
-
- // Wait for threads to finish.
- for (auto t : threads_) {
- delete t;
- }
- }
-
- // Schedule fn() for execution in the pool of threads. The functions are
- // executed in the order in which they are scheduled.
- void Schedule(std::function<void()> fn) final {
- Task t = env_.CreateTask(std::move(fn));
- std::unique_lock<std::mutex> l(mu_);
- if (waiters_.empty()) {
- pending_.push_back(std::move(t));
- } else {
- Waiter* w = waiters_.back();
- waiters_.pop_back();
- w->ready = true;
- w->task = std::move(t);
- w->cv.notify_one();
- }
- }
-
- void Cancel() {
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
- for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->OnCancel();
- }
-#endif
- }
-
- int NumThreads() const final {
- return static_cast<int>(threads_.size());
- }
-
- int CurrentThreadId() const final {
- const PerThread* pt = this->GetPerThread();
- if (pt->pool == this) {
- return pt->thread_id;
- } else {
- return -1;
- }
- }
-
- protected:
- void WorkerLoop(int thread_id) {
- std::unique_lock<std::mutex> l(mu_);
- PerThread* pt = GetPerThread();
- pt->pool = this;
- pt->thread_id = thread_id;
- Waiter w;
- Task t;
- while (!exiting_) {
- if (pending_.empty()) {
- // Wait for work to be assigned to me
- w.ready = false;
- waiters_.push_back(&w);
- while (!w.ready) {
- w.cv.wait(l);
- }
- t = w.task;
- w.task.f = nullptr;
- } else {
- // Pick up pending work
- t = std::move(pending_.front());
- pending_.pop_front();
- if (pending_.empty()) {
- empty_.notify_all();
- }
- }
- if (t.f) {
- mu_.unlock();
- env_.ExecuteTask(t);
- t.f = nullptr;
- mu_.lock();
- }
- }
- }
-
- private:
- typedef typename Environment::Task Task;
- typedef typename Environment::EnvThread Thread;
-
- struct Waiter {
- std::condition_variable cv;
- Task task;
- bool ready;
- };
-
- struct PerThread {
- constexpr PerThread() : pool(NULL), thread_id(-1) { }
- SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads.
- int thread_id; // Worker thread index in pool.
- };
-
- Environment env_;
- std::mutex mu_;
- MaxSizeVector<Thread*> threads_; // All threads
- MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
- std::deque<Task> pending_; // Queue of pending work
- std::condition_variable empty_; // Signaled on pending_.empty()
- bool exiting_ = false;
-
- PerThread* GetPerThread() const {
- EIGEN_THREAD_LOCAL PerThread per_thread;
- return &per_thread;
- }
-};
-
-typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 49d315a66..8de3bbcab 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -268,7 +268,7 @@ template<
typename Reducer
> struct reduce<Reducer>
{
- constexpr static inline int run() { return Reducer::Identity; }
+ EIGEN_DEVICE_FUNC constexpr static inline int run() { return Reducer::Identity; }
};
template<
@@ -276,7 +276,7 @@ template<
typename A
> struct reduce<Reducer, A>
{
- constexpr static inline A run(A a) { return a; }
+ EIGEN_DEVICE_FUNC constexpr static inline A run(A a) { return a; }
};
template<
@@ -285,7 +285,7 @@ template<
typename... Ts
> struct reduce<Reducer, A, Ts...>
{
- constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+ EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
}
};
@@ -324,7 +324,7 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
// does...
template<typename... Ts>
-constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+EIGEN_DEVICE_FUNC constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
{
return reduce<product_op, Ts...>::run(ts...);
}
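The hunks above only add EIGEN_DEVICE_FUNC so these helpers become callable from device code; the compile-time semantics are unchanged. A quick sketch of what the reduce/arg_prod machinery computes (any header that pulls in CXX11Meta.h works; the Tensor header is used here as a convenient entry point):

#include <unsupported/Eigen/CXX11/Tensor>

// reduce<product_op, ...> folds its arguments at compile time.
static_assert(Eigen::internal::arg_prod(2, 3, 4) == 24,
              "arg_prod multiplies all of its arguments");

int main() { return 0; }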
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 96b3a8261..d91662d96 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen {
template <typename T, size_t n> class array {
@@ -219,7 +219,7 @@ template<class T, std::size_t N> struct array_size<const array<T,N>& > {
#else
-// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
#include <array>
namespace Eigen {
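Either branch of the #if yields the same user-facing type, so a sketch like the following compiles against both the emulation and std::array (assuming a C++11 build, where Eigen::array aliases std::array):

#include <unsupported/Eigen/CXX11/Tensor>  // provides Eigen::array

int main() {
  Eigen::array<int, 3> dims = {{4, 5, 6}};  // std::array or the emulated class
  return dims[0] + dims[1] + dims[2] - 15;  // exits with 0 either way
}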
diff --git a/unsupported/Eigen/NonLinearOptimization b/unsupported/Eigen/NonLinearOptimization
index 600ab4c12..28eaa5f29 100644
--- a/unsupported/Eigen/NonLinearOptimization
+++ b/unsupported/Eigen/NonLinearOptimization
@@ -30,12 +30,12 @@
* actually linear. But if this is so, you should probably use other
* methods better suited to this special case.
*
- * One algorithm allows to find an extremum of such a system (Levenberg
- * Marquardt algorithm) and the second one is used to find
+ * One algorithm allows one to find a least-squares solution of such a system
+ * (Levenberg-Marquardt algorithm) and the second one is used to find
* a zero for the system (Powell hybrid "dogleg" method).
*
* This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
- * Minpack is a very famous, old, robust and well-reknown package, written in
+ * Minpack is a very famous, old, robust and well-renowned package, written in
* Fortran. Those implementations have been carefully tuned, tested, and used
* for several decades.
*
@@ -58,35 +58,41 @@
* There are two kinds of tests: those that come from examples bundled with cminpack.
* They guarantee we get the same results as the original algorithms (value for 'x',
* for the number of evaluations of the function, and for the number of evaluations
- * of the jacobian if ever).
+ * of the Jacobian, if any).
*
* Other tests were added by myself at the very beginning of the
- * process and check the results for levenberg-marquardt using the reference data
+ * process and check the results for Levenberg-Marquardt using the reference data
* on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then I've
- * carefully checked that the same results were obtained when modifiying the
+ * carefully checked that the same results were obtained when modifying the
* code. Please note that we do not always get the exact same decimals as they do,
* but this is OK: they use 128-bit floats, and we do the tests using the C type 'double',
* which is 64 bits on most platforms (x86 and amd64, at least).
- * I've performed those tests on several other implementations of levenberg-marquardt, and
+ * I've performed those tests on several other implementations of Levenberg-Marquardt, and
* (c)minpack performs VERY well compared to those, both in accuracy and speed.
*
* The documentation for running the tests is on the wiki
* http://eigen.tuxfamily.org/index.php?title=Tests
*
- * \section API API : overview of methods
+ * \section API API: overview of methods
*
- * Both algorithms can use either the jacobian (provided by the user) or compute
- * an approximation by themselves (actually using Eigen \ref NumericalDiff_Module).
- * The part of API referring to the latter use 'NumericalDiff' in the method names
- * (exemple: LevenbergMarquardt.minimizeNumericalDiff() )
+ * Both algorithms need a functor computing the Jacobian. It can be computed by
+ * hand, using auto-differentiation (see \ref AutoDiff_Module), or using numerical
+ * differences (see \ref NumericalDiff_Module). For instance:
+ *\code
+ * MyFunc func;
+ * NumericalDiff<MyFunc> func_with_num_diff(func);
+ * LevenbergMarquardt<NumericalDiff<MyFunc> > lm(func_with_num_diff);
+ * \endcode
+ * For HybridNonLinearSolver, the method solveNumericalDiff() does the above wrapping for
+ * you.
*
* The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and
* HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original
* minpack package that you probably should NOT use unless you are porting code that
- * was previously using minpack. They just define a 'simple' API with default values
+ * was previously using minpack. They just define a 'simple' API with default values
* for some parameters.
*
- * All algorithms are provided using Two APIs :
+ * All algorithms are provided using two APIs:
* - one where the user inits the algorithm, and uses '*OneStep()' as often as needed:
* this way the caller has control over the steps
* - one where the user just calls a method (optimize() or solve()) which will
@@ -94,7 +100,7 @@
* convenience.
*
* As an example, the method LevenbergMarquardt::minimize() is
- * implemented as follow :
+ * implemented as follows:
* \code
* Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType &x, const int mode)
* {
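To make the two call styles documented above concrete, here is a hedged, self-contained sketch; SimpleFunctor and its residual f(x) = x - (1, 2) are invented for illustration and follow the functor pattern this module expects:

#include <unsupported/Eigen/NonLinearOptimization>
#include <Eigen/Dense>

// Hypothetical functor: residuals f(x) = x - (1, 2), so the least-squares
// minimum is x = (1, 2) and the Jacobian is the identity.
struct SimpleFunctor {
  int operator()(const Eigen::VectorXd &x, Eigen::VectorXd &fvec) const {
    fvec(0) = x(0) - 1.0;
    fvec(1) = x(1) - 2.0;
    return 0;
  }
  int df(const Eigen::VectorXd &, Eigen::MatrixXd &fjac) const {
    fjac.setIdentity(2, 2);
    return 0;
  }
  int inputs() const { return 2; }
  int values() const { return 2; }
};

int main() {
  SimpleFunctor functor;
  Eigen::LevenbergMarquardt<SimpleFunctor> lm(functor);
  Eigen::VectorXd x = Eigen::VectorXd::Zero(2);
  lm.minimize(x);  // one-shot API; the '*OneStep()' loop is the alternative:
  // lm.minimizeInit(x);
  // while (lm.minimizeOneStep(x) == Eigen::LevenbergMarquardtSpace::Running) {}
  return (x - Eigen::Vector2d(1.0, 2.0)).norm() < 1e-8 ? 0 : 1;
}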
diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport
index 87f50947d..11d99567e 100644
--- a/unsupported/Eigen/OpenGLSupport
+++ b/unsupported/Eigen/OpenGLSupport
@@ -25,7 +25,7 @@ namespace Eigen {
*
* This module provides wrapper functions for a couple of OpenGL functions
* which simplify the way to pass Eigen's object to openGL.
- * Here is an exmaple:
+ * Here is an example:
*
* \code
* // You need to add path_to_eigen/unsupported to your include path.
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
index a2ad4925e..44fd99b43 100644
--- a/unsupported/Eigen/SpecialFunctions
+++ b/unsupported/Eigen/SpecialFunctions
@@ -29,11 +29,15 @@ namespace Eigen {
* - erfc
* - lgamma
* - igamma
+ * - igamma_der_a
+ * - gamma_sample_der_alpha
* - igammac
* - digamma
* - polygamma
* - zeta
* - betainc
+ * - i0e
+ * - i1e
*
* \code
* #include <unsupported/Eigen/SpecialFunctions>
@@ -49,8 +53,8 @@ namespace Eigen {
#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
-#if defined EIGEN_VECTORIZE_CUDA
- #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
+#if defined EIGEN_VECTORIZE_GPU
+ #include "src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h"
#endif
namespace Eigen {
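A short usage sketch for the array API, including one of the functions added to the list above (C++11 only and float/double only, per the notes in SpecialFunctionsArrayAPI.h):

#include <unsupported/Eigen/SpecialFunctions>
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 0.5, 2.0);
  Eigen::ArrayXd x = Eigen::ArrayXd::Constant(4, 1.0);
  std::cout << Eigen::igamma(a, x) << "\n\n"        // lower incomplete gamma
            << Eigen::igamma_der_a(a, x) << "\n";   // its derivative w.r.t. a
  return 0;
}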
diff --git a/unsupported/Eigen/src/BVH/KdBVH.h b/unsupported/Eigen/src/BVH/KdBVH.h
index 1b8d75865..13f792cd0 100644
--- a/unsupported/Eigen/src/BVH/KdBVH.h
+++ b/unsupported/Eigen/src/BVH/KdBVH.h
@@ -170,7 +170,7 @@ private:
typedef internal::vector_int_pair<Scalar, Dim> VIPair;
typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
typedef Matrix<Scalar, Dim, 1> VectorType;
- struct VectorComparator //compares vectors, or, more specificall, VIPairs along a particular dimension
+ struct VectorComparator //compares vectors, or more specifically, VIPairs along a particular dimension
{
VectorComparator(int inDim) : dim(inDim) {}
inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 866a8a460..9f7bff764 100644
--- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -300,7 +300,7 @@ public:
/** \brief Reports whether previous computation was successful.
*
- * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/
ComputationInfo info() const
{
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
index 28f52da61..65c2e94c7 100644
--- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -12,7 +12,7 @@
namespace Eigen
{
- // Forward declerations
+ // Forward declarations
template <typename _Scalar, class _System>
class EulerAngles;
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index be51b4e6f..079e88602 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -316,8 +316,8 @@ struct kissfft_impl
// use optimized mode for even real
fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
- Complex dc = dst[0].real() + dst[0].imag();
- Complex nyquist = dst[0].real() - dst[0].imag();
+ Complex dc(dst[0].real() + dst[0].imag());
+ Complex nyquist(dst[0].real() - dst[0].imag());
int k;
for ( k=1;k <= ncfft2 ; ++k ) {
Complex fpk = dst[k];
diff --git a/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h b/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
index dc0093eb9..37d5b4c6c 100644
--- a/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
+++ b/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
@@ -99,7 +99,7 @@ void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV)
/** \ingroup IterativeSolvers_Module
* Constrained conjugate gradient
*
- * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the contraint \f$ Cx \le f \f$
+ * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the constraint \f$ Cx \le f \f$
*/
template<typename TMatrix, typename CMatrix,
typename VectorX, typename VectorB, typename VectorF>
diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
index bae04fc30..85a4f696c 100644
--- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
@@ -39,7 +39,6 @@ template <typename VectorType, typename IndexType>
void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut)
{
eigen_assert(vec.size() == perm.size());
- typedef typename IndexType::Scalar Index;
bool flag;
for (Index k = 0; k < ncut; k++)
{
@@ -89,7 +88,7 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
* [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
* Algebraic Solvers for Linear Systems Arising from Compressible
* Flows, Computers and Fluids, In Press,
- * http://dx.doi.org/10.1016/j.compfluid.2012.03.023
+ * https://doi.org/10.1016/j.compfluid.2012.03.023
* [2] K. Burrage and J. Erhel, On the performance of various
* adaptive preconditioned GMRES strategies, 5(1998), 101-121.
* [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES
@@ -112,7 +111,6 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
using Base::_solve_impl;
typedef _MatrixType MatrixType;
typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::StorageIndex StorageIndex;
typedef typename MatrixType::RealScalar RealScalar;
typedef _Preconditioner Preconditioner;
@@ -146,7 +144,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
void _solve_with_guess_impl(const Rhs& b, Dest& x) const
{
bool failed = false;
- for(int j=0; j<b.cols(); ++j)
+ for(Index j=0; j<b.cols(); ++j)
{
m_iterations = Base::maxIterations();
m_error = Base::m_tolerance;
@@ -170,17 +168,17 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
/**
* Get the restart value
*/
- int restart() { return m_restart; }
+ Index restart() { return m_restart; }
/**
* Set the restart value (default is 30)
*/
- void set_restart(const int restart) { m_restart=restart; }
+ void set_restart(const Index restart) { m_restart=restart; }
/**
* Set the number of eigenvalues to deflate at each restart
*/
- void setEigenv(const int neig)
+ void setEigenv(const Index neig)
{
m_neig = neig;
if (neig+1 > m_maxNeig) m_maxNeig = neig+1; // To allow for complex conjugates
@@ -189,12 +187,12 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
/**
* Get the size of the deflation subspace size
*/
- int deflSize() {return m_r; }
+ Index deflSize() {return m_r; }
/**
* Set the maximum size of the deflation subspace
*/
- void setMaxEigenv(const int maxNeig) { m_maxNeig = maxNeig; }
+ void setMaxEigenv(const Index maxNeig) { m_maxNeig = maxNeig; }
protected:
// DGMRES algorithm
@@ -202,27 +200,27 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
void dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, const Preconditioner& precond) const;
// Perform one cycle of GMRES
template<typename Dest>
- int dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const;
+ Index dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const;
// Compute data to use for deflation
- int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
+ Index dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
// Apply deflation to a vector
template<typename RhsType, typename DestType>
- int dgmresApplyDeflation(const RhsType& In, DestType& Out) const;
+ Index dgmresApplyDeflation(const RhsType& In, DestType& Out) const;
ComplexVector schurValues(const ComplexSchur<DenseMatrix>& schurofH) const;
ComplexVector schurValues(const RealSchur<DenseMatrix>& schurofH) const;
// Init data for deflation
void dgmresInitDeflation(Index& rows) const;
mutable DenseMatrix m_V; // Krylov basis vectors
mutable DenseMatrix m_H; // Hessenberg matrix
- mutable DenseMatrix m_Hes; // Initial hessenberg matrix wihout Givens rotations applied
+ mutable DenseMatrix m_Hes; // Initial hessenberg matrix without Givens rotations applied
mutable Index m_restart; // Maximum size of the Krylov subspace
mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace
mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */
mutable PartialPivLU<DenseMatrix> m_luT; // LU factorization of m_T
mutable StorageIndex m_neig; //Number of eigenvalues to extract at each restart
- mutable int m_r; // Current number of deflated eigenvalues, size of m_U
- mutable int m_maxNeig; // Maximum number of eigenvalues to deflate
+ mutable Index m_r; // Current number of deflated eigenvalues, size of m_U
+ mutable Index m_maxNeig; // Maximum number of eigenvalues to deflate
mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A
mutable bool m_isDeflAllocated;
mutable bool m_isDeflInitialized;
@@ -244,13 +242,13 @@ void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rh
const Preconditioner& precond) const
{
//Initialization
- int n = mat.rows();
+ Index n = mat.rows();
DenseVector r0(n);
- int nbIts = 0;
+ Index nbIts = 0;
m_H.resize(m_restart+1, m_restart);
m_Hes.resize(m_restart, m_restart);
m_V.resize(n,m_restart+1);
- //Initial residual vector and intial norm
+ //Initial residual vector and initial norm
x = precond.solve(x);
r0 = rhs - mat * x;
RealScalar beta = r0.norm();
@@ -284,7 +282,7 @@ void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rh
*/
template< typename _MatrixType, typename _Preconditioner>
template<typename Dest>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const
{
//Initialization
DenseVector g(m_restart+1); // Right hand side of the least square problem
@@ -293,8 +291,8 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, con
m_V.col(0) = r0/beta;
m_info = NoConvergence;
std::vector<JacobiRotation<Scalar> >gr(m_restart); // Givens rotations
- int it = 0; // Number of inner iterations
- int n = mat.rows();
+ Index it = 0; // Number of inner iterations
+ Index n = mat.rows();
DenseVector tv1(n), tv2(n); //Temporary vectors
while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations)
{
@@ -312,7 +310,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, con
// Orthogonalize it with the previous basis in the basis using modified Gram-Schmidt
Scalar coef;
- for (int i = 0; i <= it; ++i)
+ for (Index i = 0; i <= it; ++i)
{
coef = tv1.dot(m_V.col(i));
tv1 = tv1 - coef * m_V.col(i);
@@ -328,7 +326,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, con
// FIXME Check for happy breakdown
// Update Hessenberg matrix with Givens rotations
- for (int i = 1; i <= it; ++i)
+ for (Index i = 1; i <= it; ++i)
{
m_H.col(it).applyOnTheLeft(i-1,i,gr[i-1].adjoint());
}
@@ -394,7 +392,6 @@ inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_Matr
template< typename _MatrixType, typename _Preconditioner>
inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const RealSchur<DenseMatrix>& schurofH) const
{
- typedef typename MatrixType::Index Index;
const DenseMatrix& T = schurofH.matrixT();
Index it = T.rows();
ComplexVector eig(it);
@@ -418,7 +415,7 @@ inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_Matr
}
template< typename _MatrixType, typename _Preconditioner>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
{
// First, find the Schur form of the Hessenberg matrix H
typename internal::conditional<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> >::type schurofH;
@@ -433,8 +430,8 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
// Reorder the absolute values of Schur values
DenseRealVector modulEig(it);
- for (int j=0; j<it; ++j) modulEig(j) = std::abs(eig(j));
- perm.setLinSpaced(it,0,it-1);
+ for (Index j=0; j<it; ++j) modulEig(j) = std::abs(eig(j));
+ perm.setLinSpaced(it,0,internal::convert_index<StorageIndex>(it-1));
internal::sortWithPermutation(modulEig, perm, neig);
if (!m_lambdaN)
@@ -442,7 +439,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
m_lambdaN = (std::max)(modulEig.maxCoeff(), m_lambdaN);
}
//Count the real number of extracted eigenvalues (with complex conjugates)
- int nbrEig = 0;
+ Index nbrEig = 0;
while (nbrEig < neig)
{
if(eig(perm(it-nbrEig-1)).imag() == RealScalar(0)) nbrEig++;
@@ -451,7 +448,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
// Extract the Schur vectors corresponding to the smallest Ritz values
DenseMatrix Sr(it, nbrEig);
Sr.setZero();
- for (int j = 0; j < nbrEig; j++)
+ for (Index j = 0; j < nbrEig; j++)
{
Sr.col(j) = schurofH.matrixU().col(perm(it-j-1));
}
@@ -462,8 +459,8 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
if (m_r)
{
// Orthogonalize X against m_U using modified Gram-Schmidt
- for (int j = 0; j < nbrEig; j++)
- for (int k =0; k < m_r; k++)
+ for (Index j = 0; j < nbrEig; j++)
+ for (Index k =0; k < m_r; k++)
X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j)))*m_U.col(k);
}
@@ -473,7 +470,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
dgmresInitDeflation(m);
DenseMatrix MX(m, nbrEig);
DenseVector tv1(m);
- for (int j = 0; j < nbrEig; j++)
+ for (Index j = 0; j < nbrEig; j++)
{
tv1 = mat * X.col(j);
MX.col(j) = precond.solve(tv1);
@@ -488,8 +485,8 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
}
// Save X into m_U and m_MX in m_MU
- for (int j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
- for (int j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
+ for (Index j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
+ for (Index j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
// Increase the size of the invariant subspace
m_r += nbrEig;
@@ -502,7 +499,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
}
template<typename _MatrixType, typename _Preconditioner>
template<typename RhsType, typename DestType>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
{
DenseVector x1 = m_U.leftCols(m_r).transpose() * x;
y = x + m_U.leftCols(m_r) * ( m_lambdaN * m_luT.solve(x1) - x1);
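The int-to-Index migration in this file does not change the solver's interface; a minimal usage sketch (the identity system is a placeholder):

#include <unsupported/Eigen/IterativeSolvers>
#include <Eigen/Sparse>

int main() {
  Eigen::SparseMatrix<double> A(100, 100);
  A.setIdentity();                                  // placeholder system
  Eigen::VectorXd b = Eigen::VectorXd::Ones(100);

  Eigen::DGMRES<Eigen::SparseMatrix<double> > solver;
  solver.set_restart(30);  // accessors shown above, now taking Index
  solver.setEigenv(1);     // eigenvalues to deflate at each restart
  solver.compute(A);
  Eigen::VectorXd x = solver.solve(b);
  return solver.info() == Eigen::Success ? 0 : 1;
}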
diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h
index 256990c1a..3a5c73eaf 100644
--- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h
@@ -3,6 +3,7 @@
//
// Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018 David Hyde <dabh@stanford.edu>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -64,8 +65,6 @@ namespace Eigen {
eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
RealScalar beta_new(sqrt(beta_new2));
const RealScalar beta_one(beta_new);
- v_new /= beta_new;
- w_new /= beta_new;
// Initialize other variables
RealScalar c(1.0); // the cosine of the Givens rotation
RealScalar c_old(1.0);
@@ -83,18 +82,18 @@ namespace Eigen {
/* Note that there are 4 variants on the Lanczos algorithm. These are
* described in Paige, C. C. (1972). Computational variants of
* the Lanczos method for the eigenproblem. IMA Journal of Applied
- * Mathematics, 10(3), 373–381. The current implementation corresponds
+ * Mathematics, 10(3), 373-381. The current implementation corresponds
* to the case A(2,7) in the paper. It also corresponds to
- * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
+ * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
* Systems, 2003 p.173. For the preconditioned version see
* A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
*/
const RealScalar beta(beta_new);
v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
-// const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT
+ v_new /= beta_new; // overwrite v_new for next iteration
+ w_new /= beta_new; // overwrite w_new for next iteration
v = v_new; // update
w = w_new; // update
-// const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT
v_new.noalias() = mat*w - beta*v_old; // compute v_new
const RealScalar alpha = v_new.dot(w);
v_new -= alpha*v; // overwrite v_new
@@ -102,8 +101,6 @@ namespace Eigen {
beta_new2 = v_new.dot(w_new); // compute beta_new
eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
beta_new = sqrt(beta_new2); // compute beta_new
- v_new /= beta_new; // overwrite v_new for next iteration
- w_new /= beta_new; // overwrite w_new for next iteration
// Givens rotation
const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration
@@ -117,7 +114,6 @@ namespace Eigen {
// Update solution
p_oold = p_old;
-// const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT
p_old = p;
p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
x += beta_one*c*eta*p;
@@ -286,4 +282,3 @@ namespace Eigen {
} // end namespace Eigen
#endif // EIGEN_MINRES_H
-
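The hunks above move the v_new/w_new normalization to the top of the Lanczos loop; the public interface is untouched. A minimal call sketch (A must be symmetric, as MINRES requires; the identity is a placeholder):

#include <unsupported/Eigen/IterativeSolvers>  // provides Eigen::MINRES
#include <Eigen/Sparse>

int main() {
  Eigen::SparseMatrix<double> A(50, 50);
  A.setIdentity();                              // placeholder symmetric matrix
  Eigen::VectorXd b = Eigen::VectorXd::Ones(50);

  Eigen::MINRES<Eigen::SparseMatrix<double>, Eigen::Lower | Eigen::Upper> mr(A);
  Eigen::VectorXd x = mr.solve(b);
  return mr.info() == Eigen::Success ? 0 : 1;
}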
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
index ae9d793b1..123485817 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
@@ -73,7 +73,7 @@ void lmqrsolv(
qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
wa[k] = temp;
- /* accumulate the tranformation in the row of s. */
+ /* accumulate the transformation in the row of s. */
for (i = k+1; i<n; ++i) {
temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index 995427978..62561da1d 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -117,7 +117,7 @@ class LevenbergMarquardt : internal::no_assignment_operator
typedef typename JacobianType::RealScalar RealScalar;
typedef typename QRSolver::StorageIndex PermIndex;
typedef Matrix<Scalar,Dynamic,1> FVectorType;
- typedef PermutationMatrix<Dynamic,Dynamic> PermutationType;
+ typedef PermutationMatrix<Dynamic,Dynamic,int> PermutationType;
public:
LevenbergMarquardt(FunctorType& functor)
: m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0),
@@ -233,9 +233,9 @@ class LevenbergMarquardt : internal::no_assignment_operator
/**
* \brief Reports whether the minimization was successful
- * \returns \c Success if the minimization was succesful,
+ * \returns \c Success if the minimization was successful,
* \c NumericalIssue if a numerical problem arises during the
- * minimization process, for exemple during the QR factorization
+ * minimization process, for example during the QR factorization
* \c NoConvergence if the minimization did not converge after
* the maximum number of function evaluation allowed
* \c InvalidInput if the input matrix is invalid
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index 85ab3d97c..54037d58d 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -313,7 +313,7 @@ struct matrix_exp_computeUV<MatrixType, long double>
matrix_exp_pade17(A, U, V);
}
-#elif LDBL_MANT_DIG <= 112 // quadruple precison
+#elif LDBL_MANT_DIG <= 112 // quadruple precision
if (l1norm < 1.639394610288918690547467954466970e-005L) {
matrix_exp_pade3(arg, U, V);
@@ -395,7 +395,6 @@ void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // d
template<typename Derived> struct MatrixExponentialReturnValue
: public ReturnByValue<MatrixExponentialReturnValue<Derived> >
{
- typedef typename Derived::Index Index;
public:
/** \brief Constructor.
*
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index ef50c46a9..133d78625 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -53,7 +53,7 @@ template <typename MatrixType>
typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A)
{
typedef typename plain_col_type<MatrixType>::type VectorType;
- typename MatrixType::Index rows = A.rows();
+ Index rows = A.rows();
const MatrixType N = MatrixType::Identity(rows, rows) - A;
VectorType e = VectorType::Ones(rows);
N.template triangularView<Upper>().solveInPlace(e);
@@ -65,7 +65,6 @@ MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
{
// TODO: Use that A is upper triangular
typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef typename MatrixType::Index Index;
Index rows = A.rows();
Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
@@ -131,7 +130,6 @@ typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOf
template <typename EivalsType, typename Cluster>
void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
{
- typedef typename EivalsType::Index Index;
typedef typename EivalsType::RealScalar RealScalar;
for (Index i=0; i<eivals.rows(); ++i) {
// Find cluster containing i-th ei'val, adding a new cluster if necessary
@@ -179,7 +177,7 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
{
blockStart.resize(clusterSize.rows());
blockStart(0) = 0;
- for (typename VectorType::Index i = 1; i < clusterSize.rows(); i++) {
+ for (Index i = 1; i < clusterSize.rows(); i++) {
blockStart(i) = blockStart(i-1) + clusterSize(i-1);
}
}
@@ -188,7 +186,6 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
template <typename EivalsType, typename ListOfClusters, typename VectorType>
void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster)
{
- typedef typename EivalsType::Index Index;
eivalToCluster.resize(eivals.rows());
Index clusterIndex = 0;
for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
@@ -205,7 +202,6 @@ void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters&
template <typename DynVectorType, typename VectorType>
void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation)
{
- typedef typename VectorType::Index Index;
DynVectorType indexNextEntry = blockStart;
permutation.resize(eivalToCluster.rows());
for (Index i = 0; i < eivalToCluster.rows(); i++) {
@@ -219,7 +215,6 @@ void matrix_function_compute_permutation(const DynVectorType& blockStart, const
template <typename VectorType, typename MatrixType>
void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T)
{
- typedef typename VectorType::Index Index;
for (Index i = 0; i < permutation.rows() - 1; i++) {
Index j;
for (j = i; j < permutation.rows(); j++) {
@@ -247,7 +242,7 @@ template <typename MatrixType, typename AtomicType, typename VectorType>
void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
{
fT.setZero(T.rows(), T.cols());
- for (typename VectorType::Index i = 0; i < clusterSize.rows(); ++i) {
+ for (Index i = 0; i < clusterSize.rows(); ++i) {
fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
= atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
}
@@ -285,7 +280,6 @@ MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const
eigen_assert(C.rows() == A.rows());
eigen_assert(C.cols() == B.rows());
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
Index m = A.rows();
@@ -330,11 +324,8 @@ void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorTyp
{
typedef internal::traits<MatrixType> Traits;
typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::Index Index;
- static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
- static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
static const int Options = MatrixType::Options;
- typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+ typedef Matrix<Scalar, Dynamic, Dynamic, Options, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
for (Index k = 1; k < clusterSize.rows(); k++) {
for (Index i = 0; i < clusterSize.rows() - k; i++) {
@@ -481,7 +472,6 @@ template<typename Derived> class MatrixFunctionReturnValue
{
public:
typedef typename Derived::Scalar Scalar;
- typedef typename Derived::Index Index;
typedef typename internal::stem_function<Scalar>::type StemFunction;
protected:
@@ -506,10 +496,8 @@ template<typename Derived> class MatrixFunctionReturnValue
typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
typedef typename internal::remove_all<NestedEvalType>::type NestedEvalTypeClean;
typedef internal::traits<NestedEvalTypeClean> Traits;
- static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
- static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
- typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+ typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
AtomicType atomic(m_f);
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index ff8f6e732..a8d879a12 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -332,10 +332,8 @@ public:
typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
typedef internal::traits<DerivedEvalTypeClean> Traits;
- static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
- static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
- typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+ typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
AtomicType atomic;
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index ebc433d89..1ceb5cf39 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -40,7 +40,6 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
{
public:
typedef typename MatrixType::RealScalar RealScalar;
- typedef typename MatrixType::Index Index;
/**
* \brief Constructor.
@@ -81,7 +80,7 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
*
* \note Currently this class is only used by MatrixPower. One may
* insist that this be nested into MatrixPower. This class is here to
- * faciliate future development of triangular matrix functions.
+ * facilitate future development of triangular matrix functions.
*/
template<typename MatrixType>
class MatrixPowerAtomic : internal::noncopyable
@@ -94,7 +93,6 @@ class MatrixPowerAtomic : internal::noncopyable
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
typedef std::complex<RealScalar> ComplexScalar;
- typedef typename MatrixType::Index Index;
typedef Block<MatrixType,Dynamic,Dynamic> ResultType;
const MatrixType& m_A;
@@ -340,7 +338,6 @@ class MatrixPower : internal::noncopyable
private:
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
- typedef typename MatrixType::Index Index;
public:
/**
@@ -600,7 +597,6 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Deri
public:
typedef typename Derived::PlainObject PlainObject;
typedef typename Derived::RealScalar RealScalar;
- typedef typename Derived::Index Index;
/**
* \brief Constructor.
@@ -648,7 +644,6 @@ class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerRe
public:
typedef typename Derived::PlainObject PlainObject;
typedef typename std::complex<typename Derived::RealScalar> ComplexScalar;
- typedef typename Derived::Index Index;
/**
* \brief Constructor.
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index afd88ec4d..34bf78913 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -17,7 +17,7 @@ namespace internal {
// pre: T.block(i,i,2,2) has complex conjugate eigenvalues
// post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typename MatrixType::Index i, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT)
{
// TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
// in EigenSolver. If we expose it, we could call it directly from here.
@@ -32,7 +32,7 @@ void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typena
// all blocks of sqrtT to left of and below (i,j) are correct
// post: sqrtT(i,j) has the correct value
template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
{
typedef typename traits<MatrixType>::Scalar Scalar;
Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
@@ -41,7 +41,7 @@ void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, ty
// similar to compute1x1offDiagonalBlock()
template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
{
typedef typename traits<MatrixType>::Scalar Scalar;
Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
@@ -54,7 +54,7 @@ void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, ty
// similar to compute1x1offDiagonalBlock()
template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
{
typedef typename traits<MatrixType>::Scalar Scalar;
Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
@@ -101,7 +101,7 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
// similar to compute1x1offDiagonalBlock()
template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
{
typedef typename traits<MatrixType>::Scalar Scalar;
Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
@@ -120,7 +120,6 @@ template <typename MatrixType, typename ResultType>
void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT)
{
using std::sqrt;
- typedef typename MatrixType::Index Index;
const Index size = T.rows();
for (Index i = 0; i < size; i++) {
if (i == size - 1 || T.coeff(i+1, i) == 0) {
@@ -139,7 +138,6 @@ void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrt
template <typename MatrixType, typename ResultType>
void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT)
{
- typedef typename MatrixType::Index Index;
const Index size = T.rows();
for (Index j = 1; j < size; j++) {
if (T.coeff(j, j-1) != 0) // if T(j-1:j, j-1:j) is a 2-by-2 block
@@ -206,8 +204,7 @@ template <typename MatrixType, typename ResultType>
void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result)
{
using std::sqrt;
- typedef typename MatrixType::Index Index;
- typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Scalar Scalar;
eigen_assert(arg.rows() == arg.cols());
@@ -318,7 +315,6 @@ template<typename Derived> class MatrixSquareRootReturnValue
: public ReturnByValue<MatrixSquareRootReturnValue<Derived> >
{
protected:
- typedef typename Derived::Index Index;
typedef typename internal::ref_selector<Derived>::type DerivedNested;
public:
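These hunks replace per-class Derived::Index typedefs with the library-wide Eigen::Index; the user-facing entry points are unchanged. For context, a small sketch of the MatrixFunctions API touched here (an SPD matrix, so exp, sqrt and pow are all well defined):

#include <unsupported/Eigen/MatrixFunctions>
#include <iostream>

int main() {
  Eigen::MatrixXd A(2, 2);
  A << 2, 1,
       1, 2;                          // symmetric positive definite
  std::cout << A.exp()    << "\n\n"   // MatrixExponential
            << A.sqrt()   << "\n\n"   // MatrixSquareRoot
            << A.pow(0.5) << "\n";    // MatrixPower
  return 0;
}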
diff --git a/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h b/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
index feafd62a8..4f2f560b3 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
@@ -61,7 +61,7 @@ void qrsolv(
qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
wa[k] = temp;
- /* accumulate the tranformation in the row of s. */
+ /* accumulate the transformation in the row of s. */
for (i = k+1; i<n; ++i) {
temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
diff --git a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
index f28766061..09fc65255 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -22,7 +22,7 @@ void r1updt(
Scalar temp;
JacobiRotation<Scalar> givens;
- // r1updt had a broader usecase, but we dont use it here. And, more
+ // r1updt had a broader use case, but we don't use it here. And, more
// importantly, we can not test it.
eigen_assert(m==n);
eigen_assert(u.size()==m);
diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h
index e0af6ebe4..126be783b 100644
--- a/unsupported/Eigen/src/Polynomials/Companion.h
+++ b/unsupported/Eigen/src/Polynomials/Companion.h
@@ -89,13 +89,13 @@ class companion
{
const Index deg = m_monic.size();
const Index deg_1 = deg-1;
- DenseCompanionMatrixType companion(deg,deg);
- companion <<
+ DenseCompanionMatrixType companMat(deg,deg);
+ companMat <<
( LeftBlock(deg,deg_1)
<< LeftBlockFirstRow::Zero(1,deg_1),
BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished()
, m_monic;
- return companion;
+ return companMat;
}
@@ -104,7 +104,7 @@ class companion
/** Helper function for the balancing algorithm.
* \returns true if the row and the column, having colNorm and rowNorm
* as norms, are balanced, false otherwise.
- * colB and rowB are repectively the multipliers for
+ * colB and rowB are respectively the multipliers for
* the column and the row in order to balance them.
* */
bool balanced( RealScalar colNorm, RealScalar rowNorm,
@@ -113,7 +113,7 @@ class companion
/** Helper function for the balancing algorithm.
* \returns true if the row and the column, having colNorm and rowNorm
* as norms, are balanced, false otherwise.
- * colB and rowB are repectively the multipliers for
+ * colB and rowB are respectively the multipliers for
* the column and the row in order to balance them.
* */
bool balancedR( RealScalar colNorm, RealScalar rowNorm,
diff --git a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
index a1f54ed35..bda057a85 100644
--- a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
+++ b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
@@ -41,7 +41,7 @@ public:
/** Sets the relative threshold value used to prune zero coefficients during the decomposition.
*
- * Setting a value greater than zero speeds up computation, and yields to an imcomplete
+ * Setting a value greater than zero speeds up computation, and yields an incomplete
* factorization with fewer non zero coefficients. Such approximate factors are especially
* useful to initialize an iterative solver.
*
diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
index a2a8933ca..f77d79a04 100644
--- a/unsupported/Eigen/src/Skyline/SkylineMatrix.h
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
@@ -206,26 +206,26 @@ public:
if (col > row) //upper matrix
{
const Index minOuterIndex = inner - m_data.upperProfile(inner);
- eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+ eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
}
if (col < row) //lower matrix
{
const Index minInnerIndex = outer - m_data.lowerProfile(outer);
- eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+ eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
}
} else {
if (outer > inner) //upper matrix
{
const Index maxOuterIndex = inner + m_data.upperProfile(inner);
- eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+ eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
}
if (outer < inner) //lower matrix
{
const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
- eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+ eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
}
}
@@ -300,11 +300,11 @@ public:
if (IsRowMajor) {
const Index minInnerIndex = outer - m_data.lowerProfile(outer);
- eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+ eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
} else {
const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
- eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+ eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
}
}
@@ -336,11 +336,11 @@ public:
if (IsRowMajor) {
const Index minOuterIndex = inner - m_data.upperProfile(inner);
- eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+ eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
} else {
const Index maxOuterIndex = inner + m_data.upperProfile(inner);
- eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+ eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
}
}
diff --git a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
index 037a13f86..3f1ff14ad 100644
--- a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
+++ b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
@@ -187,7 +187,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
/** Does nothing: provided for compatibility with SparseMatrix */
inline void finalize() {}
- /** Suppress all nonzeros which are smaller than \a reference under the tolerence \a epsilon */
+ /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */
void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
{
for (Index j=0; j<outerSize(); ++j)
@@ -224,21 +224,21 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
}
}
- /** The class DynamicSparseMatrix is deprectaed */
+ /** The class DynamicSparseMatrix is deprecated */
EIGEN_DEPRECATED inline DynamicSparseMatrix()
: m_innerSize(0), m_data(0)
{
eigen_assert(innerSize()==0 && outerSize()==0);
}
- /** The class DynamicSparseMatrix is deprectaed */
+ /** The class DynamicSparseMatrix is deprecated */
EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
: m_innerSize(0)
{
resize(rows, cols);
}
- /** The class DynamicSparseMatrix is deprectaed */
+ /** The class DynamicSparseMatrix is deprecated */
template<typename OtherDerived>
EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
: m_innerSize(0)
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index fc70a24d8..1618b09a8 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -104,11 +104,12 @@ namespace internal
out << value.real() << " " << value.imag() << "\n";
}
-} // end namepsace internal
+} // end namespace internal
inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
{
sym = 0;
+ iscomplex = false;
isvector = false;
std::ifstream in(filename.c_str(),std::ios::in);
if(!in)
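
A minimal caller sketch for getMarketHeader (not part of the patch; "matrix.mtx" is a placeholder path). It illustrates why the added initialization matters: without it, a stale value of iscomplex could leak through when the header parse leaves the flag untouched.

#include <iostream>
#include <Eigen/Sparse>
#include <unsupported/Eigen/SparseExtra>

int main() {
  int sym = 0;
  bool iscomplex = true, isvector = true;  // deliberately stale values
  if (Eigen::getMarketHeader("matrix.mtx", sym, iscomplex, isvector))
    std::cout << "complex: " << iscomplex << ", vector: " << isvector << "\n";
  else
    std::cout << "could not read header\n";
  return 0;
}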
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
index ed415db99..30cdf4751 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -33,6 +33,48 @@ igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerive
);
}
+/** \cpp11 \returns an expression of the coefficient-wise igamma_der_a(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise derivative of the incomplete
+ * gamma function with respect to the parameter a.
+ *
+ * \note This function supports only float and double scalar types in C++11
+ * mode. To support other scalar types, or float/double in non-C++11 mode,
+ * the user has to provide implementations of igamma_der_a(T,T) for any
+ * scalar type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template <typename Derived, typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+ a.derived(),
+ x.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise gamma_sample_der_alpha(\a alpha, \a sample) to the given arrays.
+ *
+ * This function computes the coefficient-wise derivative of the sample
+ * of a Gamma(alpha, 1) random variable with respect to the parameter alpha.
+ *
+ * \note This function supports only float and double scalar types in C++11
+ * mode. To support other scalar types, or float/double in non-C++11 mode,
+ * the user has to provide implementations of gamma_sample_der_alpha(T,T)
+ * for any scalar type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template <typename AlphaDerived, typename SampleDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>
+gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen::ArrayBase<SampleDerived>& sample) {
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>(
+ alpha.derived(),
+ sample.derived());
+}
+
/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
*
* This function computes the coefficient-wise complementary incomplete gamma function.
@@ -119,6 +161,52 @@ zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
);
}
+/** \returns an expression of the coefficient-wise i0e(\a x) applied to the
+ * given array.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i0e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::i0e()
+ */
+template <typename Derived>
+inline const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_i0e_op<typename Derived::Scalar>, const Derived>
+i0e(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_i0e_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1e(\a x) applied to the
+ * given array.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i1e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::i1e()
+ */
+template <typename Derived>
+inline const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_i1e_op<typename Derived::Scalar>, const Derived>
+i1e(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_i1e_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
} // end namespace Eigen
#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
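
A minimal usage sketch for the new array API (not part of the patch; assumes the unsupported headers are on the include path). Per the notes above, igamma_der_a and gamma_sample_der_alpha require C++11 mode; i0e and i1e do not.

#include <iostream>
#include <Eigen/Core>
#include <unsupported/Eigen/SpecialFunctions>

int main() {
  Eigen::ArrayXd a(3), x(3);
  a << 0.5, 1.0, 2.0;   // shape (concentration) parameters
  x << 0.25, 1.0, 4.0;  // evaluation points / samples

  // d/da igamma(a, x), coefficient-wise.
  Eigen::ArrayXd da = Eigen::igamma_der_a(a, x);
  // Reparameterization derivative of a Gamma(a, 1) sample.
  Eigen::ArrayXd ds = Eigen::gamma_sample_der_alpha(a, x);
  // Exponentially scaled modified Bessel functions of order zero and one.
  Eigen::ArrayXd b0 = Eigen::i0e(x);
  Eigen::ArrayXd b1 = Eigen::i1e(x);

  std::cout << da.transpose() << "\n" << ds.transpose() << "\n"
            << b0.transpose() << "\n" << b1.transpose() << std::endl;
}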
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
index d8f2363be..3a63dcdd6 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -41,6 +41,60 @@ struct functor_traits<scalar_igamma_op<Scalar> > {
};
};
+/** \internal
+ * \brief Template functor to compute the derivative of the incomplete gamma
+ * function igamma_der_a(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igamma_der_a
+ */
+template <typename Scalar>
+struct scalar_igamma_der_a_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_der_a_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
+ using numext::igamma_der_a;
+ return igamma_der_a(a, x);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+ return internal::pigamma_der_a(a, x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_igamma_der_a_op<Scalar> > {
+ enum {
+ // 2x the cost of igamma
+ Cost = 40 * NumTraits<Scalar>::MulCost + 20 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasIGammaDerA
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the derivative of the sample
+ * of a Gamma(alpha, 1) random variable with respect to the parameter alpha
+ * gamma_sample_der_alpha(alpha, sample)
+ *
+ * \sa class CwiseBinaryOp, Cwise::gamma_sample_der_alpha
+ */
+template <typename Scalar>
+struct scalar_gamma_sample_der_alpha_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_gamma_sample_der_alpha_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& alpha, const Scalar& sample) const {
+ using numext::gamma_sample_der_alpha;
+ return gamma_sample_der_alpha(alpha, sample);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& alpha, const Packet& sample) const {
+ return internal::pgamma_sample_der_alpha(alpha, sample);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_gamma_sample_der_alpha_op<Scalar> > {
+ enum {
+ // 2x the cost of igamma, minus the lgamma cost (the lgamma cancels out)
+ Cost = 30 * NumTraits<Scalar>::MulCost + 15 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasGammaSampleDerAlpha
+ };
+};
/** \internal
* \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
@@ -229,6 +283,60 @@ struct functor_traits<scalar_erfc_op<Scalar> >
};
};
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of order zero
+ * \sa class CwiseUnaryOp, Cwise::i0e()
+ */
+template <typename Scalar>
+struct scalar_i0e_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_i0e_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x) const {
+ using numext::i0e;
+ return i0e(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+ return internal::pi0e(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_i0e_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=20 is computed.
+ // The cost is N multiplications and 2N additions.
+ Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasI0e
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of order one
+ * \sa class CwiseUnaryOp, Cwise::i1e()
+ */
+template <typename Scalar>
+struct scalar_i1e_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_i1e_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x) const {
+ using numext::i1e;
+ return i1e(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+ return internal::pi1e(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_i1e_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=20 is computed.
+ // The cost is N multiplications and 2N additions.
+ Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasI1e
+ };
+};
+
} // end namespace internal
} // end namespace Eigen
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
index 553bcda6a..fbdfd299e 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -33,12 +33,28 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::h
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma_der_a(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half gamma_sample_der_alpha(const Eigen::half& alpha, const Eigen::half& sample) {
+ return Eigen::half(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half i0e(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half i1e(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::i1e(static_cast<float>(x)));
+}
#endif
} // end namespace numext
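
The new Eigen::half overloads compute in float and cast back, so they can be exercised directly from host code; a quick sketch (assuming the surrounding preprocessor guard is satisfied on the host):

#include <cstdio>
#include <unsupported/Eigen/SpecialFunctions>

int main() {
  Eigen::half x(0.5f);
  Eigen::half y = Eigen::numext::i0e(x);  // evaluates i0e(float(x)), casts back
  Eigen::half d = Eigen::numext::igamma_der_a(Eigen::half(2.0f), x);
  std::printf("i0e: %f  igamma_der_a: %f\n",
              static_cast<float>(y), static_cast<float>(d));
}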
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 5d1b8fcc2..dbcc9d8ac 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -95,6 +95,75 @@ struct polevl<Scalar, 0> {
}
};
+/* chbevl (modified for Eigen)
+ *
+ * Evaluate Chebyshev series
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N], chbevl();
+ *
+ * y = chbevl( x, coef, N );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates the series
+ *
+ * N-1
+ * - '
+ * y = > coef[i] T (x/2)
+ * - i
+ * i=0
+ *
+ * of Chebyshev polynomials Ti at argument x/2.
+ *
+ * Coefficients are stored in reverse order, i.e. the zero
+ * order term is last in the array. Note N is the number of
+ * coefficients, not the order.
+ *
+ * If coefficients are for the interval a to b, x must
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
+ * entering the routine. This maps x from (a, b) to (-1, 1),
+ * over which the Chebyshev polynomials are defined.
+ *
+ * If the coefficients are for the inverted interval, in
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
+ * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
+ * this becomes x -> 4a/x - 1.
+ *
+ *
+ *
+ * SPEED:
+ *
+ * Taking advantage of the recurrence properties of the
+ * Chebyshev polynomials, the routine requires one more
+ * addition per loop than evaluating a nested polynomial of
+ * the same degree.
+ *
+ */
+template <typename Scalar, int N>
+struct chebevl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar x, const Scalar coef[]) {
+ Scalar b0 = coef[0];
+ Scalar b1 = 0;
+ Scalar b2;
+
+ for (int i = 1; i < N; i++) {
+ b2 = b1;
+ b1 = b0;
+ b0 = x * b1 - b2 + coef[i];
+ }
+
+ return Scalar(0.5) * (b0 - b2);
+ }
+};
+
} // end namespace cephes
/****************************************************************************
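
A standalone sketch of the same Clenshaw-style recurrence (not part of the patch), checking one easy case: with reverse-order coefficients {T_2: 0, T_1: 1, T_0: 0} the series evaluates to T_1(x/2) = x/2. The primed sum in the description means the zero-order coefficient enters with weight 1/2, which the final 0.5 * (b0 - b2) accounts for.

#include <cassert>
#include <cmath>
#include <cstdio>

// Same recurrence as cephes::chebevl above, as a free function.
template <int N>
double clenshaw_cheb(double x, const double (&coef)[N]) {
  double b0 = coef[0], b1 = 0.0, b2 = 0.0;
  for (int i = 1; i < N; ++i) {
    b2 = b1;
    b1 = b0;
    b0 = x * b1 - b2 + coef[i];  // Chebyshev recurrence folded into the sum
  }
  return 0.5 * (b0 - b2);
}

int main() {
  const double coef[3] = {0.0, 1.0, 0.0};  // reverse order: T_2, T_1, T_0
  double x = 0.8;
  assert(std::abs(clenshaw_cheb(x, coef) - x / 2) < 1e-15);
  std::printf("T_1(x/2) = %f\n", clenshaw_cheb(x, coef));
}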
@@ -121,7 +190,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgammaf_r(x, &dummy);
#else
@@ -134,7 +203,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgamma_r(x, &dummy);
#else
@@ -452,6 +521,199 @@ struct cephes_helper<double> {
}
};
+enum IgammaComputationMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE };
+
+template <typename Scalar, IgammaComputationMode mode>
+EIGEN_DEVICE_FUNC
+int igamma_num_iterations() {
+ /* Returns the maximum number of internal iterations for igamma computation.
+ */
+ if (mode == VALUE) {
+ return 2000;
+ }
+
+ if (internal::is_same<Scalar, float>::value) {
+ return 200;
+ } else if (internal::is_same<Scalar, double>::value) {
+ return 500;
+ } else {
+ return 2000;
+ }
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igammac_cf_impl {
+ /* Computes igamc(a, x) or its derivative (depending on the mode)
+ * using the continued fraction expansion of the complementary
+ * incomplete Gamma function.
+ *
+ * Preconditions:
+ * a > 0
+ * x >= 1
+ * x >= a
+ */
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar two = 2;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar big = cephes_helper<Scalar>::big();
+ const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+ if ((numext::isinf)(x)) {
+ return zero;
+ }
+
+ // continued fraction
+ Scalar y = one - a;
+ Scalar z = x + y + one;
+ Scalar c = zero;
+ Scalar pkm2 = one;
+ Scalar qkm2 = x;
+ Scalar pkm1 = x + one;
+ Scalar qkm1 = z * x;
+ Scalar ans = pkm1 / qkm1;
+
+ Scalar dpkm2_da = zero;
+ Scalar dqkm2_da = zero;
+ Scalar dpkm1_da = zero;
+ Scalar dqkm1_da = -x;
+ Scalar dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1;
+
+ for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+ c += one;
+ y += one;
+ z += two;
+
+ Scalar yc = y * c;
+ Scalar pk = pkm1 * z - pkm2 * yc;
+ Scalar qk = qkm1 * z - qkm2 * yc;
+
+ Scalar dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c;
+ Scalar dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c;
+
+ if (qk != zero) {
+ Scalar ans_prev = ans;
+ ans = pk / qk;
+
+ Scalar dans_da_prev = dans_da;
+ dans_da = (dpk_da - ans * dqk_da) / qk;
+
+ if (mode == VALUE) {
+ if (numext::abs(ans_prev - ans) <= machep * numext::abs(ans)) {
+ break;
+ }
+ } else {
+ if (numext::abs(dans_da - dans_da_prev) <= machep) {
+ break;
+ }
+ }
+ }
+
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ dpkm2_da = dpkm1_da;
+ dpkm1_da = dpk_da;
+ dqkm2_da = dqkm1_da;
+ dqkm1_da = dqk_da;
+
+ if (numext::abs(pk) > big) {
+ pkm2 *= biginv;
+ pkm1 *= biginv;
+ qkm2 *= biginv;
+ qkm1 *= biginv;
+
+ dpkm2_da *= biginv;
+ dpkm1_da *= biginv;
+ dqkm2_da *= biginv;
+ dqkm1_da *= biginv;
+ }
+ }
+
+ /* Compute x**a * exp(-x) / gamma(a) */
+ Scalar logax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+ Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a);
+ Scalar ax = numext::exp(logax);
+ Scalar dax_da = ax * dlogax_da;
+
+ switch (mode) {
+ case VALUE:
+ return ans * ax;
+ case DERIVATIVE:
+ return ans * dax_da + dans_da * ax;
+ case SAMPLE_DERIVATIVE:
+ default: // this is needed to suppress clang warning
+ return -(dans_da + ans * dlogax_da) * x;
+ }
+ }
+};
+
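
For reference, the quantity evaluated here corresponds to Legendre's continued fraction for the upper incomplete gamma function (stated from standard references, not from the patch; the recurrence above builds equivalent convergents and additionally propagates d/da through them):

\Gamma(a,x) = e^{-x}\, x^{a}\,
  \cfrac{1}{x+1-a-\cfrac{1\cdot(1-a)}{x+3-a-\cfrac{2\cdot(2-a)}{x+5-a-\cdots}}},
\qquad Q(a,x) = \frac{\Gamma(a,x)}{\Gamma(a)},

where ax in the code is x^a e^{-x} / Gamma(a) and ans is the value of the fraction.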
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_series_impl {
+ /* Computes igam(a, x) or its derivative (depending on the mode)
+ * using the series expansion of the incomplete Gamma function.
+ *
+ * Preconditions:
+ * x > 0
+ * a > 0
+ * !(x > 1 && x > a)
+ */
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+
+ /* power series */
+ Scalar r = a;
+ Scalar c = one;
+ Scalar ans = one;
+
+ Scalar dc_da = zero;
+ Scalar dans_da = zero;
+
+ for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+ r += one;
+ Scalar term = x / r;
+ Scalar dterm_da = -x / (r * r);
+ dc_da = term * dc_da + dterm_da * c;
+ dans_da += dc_da;
+ c *= term;
+ ans += c;
+
+ if (mode == VALUE) {
+ if (c <= machep * ans) {
+ break;
+ }
+ } else {
+ if (numext::abs(dc_da) <= machep * numext::abs(dans_da)) {
+ break;
+ }
+ }
+ }
+
+ /* Compute x**a * exp(-x) / gamma(a + 1) */
+ Scalar logax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a + one);
+ Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a + one);
+ Scalar ax = numext::exp(logax);
+ Scalar dax_da = ax * dlogax_da;
+
+ switch (mode) {
+ case VALUE:
+ return ans * ax;
+ case DERIVATIVE:
+ return ans * dax_da + dans_da * ax;
+ case SAMPLE_DERIVATIVE:
+ default: // this is needed to suppress clang warning
+ return -(dans_da + ans * dlogax_da) * x / a;
+ }
+ }
+};
+
#if !EIGEN_HAS_C99_MATH
template <typename Scalar>
@@ -466,8 +728,6 @@ struct igammac_impl {
#else
-template <typename Scalar> struct igamma_impl; // predeclare igamma_impl
-
template <typename Scalar>
struct igammac_impl {
EIGEN_DEVICE_FUNC
@@ -535,93 +795,15 @@ struct igammac_impl {
return nan;
}
- if ((x < one) || (x < a)) {
- /* The checks above ensure that we meet the preconditions for
- * igamma_impl::Impl(), so call it, rather than igamma_impl::Run().
- * Calling Run() would also work, but in that case the compiler may not be
- * able to prove that igammac_impl::Run and igamma_impl::Run are not
- * mutually recursive. This leads to worse code, particularly on
- * platforms like nvptx, where recursion is allowed only begrudgingly.
- */
- return (one - igamma_impl<Scalar>::Impl(a, x));
- }
-
- return Impl(a, x);
- }
-
- private:
- /* igamma_impl calls igammac_impl::Impl. */
- friend struct igamma_impl<Scalar>;
-
- /* Actually computes igamc(a, x).
- *
- * Preconditions:
- * a > 0
- * x >= 1
- * x >= a
- */
- EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
- const Scalar zero = 0;
- const Scalar one = 1;
- const Scalar two = 2;
- const Scalar machep = cephes_helper<Scalar>::machep();
- const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
- const Scalar big = cephes_helper<Scalar>::big();
- const Scalar biginv = cephes_helper<Scalar>::biginv();
- const Scalar inf = NumTraits<Scalar>::infinity();
-
- Scalar ans, ax, c, yc, r, t, y, z;
- Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
-
- if (x == inf) return zero; // std::isinf crashes on CUDA
-
- /* Compute x**a * exp(-x) / gamma(a) */
- ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
- if (ax < -maxlog) { // underflow
- return zero;
+ if ((numext::isnan)(a) || (numext::isnan)(x)) { // propagate nans
+ return nan;
}
- ax = numext::exp(ax);
- // continued fraction
- y = one - a;
- z = x + y + one;
- c = zero;
- pkm2 = one;
- qkm2 = x;
- pkm1 = x + one;
- qkm1 = z * x;
- ans = pkm1 / qkm1;
-
- while (true) {
- c += one;
- y += one;
- z += two;
- yc = y * c;
- pk = pkm1 * z - pkm2 * yc;
- qk = qkm1 * z - qkm2 * yc;
- if (qk != zero) {
- r = pk / qk;
- t = numext::abs((ans - r) / r);
- ans = r;
- } else {
- t = one;
- }
- pkm2 = pkm1;
- pkm1 = pk;
- qkm2 = qkm1;
- qkm1 = qk;
- if (numext::abs(pk) > big) {
- pkm2 *= biginv;
- pkm1 *= biginv;
- qkm2 *= biginv;
- qkm1 *= biginv;
- }
- if (t <= machep) {
- break;
- }
+ if ((x < one) || (x < a)) {
+ return (one - igamma_series_impl<Scalar, VALUE>::run(a, x));
}
- return (ans * ax);
+ return igammac_cf_impl<Scalar, VALUE>::run(a, x);
}
};
@@ -631,15 +813,10 @@ struct igammac_impl {
* Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
************************************************************************************************/
-template <typename Scalar>
-struct igamma_retval {
- typedef Scalar type;
-};
-
#if !EIGEN_HAS_C99_MATH
-template <typename Scalar>
-struct igamma_impl {
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
@@ -650,69 +827,17 @@ struct igamma_impl {
#else
-template <typename Scalar>
-struct igamma_impl {
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
EIGEN_DEVICE_FUNC
static Scalar run(Scalar a, Scalar x) {
- /* igam()
- * Incomplete gamma integral
- *
- *
- *
- * SYNOPSIS:
- *
- * double a, x, y, igam();
- *
- * y = igam( a, x );
- *
- * DESCRIPTION:
- *
- * The function is defined by
- *
- * x
- * -
- * 1 | | -t a-1
- * igam(a,x) = ----- | e t dt.
- * - | |
- * | (a) -
- * 0
- *
- *
- * In this implementation both arguments must be positive.
- * The integral is evaluated by either a power series or
- * continued fraction expansion, depending on the relative
- * values of a and x.
- *
- * ACCURACY (double):
- *
- * Relative error:
- * arithmetic domain # trials peak rms
- * IEEE 0,30 200000 3.6e-14 2.9e-15
- * IEEE 0,100 300000 9.9e-14 1.5e-14
- *
- *
- * ACCURACY (float):
- *
- * Relative error:
- * arithmetic domain # trials peak rms
- * IEEE 0,30 20000 7.8e-6 5.9e-7
- *
- */
- /*
- Cephes Math Library Release 2.2: June, 1992
- Copyright 1985, 1987, 1992 by Stephen L. Moshier
- Direct inquiries to 30 Frost Street, Cambridge, MA 02140
- */
-
-
- /* left tail of incomplete gamma function:
- *
- * inf. k
- * a -x - x
- * x e > ----------
- * - -
- * k=0 | (a+k+1)
+ /* Depending on the mode, returns
+ * - VALUE: incomplete Gamma function igamma(a, x)
+ * - DERIVATIVE: derivative of incomplete Gamma function d/da igamma(a, x)
+ * - SAMPLE_DERIVATIVE: implicit derivative of a Gamma random variable
+ * x ~ Gamma(x | a, 1), dx/da = -1 / Gamma(x | a, 1) * d igamma(a, x) / dx
*
+ * Derivatives are implemented by forward-mode differentiation.
*/
const Scalar zero = 0;
const Scalar one = 1;
@@ -724,67 +849,167 @@ struct igamma_impl {
return nan;
}
+ if ((numext::isnan)(a) || (numext::isnan)(x)) { // propagate nans
+ return nan;
+ }
+
if ((x > one) && (x > a)) {
- /* The checks above ensure that we meet the preconditions for
- * igammac_impl::Impl(), so call it, rather than igammac_impl::Run().
- * Calling Run() would also work, but in that case the compiler may not be
- * able to prove that igammac_impl::Run and igamma_impl::Run are not
- * mutually recursive. This leads to worse code, particularly on
- * platforms like nvptx, where recursion is allowed only begrudgingly.
- */
- return (one - igammac_impl<Scalar>::Impl(a, x));
+ Scalar ret = igammac_cf_impl<Scalar, mode>::run(a, x);
+ if (mode == VALUE) {
+ return one - ret;
+ } else {
+ return -ret;
+ }
}
- return Impl(a, x);
+ return igamma_series_impl<Scalar, mode>::run(a, x);
}
+};
- private:
- /* igammac_impl calls igamma_impl::Impl. */
- friend struct igammac_impl<Scalar>;
+#endif // EIGEN_HAS_C99_MATH
- /* Actually computes igam(a, x).
+template <typename Scalar>
+struct igamma_retval {
+ typedef Scalar type;
+};
+
+template <typename Scalar>
+struct igamma_impl : igamma_generic_impl<Scalar, VALUE> {
+ /* igam()
+ * Incomplete gamma integral.
+ *
+ * The CDF of Gamma(a, 1) random variable at the point x.
+ *
+ * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+ * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+ * The ground truth is computed by mpmath. Mean absolute error:
+ * float: 1.26713e-05
+ * double: 2.33606e-12
+ *
+ * Cephes documentation below.
+ *
+ * SYNOPSIS:
+ *
+ * double a, x, y, igam();
+ *
+ * y = igam( a, x );
+ *
+ * DESCRIPTION:
+ *
+ * The function is defined by
+ *
+ * x
+ * -
+ * 1 | | -t a-1
+ * igam(a,x) = ----- | e t dt.
+ * - | |
+ * | (a) -
+ * 0
+ *
+ *
+ * In this implementation both arguments must be positive.
+ * The integral is evaluated by either a power series or
+ * continued fraction expansion, depending on the relative
+ * values of a and x.
+ *
+ * ACCURACY (double):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 200000 3.6e-14 2.9e-15
+ * IEEE 0,100 300000 9.9e-14 1.5e-14
+ *
+ *
+ * ACCURACY (float):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 20000 7.8e-6 5.9e-7
*
- * Preconditions:
- * x > 0
- * a > 0
- * !(x > 1 && x > a)
*/
- EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
- const Scalar zero = 0;
- const Scalar one = 1;
- const Scalar machep = cephes_helper<Scalar>::machep();
- const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+ /*
+ Cephes Math Library Release 2.2: June, 1992
+ Copyright 1985, 1987, 1992 by Stephen L. Moshier
+ Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
- Scalar ans, ax, c, r;
+ /* left tail of incomplete gamma function:
+ *
+ * inf. k
+ * a -x - x
+ * x e > ----------
+ * - -
+ * k=0 | (a+k+1)
+ *
+ */
+};
- /* Compute x**a * exp(-x) / gamma(a) */
- ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
- if (ax < -maxlog) {
- // underflow
- return zero;
- }
- ax = numext::exp(ax);
+template <typename Scalar>
+struct igamma_der_a_retval : igamma_retval<Scalar> {};
- /* power series */
- r = a;
- c = one;
- ans = one;
+template <typename Scalar>
+struct igamma_der_a_impl : igamma_generic_impl<Scalar, DERIVATIVE> {
+ /* Derivative of the incomplete Gamma function with respect to a.
+ *
+ * Computes d/da igamma(a, x) by forward differentiation of the igamma code.
+ *
+ * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+ * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+ * The ground truth is computed by mpmath. Mean absolute error:
+ * float: 6.17992e-07
+ * double: 4.60453e-12
+ *
+ * Reference:
+ * R. Moore. "Algorithm AS 187: Derivatives of the incomplete gamma
+ * integral". Journal of the Royal Statistical Society. 1982
+ */
+};
- while (true) {
- r += one;
- c *= x/r;
- ans += c;
- if (c/ans <= machep) {
- break;
- }
- }
+template <typename Scalar>
+struct gamma_sample_der_alpha_retval : igamma_retval<Scalar> {};
- return (ans * ax / a);
- }
+template <typename Scalar>
+struct gamma_sample_der_alpha_impl
+ : igamma_generic_impl<Scalar, SAMPLE_DERIVATIVE> {
+ /* Derivative of a Gamma random variable sample with respect to alpha.
+ *
+ * Consider a sample of a Gamma random variable with the concentration
+ * parameter alpha: sample ~ Gamma(alpha, 1). The reparameterization
+ * derivative that we want to compute is dsample / dalpha =
+ * d igammainv(alpha, u) / dalpha, where u = igamma(alpha, sample).
+ * However, this formula is numerically unstable and expensive, so instead
+ * we use implicit differentiation:
+ *
+ * igamma(alpha, sample) = u, where u ~ Uniform(0, 1).
+ * Apply d / dalpha to both sides:
+ * d igamma(alpha, sample) / dalpha
+ * + d igamma(alpha, sample) / dsample * dsample/dalpha = 0
+ * d igamma(alpha, sample) / dalpha
+ * + Gamma(sample | alpha, 1) dsample / dalpha = 0
+ * dsample/dalpha = - (d igamma(alpha, sample) / dalpha)
+ * / Gamma(sample | alpha, 1)
+ *
+ * Here Gamma(sample | alpha, 1) is the PDF of the Gamma distribution
+ * (note that the derivative of the CDF w.r.t. sample is the PDF).
+ * See the reference below for more details.
+ *
+ * The derivative of igamma(alpha, sample) is computed by forward
+ * differentiation of the igamma code. Division by the Gamma PDF is performed
+ * in the same code, increasing the accuracy and speed due to cancellation
+ * of some terms.
+ *
+ * Accuracy estimation. For each alpha in [10^-2, 10^-1...10^3] we sample
+ * 50 Gamma random variables sample ~ Gamma(sample | alpha, 1), a total of 300
+ * points. The ground truth is computed by mpmath. Mean absolute error:
+ * float: 2.1686e-06
+ * double: 1.4774e-12
+ *
+ * Reference:
+ * M. Figurnov, S. Mohamed, A. Mnih "Implicit Reparameterization Gradients".
+ * 2018
+ */
};
-#endif // EIGEN_HAS_C99_MATH
-
/*****************************************************************************
* Implementation of Riemann zeta function of two arguments, based on Cephes *
*****************************************************************************/
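
A numerical sanity check (a sketch, not part of the patch) of the two derivative modes: igamma_der_a against a central finite difference, and gamma_sample_der_alpha against the implicit-differentiation identity documented in gamma_sample_der_alpha_impl above.

#include <cmath>
#include <cstdio>
#include <unsupported/Eigen/SpecialFunctions>

int main() {
  double a = 2.5, x = 1.7, eps = 1e-6;

  // d/da igamma(a, x) vs. a central finite difference.
  double fd = (Eigen::numext::igamma(a + eps, x) -
               Eigen::numext::igamma(a - eps, x)) / (2 * eps);
  double ad = Eigen::numext::igamma_der_a(a, x);

  // dsample/dalpha = -igamma_der_a(alpha, x) / GammaPDF(x | alpha, 1),
  // with GammaPDF(x | a, 1) = x^(a-1) e^(-x) / Gamma(a).
  double pdf = std::exp((a - 1) * std::log(x) - x - std::lgamma(a));
  double implicit_der = -ad / pdf;
  double direct_der = Eigen::numext::gamma_sample_der_alpha(a, x);

  std::printf("igamma_der_a: fd=%g forward-mode=%g\n", fd, ad);
  std::printf("sample der:   implicit=%g direct=%g\n", implicit_der, direct_der);
}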
@@ -1499,6 +1724,334 @@ struct betainc_impl<double> {
#endif // EIGEN_HAS_C99_MATH
+/****************************************************************************
+ * Implementation of Bessel function, based on Cephes *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct i0e_retval {
+ typedef Scalar type;
+};
+
+template <typename Scalar>
+struct i0e_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <>
+struct i0e_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(float x) {
+ /* i0ef.c
+ *
+ * Modified Bessel function of order zero,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, i0ef();
+ *
+ * y = i0ef( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of order zero of the argument.
+ *
+ * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 100000 3.7e-7 7.0e-8
+ * See i0f().
+ *
+ */
+ const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,
+ -2.67079385394061173391E-7f, 1.11738753912010371815E-6f,
+ -4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
+ -5.75419501008210370398E-5f, 1.88502885095841655729E-4f,
+ -5.76375574538582365885E-4f, 1.63947561694133579842E-3f,
+ -4.32430999505057594430E-3f, 1.05464603945949983183E-2f,
+ -2.37374148058994688156E-2f, 4.93052842396707084878E-2f,
+ -9.49010970480476444210E-2f, 1.71620901522208775349E-1f,
+ -3.04682672343198398683E-1f, 6.76795274409476084995E-1f};
+
+ const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f,
+ 2.04891858946906374183E-7f, 2.89137052083475648297E-6f,
+ 6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
+ 8.04490411014108831608E-1f};
+ if (x < 0.0f) {
+ x = -x;
+ }
+
+ if (x <= 8.0f) {
+ float y = 0.5f * x - 2.0f;
+ return cephes::chebevl<float, 18>::run(y, A);
+ }
+
+ return cephes::chebevl<float, 7>::run(32.0f / x - 2.0f, B) / numext::sqrt(x);
+ }
+};
+
+template <>
+struct i0e_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(double x) {
+ /* i0e.c
+ *
+ * Modified Bessel function of order zero,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, i0e();
+ *
+ * y = i0e( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of order zero of the argument.
+ *
+ * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 30000 5.4e-16 1.2e-16
+ * See i0().
+ *
+ */
+ const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
+ -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+ -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+ -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+ -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+ -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+ -1.30002500998624804212E-8, 6.04699502254191894932E-8,
+ -2.67079385394061173391E-7, 1.11738753912010371815E-6,
+ -4.41673835845875056359E-6, 1.64484480707288970893E-5,
+ -5.75419501008210370398E-5, 1.88502885095841655729E-4,
+ -5.76375574538582365885E-4, 1.63947561694133579842E-3,
+ -4.32430999505057594430E-3, 1.05464603945949983183E-2,
+ -2.37374148058994688156E-2, 4.93052842396707084878E-2,
+ -9.49010970480476444210E-2, 1.71620901522208775349E-1,
+ -3.04682672343198398683E-1, 6.76795274409476084995E-1};
+ const double B[] = {
+ -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+ 4.46562142029675999901E-17, 3.46122286769746109310E-17,
+ -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+ 1.77256013305652638360E-15, 3.81168066935262242075E-15,
+ -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+ 1.54008621752140982691E-14, 3.85277838274214270114E-13,
+ 7.18012445138366623367E-13, -1.79417853150680611778E-12,
+ -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+ 1.18891471078464383424E-11, 4.94060238822496958910E-10,
+ 3.39623202570838634515E-9, 2.26666899049817806459E-8,
+ 2.04891858946906374183E-7, 2.89137052083475648297E-6,
+ 6.88975834691682398426E-5, 3.36911647825569408990E-3,
+ 8.04490411014108831608E-1};
+
+ if (x < 0.0) {
+ x = -x;
+ }
+
+ if (x <= 8.0) {
+ double y = (x / 2.0) - 2.0;
+ return cephes::chebevl<double, 30>::run(y, A);
+ }
+
+ return cephes::chebevl<double, 25>::run(32.0 / x - 2.0, B) /
+ numext::sqrt(x);
+ }
+};
+
+template <typename Scalar>
+struct i1e_retval {
+ typedef Scalar type;
+};
+
+template <typename Scalar>
+struct i1e_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <>
+struct i1e_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(float x) {
+ /* i1ef.c
+ *
+ * Modified Bessel function of order one,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, i1ef();
+ *
+ * y = i1ef( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of order one of the argument.
+ *
+ * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 1.5e-6 1.5e-7
+ * See i1().
+ *
+ */
+ const float A[] = {9.38153738649577178388E-9f, -4.44505912879632808065E-8f,
+ 2.00329475355213526229E-7f, -8.56872026469545474066E-7f,
+ 3.47025130813767847674E-6f, -1.32731636560394358279E-5f,
+ 4.78156510755005422638E-5f, -1.61760815825896745588E-4f,
+ 5.12285956168575772895E-4f, -1.51357245063125314899E-3f,
+ 4.15642294431288815669E-3f, -1.05640848946261981558E-2f,
+ 2.47264490306265168283E-2f, -5.29459812080949914269E-2f,
+ 1.02643658689847095384E-1f, -1.76416518357834055153E-1f,
+ 2.52587186443633654823E-1f};
+
+ const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f,
+ -2.51223623787020892529E-7f, -3.88256480887769039346E-6f,
+ -1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
+ 7.78576235018280120474E-1f};
+
+ float z = numext::abs(x);
+
+ if (z <= 8.0f) {
+ float y = 0.5f * z - 2.0f;
+ z = cephes::chebevl<float, 17>::run(y, A) * z;
+ } else {
+ z = cephes::chebevl<float, 7>::run(32.0f / z - 2.0f, B) / numext::sqrt(z);
+ }
+
+ if (x < 0.0f) {
+ z = -z;
+ }
+
+ return z;
+ }
+};
+
+template <>
+struct i1e_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(double x) {
+ /* i1e.c
+ *
+ * Modified Bessel function of order one,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, i1e();
+ *
+ * y = i1e( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of order one of the argument.
+ *
+ * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 2.0e-15 2.0e-16
+ * See i1().
+ *
+ */
+ const double A[] = {2.77791411276104639959E-18, -2.11142121435816608115E-17,
+ 1.55363195773620046921E-16, -1.10559694773538630805E-15,
+ 7.60068429473540693410E-15, -5.04218550472791168711E-14,
+ 3.22379336594557470981E-13, -1.98397439776494371520E-12,
+ 1.17361862988909016308E-11, -6.66348972350202774223E-11,
+ 3.62559028155211703701E-10, -1.88724975172282928790E-9,
+ 9.38153738649577178388E-9, -4.44505912879632808065E-8,
+ 2.00329475355213526229E-7, -8.56872026469545474066E-7,
+ 3.47025130813767847674E-6, -1.32731636560394358279E-5,
+ 4.78156510755005422638E-5, -1.61760815825896745588E-4,
+ 5.12285956168575772895E-4, -1.51357245063125314899E-3,
+ 4.15642294431288815669E-3, -1.05640848946261981558E-2,
+ 2.47264490306265168283E-2, -5.29459812080949914269E-2,
+ 1.02643658689847095384E-1, -1.76416518357834055153E-1,
+ 2.52587186443633654823E-1};
+ const double B[] = {
+ 7.51729631084210481353E-18, 4.41434832307170791151E-18,
+ -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+ 2.96262899764595013876E-16, 3.30820231092092828324E-16,
+ -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+ 1.04202769841288027642E-14, 4.27244001671195135429E-14,
+ -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+ -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+ 1.41258074366137813316E-11, 3.25260358301548823856E-11,
+ -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+ -3.83538038596423702205E-9, -2.63146884688951950684E-8,
+ -2.51223623787020892529E-7, -3.88256480887769039346E-6,
+ -1.10588938762623716291E-4, -9.76109749136146840777E-3,
+ 7.78576235018280120474E-1};
+
+ double z = numext::abs(x);
+
+ if (z <= 8.0) {
+ double y = (z / 2.0) - 2.0;
+ z = cephes::chebevl<double, 29>::run(y, A) * z;
+ } else {
+ z = cephes::chebevl<double, 25>::run(32.0 / z - 2.0, B) / numext::sqrt(z);
+ }
+
+ if (x < 0.0) {
+ z = -z;
+ }
+
+ return z;
+ }
+};
+
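
A cross-check sketch against the C++17 standard special math functions (not part of the patch; requires a standard library that ships std::cyl_bessel_i): i0e(x) = exp(-|x|) I0(x) and i1e(x) = exp(-|x|) I1(x).

#include <cmath>
#include <cstdio>
#include <unsupported/Eigen/SpecialFunctions>

int main() {
  for (double x : {0.5, 4.0, 12.0}) {  // covers both branches (x <= 8, x > 8)
    double s = std::exp(-std::abs(x));
    std::printf("x=%-4g i0e=%.12g ref=%.12g  i1e=%.12g ref=%.12g\n", x,
                Eigen::numext::i0e(x), s * std::cyl_bessel_i(0.0, x),
                Eigen::numext::i1e(x), s * std::cyl_bessel_i(1.0, x));
  }
}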
} // end namespace internal
namespace numext {
@@ -1546,6 +2099,18 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
}
template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma_der_a, Scalar)
+ igamma_der_a(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(igamma_der_a, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(gamma_sample_der_alpha, Scalar)
+ gamma_sample_der_alpha(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(gamma_sample_der_alpha, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
igammac(const Scalar& a, const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
@@ -1557,6 +2122,18 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x);
}
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(i0e, Scalar)
+ i0e(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(i0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(i1e, Scalar)
+ i1e(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(i1e, Scalar)::run(x);
+}
+
} // end namespace numext
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
index 46d60d323..465f41d54 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -42,6 +42,21 @@ Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+/** \internal \returns the derivative of the incomplete gamma function
+ * igamma_der_a(\a a, \a x) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma_der_a(const Packet& a, const Packet& x) {
+ using numext::igamma_der_a; return igamma_der_a(a, x);
+}
+
+/** \internal \returns the derivative of a sample
+ * of a Gamma(alpha, 1) random variable with respect to the parameter alpha,
+ * gamma_sample_der_alpha(\a alpha, \a sample) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pgamma_sample_der_alpha(const Packet& alpha, const Packet& sample) {
+ using numext::gamma_sample_der_alpha; return gamma_sample_der_alpha(alpha, sample);
+}
+
/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
@@ -50,6 +65,18 @@ Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; retur
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); }
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0e(\a x) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pi0e(const Packet& x) { using numext::i0e; return i0e(x); }
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1e(\a x) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pi1e(const Packet& x) { using numext::i1e; return i1e(x); }
+
} // end namespace internal
} // end namespace Eigen
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
index e0e3a8be6..40abcee3a 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
@@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
-#define EIGEN_CUDA_SPECIALFUNCTIONS_H
+#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
+#define EIGEN_GPU_SPECIALFUNCTIONS_H
namespace Eigen {
@@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plgamma<float4>(const float4& a)
@@ -120,6 +120,41 @@ double2 pigamma<double2>(const double2& a, const double2& x)
return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma_der_a<float4>(
+ const float4& a, const float4& x) {
+ using numext::igamma_der_a;
+ return make_float4(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y),
+ igamma_der_a(a.z, x.z), igamma_der_a(a.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pigamma_der_a<double2>(const double2& a, const double2& x) {
+ using numext::igamma_der_a;
+ return make_double2(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pgamma_sample_der_alpha<float4>(
+ const float4& alpha, const float4& sample) {
+ using numext::gamma_sample_der_alpha;
+ return make_float4(
+ gamma_sample_der_alpha(alpha.x, sample.x),
+ gamma_sample_der_alpha(alpha.y, sample.y),
+ gamma_sample_der_alpha(alpha.z, sample.z),
+ gamma_sample_der_alpha(alpha.w, sample.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pgamma_sample_der_alpha<double2>(const double2& alpha, const double2& sample) {
+ using numext::gamma_sample_der_alpha;
+ return make_double2(
+ gamma_sample_der_alpha(alpha.x, sample.x),
+ gamma_sample_der_alpha(alpha.y, sample.y));
+}
+
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 pigammac<float4>(const float4& a, const float4& x)
{
@@ -156,10 +191,36 @@ double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pi0e<float4>(const float4& x) {
+ using numext::i0e;
+ return make_float4(i0e(x.x), i0e(x.y), i0e(x.z), i0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pi0e<double2>(const double2& x) {
+ using numext::i0e;
+ return make_double2(i0e(x.x), i0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pi1e<float4>(const float4& x) {
+ using numext::i1e;
+ return make_float4(i1e(x.x), i1e(x.y), i1e(x.z), i1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pi1e<double2>(const double2& x) {
+ using numext::i1e;
+ return make_double2(i1e(x.x), i1e(x.y));
+}
+
#endif
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
+#endif // EIGEN_GPU_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h
index c761a9b3d..1a4b80a2f 100644
--- a/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -181,7 +181,7 @@ namespace Eigen
* \ingroup Splines_Module
*
* \param[in] pts The data points to which a spline should be fit.
- * \param[out] chord_lengths The resulting chord lenggth vector.
+ * \param[out] chord_lengths The resulting chord length vector.
*
* \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
**/
diff --git a/unsupported/README.txt b/unsupported/README.txt
index 83479ff0b..70793bf13 100644
--- a/unsupported/README.txt
+++ b/unsupported/README.txt
@@ -20,7 +20,7 @@ However, it:
- must rely on Eigen,
- must be highly related to math,
- should have some general purpose in the sense that it could
- potentially become an offical Eigen module (or be merged into another one).
+ potentially become an official Eigen module (or be merged into another one).
If in doubt, feel free to contact us. For instance, if your addon is too specific
but shows an interesting way of using Eigen, then it could be a nice demo.
diff --git a/unsupported/bench/bench_svd.cpp b/unsupported/bench/bench_svd.cpp
index 01d8231ae..e7028a2b9 100644
--- a/unsupported/bench/bench_svd.cpp
+++ b/unsupported/bench/bench_svd.cpp
@@ -70,7 +70,7 @@ void bench_svd(const MatrixType& a = MatrixType())
std::cout<< std::endl;
timerJacobi.reset();
timerBDC.reset();
- cout << " Computes rotaion matrix" <<endl;
+ cout << " Computes rotation matrix" <<endl;
for (int k=1; k<=NUMBER_SAMPLE; ++k)
{
timerBDC.start();
diff --git a/unsupported/doc/examples/FFT.cpp b/unsupported/doc/examples/FFT.cpp
index fcbf81276..85e8a0241 100644
--- a/unsupported/doc/examples/FFT.cpp
+++ b/unsupported/doc/examples/FFT.cpp
@@ -61,14 +61,14 @@ template <typename T>
void RandomFill(std::vector<T> & vec)
{
for (size_t k=0;k<vec.size();++k)
- vec[k] = T( rand() )/T(RAND_MAX) - .5;
+ vec[k] = T( rand() )/T(RAND_MAX) - T(.5);
}
template <typename T>
void RandomFill(std::vector<std::complex<T> > & vec)
{
for (size_t k=0;k<vec.size();++k)
- vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - .5, T( rand() )/T(RAND_MAX) - .5);
+ vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - T(.5), T( rand() )/T(RAND_MAX) - T(.5));
}
template <typename T_time,typename T_freq>
@@ -85,7 +85,7 @@ void fwd_inv(size_t nfft)
vector<T_time> timebuf2;
fft.inv(timebuf2,freqbuf);
- long double rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
+ T_time rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
cout << "roundtrip rmse: " << rmse << endl;
}
diff --git a/unsupported/test/BVH.cpp b/unsupported/test/BVH.cpp
index ff5b3299d..d8c39d556 100644
--- a/unsupported/test/BVH.cpp
+++ b/unsupported/test/BVH.cpp
@@ -192,7 +192,7 @@ struct TreeTest
};
-void test_BVH()
+EIGEN_DECLARE_TEST(BVH)
{
for(int i = 0; i < g_repeat; i++) {
#ifdef EIGEN_TEST_PART_1
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 22647cadd..239a80926 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -1,16 +1,7 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
- file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
- foreach(i RANGE 1 999)
- file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
- "#ifdef EIGEN_TEST_PART_${i}\n"
- "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
- "#else\n"
- "#define CALL_SUBTEST_${i}(FUNC)\n"
- "#endif\n\n"
- )
- endforeach()
+# The file split_test_helper.h was generated on the first run;
+# it is now included in test/.
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
endif()
set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
@@ -41,11 +32,16 @@ else(GOOGLEHASH_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "GoogleHash, ")
endif(GOOGLEHASH_FOUND)
+
find_package(Adolc)
if(ADOLC_FOUND)
include_directories(${ADOLC_INCLUDES})
ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ")
- ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
+ if(EIGEN_TEST_CXX11)
+ ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
+ else()
+ message(STATUS "Adolc found, but tests require C++11 mode")
+ endif()
else(ADOLC_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ")
endif(ADOLC_FOUND)
@@ -119,6 +115,7 @@ ei_add_test(polynomialsolver)
ei_add_test(polynomialutils)
ei_add_test(splines)
ei_add_test(gmres)
+ei_add_test(dgmres)
ei_add_test(minres)
ei_add_test(levenberg_marquardt)
ei_add_test(kronecker_product)
@@ -133,6 +130,7 @@ if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
ei_add_test(cxx11_tensor_dimension)
ei_add_test(cxx11_tensor_map)
ei_add_test(cxx11_tensor_assign)
+ei_add_test(cxx11_tensor_block_access)
ei_add_test(cxx11_tensor_comparisons)
ei_add_test(cxx11_tensor_forced_eval)
ei_add_test(cxx11_tensor_math)
@@ -144,10 +142,6 @@ ei_add_test(cxx11_tensor_sugar)
ei_add_test(cxx11_tensor_roundings)
ei_add_test(cxx11_tensor_layout_swap)
ei_add_test(cxx11_tensor_io)
-if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
- # This test requires __uint128_t which is only available on 64bit systems
- ei_add_test(cxx11_tensor_uint128)
-endif()
endif()
if(EIGEN_TEST_CXX11)
@@ -219,6 +213,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_striding)
ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+ ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_ref)
ei_add_test(cxx11_tensor_random)
ei_add_test(cxx11_tensor_generator)
@@ -228,6 +223,11 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_ifft)
ei_add_test(cxx11_tensor_scan)
ei_add_test(cxx11_tensor_trace)
+ ei_add_test(cxx11_tensor_move)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+ # This test requires __uint128_t which is only available on 64bit systems
+ ei_add_test(cxx11_tensor_uint128)
+endif()
endif()
@@ -269,26 +269,83 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
- ei_add_test(cxx11_tensor_complex_cuda)
- ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
- ei_add_test(cxx11_tensor_reduction_cuda)
- ei_add_test(cxx11_tensor_argmax_cuda)
- ei_add_test(cxx11_tensor_cast_float16_cuda)
- ei_add_test(cxx11_tensor_scan_cuda)
+ ei_add_test(cxx11_tensor_complex_gpu)
+ ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+ ei_add_test(cxx11_tensor_reduction_gpu)
+ ei_add_test(cxx11_tensor_argmax_gpu)
+ ei_add_test(cxx11_tensor_cast_float16_gpu)
+ ei_add_test(cxx11_tensor_scan_gpu)
# Contractions require arch 3.0 or higher
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
ei_add_test(cxx11_tensor_device)
- ei_add_test(cxx11_tensor_cuda)
- ei_add_test(cxx11_tensor_contract_cuda)
- ei_add_test(cxx11_tensor_of_float16_cuda)
+ ei_add_test(cxx11_tensor_gpu)
+ ei_add_test(cxx11_tensor_contract_gpu)
+ ei_add_test(cxx11_tensor_of_float16_gpu)
endif()
# The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
- ei_add_test(cxx11_tensor_random_cuda)
+ ei_add_test(cxx11_tensor_random_gpu)
endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+ if (EXISTS ${HIP_PATH})
+
+ list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+ if (${HIP_PLATFORM} STREQUAL "hcc")
+
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
+ include_directories(${HIP_PATH}/include)
+
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+ #
+ # complex datatype is not yet supported by HIP
+ # so leaving out those tests for now
+ #
+ # ei_add_test(cxx11_tensor_complex_gpu)
+ # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+ #
+ ei_add_test(cxx11_tensor_reduction_gpu)
+ ei_add_test(cxx11_tensor_argmax_gpu)
+ ei_add_test(cxx11_tensor_cast_float16_gpu)
+ ei_add_test(cxx11_tensor_scan_gpu)
+ ei_add_test(cxx11_tensor_device)
+
+ ei_add_test(cxx11_tensor_gpu)
+ ei_add_test(cxx11_tensor_contract_gpu)
+ ei_add_test(cxx11_tensor_of_float16_gpu)
+ ei_add_test(cxx11_tensor_random_gpu)
+
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+ elseif (${HIP_PLATFORM} STREQUAL "nvcc")
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
+
+ endif(HIP_FOUND)
+
+ else ()
+
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+ endif()
+
+endif(EIGEN_TEST_HIP)
+
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
index 500fb2d17..572fc08a3 100644
--- a/unsupported/test/EulerAngles.cpp
+++ b/unsupported/test/EulerAngles.cpp
@@ -272,7 +272,7 @@ template<typename Scalar> void eulerangles_rand()
check_all_var(ea);
}
-void test_EulerAngles()
+EIGEN_DECLARE_TEST(EulerAngles)
{
// Simple cast test
EulerAnglesXYZd onesEd(1, 1, 1);
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index 8b7528fb7..cfe559ebd 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -225,7 +225,7 @@ void test_return_by_value(int len)
VERIFY( (in1-in).norm() < test_precision<float>() );
}
-void test_FFTW()
+EIGEN_DECLARE_TEST(FFTW)
{
CALL_SUBTEST( test_return_by_value(32) );
//CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 1d682dd83..cc95cea29 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -1818,7 +1818,7 @@ void testNistEckerle4(void)
VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
}
-void test_NonLinearOptimization()
+EIGEN_DECLARE_TEST(NonLinearOptimization)
{
// Tests using the examples provided by (c)minpack
CALL_SUBTEST/*_1*/(testChkder());
diff --git a/unsupported/test/NumericalDiff.cpp b/unsupported/test/NumericalDiff.cpp
index 27d888056..35f2f6d7c 100644
--- a/unsupported/test/NumericalDiff.cpp
+++ b/unsupported/test/NumericalDiff.cpp
@@ -107,7 +107,7 @@ void test_central()
VERIFY_IS_APPROX(jac, actual_jac);
}
-void test_NumericalDiff()
+EIGEN_DECLARE_TEST(NumericalDiff)
{
CALL_SUBTEST(test_forward());
CALL_SUBTEST(test_central());
diff --git a/unsupported/test/alignedvector3.cpp b/unsupported/test/alignedvector3.cpp
index 252cb1d3f..fcc89daab 100644
--- a/unsupported/test/alignedvector3.cpp
+++ b/unsupported/test/alignedvector3.cpp
@@ -76,7 +76,7 @@ void alignedvector3()
VERIFY(ss1.str()==ss2.str());
}
-void test_alignedvector3()
+EIGEN_DECLARE_TEST(alignedvector3)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST( alignedvector3<float>() );
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 85743137e..495cee03e 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -306,6 +306,8 @@ double bug_1222() {
return denom.value();
}
+#ifdef EIGEN_TEST_PART_5
+
double bug_1223() {
using std::min;
typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
@@ -326,8 +328,8 @@ double bug_1223() {
// regression test for some compilation issues with specializations of ScalarBinaryOpTraits
void bug_1260() {
- Matrix4d A;
- Vector4d v;
+ Matrix4d A = Matrix4d::Ones();
+ Vector4d v = Vector4d::Ones();
A*v;
}
@@ -336,7 +338,7 @@ double bug_1261() {
typedef AutoDiffScalar<Matrix2d> AD;
typedef Matrix<AD,2,1> VectorAD;
- VectorAD v;
+ VectorAD v(0.,0.);
const AD maxVal = v.maxCoeff();
const AD minVal = v.minCoeff();
return maxVal.value() + minVal.value();
@@ -344,13 +346,15 @@ double bug_1261() {
double bug_1264() {
typedef AutoDiffScalar<Vector2d> AD;
- const AD s;
- const Matrix<AD, 3, 1> v1;
+ const AD s = 0.;
+ const Matrix<AD, 3, 1> v1(0.,0.,0.);
const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
return v2(0).value();
}
-void test_autodiff()
+#endif
+
+EIGEN_DECLARE_TEST(autodiff)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( test_autodiff_scalar<1>() );
@@ -359,9 +363,9 @@ void test_autodiff()
CALL_SUBTEST_4( test_autodiff_hessian<1>() );
}
- bug_1222();
- bug_1223();
- bug_1260();
- bug_1261();
+ CALL_SUBTEST_5( bug_1222() );
+ CALL_SUBTEST_5( bug_1223() );
+ CALL_SUBTEST_5( bug_1260() );
+ CALL_SUBTEST_5( bug_1261() );
}
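Besides the entry-point rename, this file gains value-initialization for previously uninitialized AutoDiffScalar objects and moves the bug_12xx regression helpers behind EIGEN_TEST_PART_5, matching the new CALL_SUBTEST_5 wrappers. A minimal sketch of the part mechanism assumed here (the real definitions live in test/main.h):

    // Sketch under the assumption that subtest parts map to preprocessor
    // symbols; EIGEN_TEST_PART_ALL stands in for "all parts enabled".
    #if defined(EIGEN_TEST_PART_5) || defined(EIGEN_TEST_PART_ALL)
    #define CALL_SUBTEST_5(FUNC) CALL_SUBTEST(FUNC)
    #else
    #define CALL_SUBTEST_5(FUNC)  // compiled out for other parts
    #endif

This is why bug_1223 and friends are now wrapped in #ifdef EIGEN_TEST_PART_5: without the guard they would be compiled (and warn as unused) in every other part's translation unit.
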
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index 9cf11280c..e81a7788b 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -81,12 +81,15 @@ void check_limits_specialization()
typedef std::numeric_limits<AD> A;
typedef std::numeric_limits<Scalar> B;
+ // workaround "unused typedef" warning:
+ VERIFY(!bool(internal::is_same<B, A>::value));
+
#if EIGEN_HAS_CXX11
VERIFY(bool(std::is_base_of<B, A>::value));
#endif
}
-void test_autodiff_scalar()
+EIGEN_DECLARE_TEST(autodiff_scalar)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( check_atan2<float>() );
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 3b598bf42..2f1418684 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -135,7 +135,7 @@ static void test_stress_eventcount()
}
}
-void test_cxx11_eventcount()
+EIGEN_DECLARE_TEST(cxx11_eventcount)
{
CALL_SUBTEST(test_basic_eventcount());
CALL_SUBTEST(test_stress_eventcount());
diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
index 8911c59d8..510e11032 100644
--- a/unsupported/test/cxx11_meta.cpp
+++ b/unsupported/test/cxx11_meta.cpp
@@ -340,7 +340,7 @@ static void test_array_misc()
VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5);
}
-void test_cxx11_meta()
+EIGEN_DECLARE_TEST(cxx11_meta)
{
CALL_SUBTEST(test_gen_numeric_list());
CALL_SUBTEST(test_concat());
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 48cd2d4e4..e73a034b1 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -18,7 +18,7 @@ static void test_create_destroy_empty_pool()
// Just create and destroy the pool. This will wind up and tear down worker
// threads. Ensure there are no issues in that logic.
for (int i = 0; i < 16; ++i) {
- NonBlockingThreadPool tp(i);
+ ThreadPool tp(i);
}
}
@@ -27,7 +27,7 @@ static void test_parallelism(bool allow_spinning)
{
// Test we never-ever fail to match available tasks with idle threads.
const int kThreads = 16; // code below expects that this is a multiple of 4
- NonBlockingThreadPool tp(kThreads, allow_spinning);
+ ThreadPool tp(kThreads, allow_spinning);
VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
for (int iter = 0; iter < 100; ++iter) {
@@ -104,7 +104,7 @@ static void test_parallelism(bool allow_spinning)
static void test_cancel()
{
- NonBlockingThreadPool tp(2);
+ ThreadPool tp(2);
// Schedule a large number of closure that each sleeps for one second. This
// will keep the thread pool busy for much longer than the default test timeout.
@@ -116,7 +116,7 @@ static void test_cancel()
tp.Cancel();
}
-void test_cxx11_non_blocking_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool)
{
CALL_SUBTEST(test_create_destroy_empty_pool());
CALL_SUBTEST(test_parallelism(true));
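The NonBlockingThreadPool spelling disappears from the thread-pool tests in favor of plain ThreadPool. At the call sites this is a pure rename; one plausible way the library provides the shorter name, assuming a typedef in the ThreadPool module rather than a new class, would be:

    // Assumption only: the non-blocking implementation becomes the default
    // pool type exposed under the generic name.
    typedef NonBlockingThreadPool ThreadPool;
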
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index 91f690114..8fc5a3074 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -227,7 +227,7 @@ void test_stress_runqueue()
VERIFY(total.load() == 0);
}
-void test_cxx11_runqueue()
+EIGEN_DECLARE_TEST(cxx11_runqueue)
{
CALL_SUBTEST_1(test_basic_runqueue());
CALL_SUBTEST_2(test_empty_runqueue());
diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp
index 037767270..4a0c8967b 100644
--- a/unsupported/test/cxx11_tensor_argmax.cpp
+++ b/unsupported/test/cxx11_tensor_argmax.cpp
@@ -273,7 +273,7 @@ static void test_argmin_dim()
}
}
-void test_cxx11_tensor_argmax()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax)
{
CALL_SUBTEST(test_simple_index_tuples<RowMajor>());
CALL_SUBTEST(test_simple_index_tuples<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 3d73d491a..79f4066e9 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -9,16 +9,18 @@
#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
using Eigen::Tensor;
template <int Layout>
-void test_cuda_simple_argmax()
+void test_gpu_simple_argmax()
{
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
@@ -34,13 +36,13 @@ void test_cuda_simple_argmax()
double* d_in;
DenseIndex* d_out_max;
DenseIndex* d_out_min;
- cudaMalloc((void**)(&d_in), in_bytes);
- cudaMalloc((void**)(&d_out_max), out_bytes);
- cudaMalloc((void**)(&d_out_min), out_bytes);
+ gpuMalloc((void**)(&d_in), in_bytes);
+ gpuMalloc((void**)(&d_out_max), out_bytes);
+ gpuMalloc((void**)(&d_out_min), out_bytes);
- cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
@@ -50,20 +52,20 @@ void test_cuda_simple_argmax()
gpu_out_max.device(gpu_device) = gpu_in.argmax();
gpu_out_min.device(gpu_device) = gpu_in.argmin();
- assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
- cudaFree(d_in);
- cudaFree(d_out_max);
- cudaFree(d_out_min);
+ gpuFree(d_in);
+ gpuFree(d_out_max);
+ gpuFree(d_out_min);
}
template <int DataLayout>
-void test_cuda_argmax_dim()
+void test_gpu_argmax_dim()
{
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims;
@@ -97,12 +99,12 @@ void test_cuda_argmax_dim()
float* d_in;
DenseIndex* d_out;
- cudaMalloc((void**)(&d_in), in_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in), in_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -110,8 +112,8 @@ void test_cuda_argmax_dim()
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
- assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(),
size_t(2*3*5*7 / tensor.dimension(dim)));
@@ -134,25 +136,25 @@ void test_cuda_argmax_dim()
}
}
- cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
- assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
}
template <int DataLayout>
-void test_cuda_argmin_dim()
+void test_gpu_argmin_dim()
{
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims;
@@ -186,12 +188,12 @@ void test_cuda_argmin_dim()
float* d_in;
DenseIndex* d_out;
- cudaMalloc((void**)(&d_in), in_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in), in_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -199,8 +201,8 @@ void test_cuda_argmin_dim()
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
- assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(),
2*3*5*7 / tensor.dimension(dim));
@@ -223,29 +225,29 @@ void test_cuda_argmin_dim()
}
}
- cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
- assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect min to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
}
-void test_cxx11_tensor_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu)
{
- CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
- CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
- CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
- CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
- CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
- CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
+ CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
+ CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
+ CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
+ CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
+ CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
}
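The cuda* → gpu* rename running through this file leans on the newly included TensorGpuHipCudaDefines.h, which is expected to alias the backend-neutral gpu* names to the CUDA or HIP runtime depending on how the translation unit is compiled. A minimal sketch of the assumed mapping (the real header covers many more symbols and guards):

    // Sketch: symbol names taken from their uses in the test above; the
    // selection macro (here EIGEN_USE_HIP) is an assumption.
    #if defined(EIGEN_USE_HIP)
    #define gpuMalloc             hipMalloc
    #define gpuMemcpy             hipMemcpy
    #define gpuMemcpyAsync        hipMemcpyAsync
    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
    #define gpuStreamSynchronize  hipStreamSynchronize
    #define gpuFree               hipFree
    #define gpuSuccess            hipSuccess
    #else
    #define gpuMalloc             cudaMalloc
    #define gpuMemcpy             cudaMemcpy
    #define gpuMemcpyAsync        cudaMemcpyAsync
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
    #define gpuStreamSynchronize  cudaStreamSynchronize
    #define gpuFree               cudaFree
    #define gpuSuccess            cudaSuccess
    #endif

Eigen::GpuStreamDevice plays the same role on the C++ side, standing in for the old Eigen::CudaStreamDevice.
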
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
index 521a7f82c..0bbb0f6dc 100644
--- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_argmax_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -237,7 +237,7 @@ template<typename DataType, typename Device_Selector> void sycl_argmax_test_per_
test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_argmax_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_argmax_test_per_device<double>(device));
}
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index 8fe85d83c..ce9d24369 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -358,7 +358,7 @@ static void test_std_initializers_tensor() {
#endif // EIGEN_HAS_VARIADIC_TEMPLATES
}
-void test_cxx11_tensor_assign()
+EIGEN_DECLARE_TEST(cxx11_tensor_assign)
{
CALL_SUBTEST(test_1d());
CALL_SUBTEST(test_2d());
diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
new file mode 100644
index 000000000..6feeff231
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_access.cpp
@@ -0,0 +1,995 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Andy Davis <andydavis@google.com>
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <set>
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::Index;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+
+using internal::TensorBlockShapeType;
+
+template<typename T>
+static const T& choose(int layout, const T& col, const T& row) {
+ return layout == ColMajor ? col : row;
+}
+
+static const TensorBlockShapeType RandomShape() {
+ return internal::random<bool>()
+ ? internal::TensorBlockShapeType::kUniformAllDims
+ : internal::TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static std::size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
+ return internal::random<int>(1, dims.TotalSize());
+}
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims() {
+ array<Index, NumDims> dims;
+ for (int i = 0; i < NumDims; ++i) {
+ dims[i] = internal::random<int>(1, 20);
+ }
+ return DSizes<Index, NumDims>(dims);
+}
+
+/** Dummy data type to test TensorBlock copy ops. */
+struct Data {
+ Data() : Data(0) {}
+ explicit Data(int v) { value = v; }
+ int value;
+};
+
+bool operator==(const Data& lhs, const Data& rhs) {
+ return lhs.value == rhs.value;
+}
+
+std::ostream& operator<<(std::ostream& os, const Data& d) {
+ os << "Data: value=" << d.value;
+ return os;
+}
+
+template <typename T>
+static T* GenerateRandomData(const Index& size) {
+ T* data = new T[size];
+ for (int i = 0; i < size; ++i) {
+ data[i] = internal::random<T>();
+ }
+ return data;
+}
+
+template <>
+Data* GenerateRandomData(const Index& size) {
+ Data* data = new Data[size];
+ for (int i = 0; i < size; ++i) {
+ data[i] = Data(internal::random<int>(1, 100));
+ }
+ return data;
+}
+
+template <int NumDims>
+static void Debug(DSizes<Index, NumDims> dims) {
+ for (int i = 0; i < NumDims; ++i) {
+ std::cout << dims[i] << "; ";
+ }
+ std::cout << std::endl;
+}
+
+template <int Layout>
+static void test_block_mapper_sanity()
+{
+ using T = int;
+ using TensorBlock = internal::TensorBlock<T, Index, 2, Layout>;
+ using TensorBlockMapper = internal::TensorBlockMapper<T, Index, 2, Layout>;
+
+ DSizes<Index, 2> tensor_dims(100, 100);
+
+ // Test uniform blocks.
+ TensorBlockMapper uniform_block_mapper(
+ tensor_dims, internal::TensorBlockShapeType::kUniformAllDims, 100);
+
+ VERIFY_IS_EQUAL(uniform_block_mapper.total_block_count(), 100);
+ VERIFY_IS_EQUAL(uniform_block_mapper.block_dims_total_size(), 100);
+
+ // 10x10 blocks
+ auto uniform_b0 = uniform_block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(uniform_b0.block_sizes().at(0), 10);
+ VERIFY_IS_EQUAL(uniform_b0.block_sizes().at(1), 10);
+ // Depending on the layout we stride by columns or rows.
+ VERIFY_IS_EQUAL(uniform_b0.block_strides().at(0), choose(Layout, 1, 10));
+ VERIFY_IS_EQUAL(uniform_b0.block_strides().at(1), choose(Layout, 10, 1));
+ // Tensor strides depend only on the layout, not on the block size.
+ VERIFY_IS_EQUAL(uniform_b0.tensor_strides().at(0), choose(Layout, 1, 100));
+ VERIFY_IS_EQUAL(uniform_b0.tensor_strides().at(1), choose(Layout, 100, 1));
+
+ // Test skewed to inner dims blocks.
+ TensorBlockMapper skewed_block_mapper(
+ tensor_dims, internal::TensorBlockShapeType::kSkewedInnerDims, 100);
+
+ VERIFY_IS_EQUAL(skewed_block_mapper.total_block_count(), 100);
+ VERIFY_IS_EQUAL(skewed_block_mapper.block_dims_total_size(), 100);
+
+ // 1x100 (or 100x1) blocks, depending on the tensor layout.
+ auto skewed_b0 = skewed_block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(skewed_b0.block_sizes().at(0), choose(Layout, 100, 1));
+ VERIFY_IS_EQUAL(skewed_b0.block_sizes().at(1), choose(Layout, 1, 100));
+ // Depending on the layout we stride by columns or rows.
+ VERIFY_IS_EQUAL(skewed_b0.block_strides().at(0), choose(Layout, 1, 100));
+ VERIFY_IS_EQUAL(skewed_b0.block_strides().at(1), choose(Layout, 100, 1));
+ // Tensor strides depend only on the layout, not on the block size.
+ VERIFY_IS_EQUAL(skewed_b0.tensor_strides().at(0), choose(Layout, 1, 100));
+ VERIFY_IS_EQUAL(skewed_b0.tensor_strides().at(1), choose(Layout, 100, 1));
+}
+
+// Given a TensorBlock, "visit" every element accessible through it, and record
+// each visited index in a set. Verify that every coeff is accessed only once.
+template <typename T, int Layout, int NumDims>
+static void UpdateCoeffSet(
+ const internal::TensorBlock<T, Index, NumDims, Layout>& block,
+ Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
+ const DSizes<Index, NumDims> block_sizes = block.block_sizes();
+ const DSizes<Index, NumDims> tensor_strides = block.tensor_strides();
+
+ for (int i = 0; i < block_sizes[dim_index]; ++i) {
+ if (tensor_strides[dim_index] == 1) {
+ auto inserted = visited_coeffs->insert(first_coeff_index + i);
+ VERIFY_IS_EQUAL(inserted.second, true);
+ } else {
+ int next_dim_index = dim_index + choose(Layout, -1, 1);
+ UpdateCoeffSet<T, Layout, NumDims>(block, first_coeff_index,
+ next_dim_index, visited_coeffs);
+ first_coeff_index += tensor_strides[dim_index];
+ }
+ }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_mapper_maps_every_element() {
+ using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
+ using TensorBlockMapper =
+ internal::TensorBlockMapper<T, Index, NumDims, Layout>;
+
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>();
+
+ // Keep track of element indices available via block access.
+ std::set<Index> coeff_set;
+
+ // Try different combinations of block types and sizes.
+ TensorBlockMapper block_mapper(dims, RandomShape(), RandomTargetSize(dims));
+
+ for (int i = 0; i < block_mapper.total_block_count(); ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
+ UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
+ choose(Layout, NumDims - 1, 0),
+ &coeff_set);
+ }
+
+ // Verify that every coefficient in the original Tensor is accessible through
+ // a TensorBlock exactly once.
+ Index total_coeffs = dims.TotalSize();
+ VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
+ VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
+ VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_slice_block_mapper_maps_every_element() {
+ using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
+ using TensorSliceBlockMapper =
+ internal::TensorSliceBlockMapper<T, Index, NumDims, Layout>;
+
+ DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
+ DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
+ DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>();
+
+ // Make sure that tensor offsets + extents do not overflow.
+ for (int i = 0; i < NumDims; ++i) {
+ tensor_slice_offsets[i] =
+ numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]);
+ tensor_slice_extents[i] = numext::mini(
+ tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]);
+ }
+
+ // Keep track of element indices available via block access.
+ std::set<Index> coeff_set;
+
+ auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());
+
+ // Pick random dimension sizes for the tensor blocks.
+ DSizes<Index, NumDims> block_sizes;
+ for (int i = 0; i < NumDims; ++i) {
+ block_sizes[i] = internal::random<int>(1, tensor_slice_extents[i]);
+ }
+
+ TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
+ tensor_slice_extents, block_sizes,
+ DimensionList<Index, NumDims>());
+
+ for (int i = 0; i < block_mapper.total_block_count(); ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
+ UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
+ choose(Layout, NumDims - 1, 0),
+ &coeff_set);
+ }
+
+ VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_data_from_source_to_target() {
+ typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
+ typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
+ TensorBlockMapper;
+
+ typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
+ TensorBlockReader;
+ typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
+ TensorBlockWriter;
+
+ DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
+ const auto input_tensor_size = input_tensor_dims.TotalSize();
+
+ T* input_data = GenerateRandomData<T>(input_tensor_size);
+ T* output_data = new T[input_tensor_size];
+
+ TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
+ RandomTargetSize(input_tensor_dims));
+ T* block_data = new T[block_mapper.block_dims_total_size()];
+
+ for (int i = 0; i < block_mapper.total_block_count(); ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
+ TensorBlockReader::Run(&block, input_data);
+ TensorBlockWriter::Run(block, output_data);
+ }
+
+ for (int i = 0; i < input_tensor_size; ++i) {
+ VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+ }
+
+ delete[] input_data;
+ delete[] output_data;
+ delete[] block_data;
+}
+
+template <int Layout, int NumDims>
+static int GetInputIndex(Index output_index,
+ const array<Index, NumDims>& output_to_input_dim_map,
+ const array<Index, NumDims>& input_strides,
+ const array<Index, NumDims>& output_strides) {
+ int input_index = 0;
+ if (Layout == ColMajor) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const int idx = output_index / output_strides[i];
+ input_index += idx * input_strides[output_to_input_dim_map[i]];
+ output_index -= idx * output_strides[i];
+ }
+ return input_index +
+ output_index * input_strides[output_to_input_dim_map[0]];
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const int idx = output_index / output_strides[i];
+ input_index += idx * input_strides[output_to_input_dim_map[i]];
+ output_index -= idx * output_strides[i];
+ }
+ return input_index +
+ output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+ }
+}
+
+template <int Layout, int NumDims>
+static array<Index, NumDims> ComputeStrides(
+ const array<Index, NumDims>& sizes) {
+ array<Index, NumDims> strides;
+ if (Layout == ColMajor) {
+ strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ strides[i] = strides[i - 1] * sizes[i - 1];
+ }
+ } else {
+ strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ strides[i] = strides[i + 1] * sizes[i + 1];
+ }
+ }
+ return strides;
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_using_reordered_dimensions() {
+ typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
+ typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
+ TensorBlockMapper;
+
+ typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
+ TensorBlockReader;
+ typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
+ TensorBlockWriter;
+
+ DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
+ const auto input_tensor_size = input_tensor_dims.TotalSize();
+
+ // Create a random input tensor.
+ T* input_data = GenerateRandomData<T>(input_tensor_size);
+
+ // Create a random dimension re-ordering/shuffle.
+ std::vector<Index> shuffle;
+ for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+ std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+ DSizes<Index, NumDims> output_tensor_dims;
+ array<Index, NumDims> input_to_output_dim_map;
+ array<Index, NumDims> output_to_input_dim_map;
+ for (Index i = 0; i < NumDims; ++i) {
+ output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
+ input_to_output_dim_map[i] = shuffle[i];
+ output_to_input_dim_map[shuffle[i]] = i;
+ }
+
+ // Random block shape and size.
+ TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
+ RandomTargetSize(input_tensor_dims));
+
+ auto* block_data = new T[block_mapper.block_dims_total_size()];
+ auto* output_data = new T[input_tensor_size];
+
+ array<Index, NumDims> input_tensor_strides =
+ ComputeStrides<Layout, NumDims>(input_tensor_dims);
+ array<Index, NumDims> output_tensor_strides =
+ ComputeStrides<Layout, NumDims>(output_tensor_dims);
+
+ for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
+ const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
+ block.first_coeff_index(), output_to_input_dim_map,
+ input_tensor_strides, output_tensor_strides);
+ TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
+ input_tensor_strides, input_data);
+ TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map,
+ input_tensor_strides, output_data);
+ }
+
+ for (int i = 0; i < input_tensor_size; ++i) {
+ VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+ }
+
+ delete[] input_data;
+ delete[] block_data;
+ delete[] output_data;
+}
+
+template <int Layout>
+static void test_block_io_zero_stride()
+{
+ typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
+ typedef internal::TensorBlockReader<float, Index, 5, Layout>
+ TensorBlockReader;
+ typedef internal::TensorBlockWriter<float, Index, 5, Layout>
+ TensorBlockWriter;
+
+ DSizes<Index, 5> rnd_dims = RandomDims<5>();
+
+ DSizes<Index, 5> input_tensor_dims = rnd_dims;
+ input_tensor_dims[0] = 1;
+ input_tensor_dims[2] = 1;
+ input_tensor_dims[4] = 1;
+ const auto input_tensor_size = input_tensor_dims.TotalSize();
+ auto* input_data = GenerateRandomData<float>(input_tensor_size);
+
+ DSizes<Index, 5> output_tensor_dims = rnd_dims;
+
+ DSizes<Index, 5> input_tensor_strides(
+ ComputeStrides<Layout, 5>(input_tensor_dims));
+ DSizes<Index, 5> output_tensor_strides(
+ ComputeStrides<Layout, 5>(output_tensor_dims));
+
+ DSizes<Index, 5> input_tensor_strides_with_zeros(input_tensor_strides);
+ input_tensor_strides_with_zeros[0] = 0;
+ input_tensor_strides_with_zeros[2] = 0;
+ input_tensor_strides_with_zeros[4] = 0;
+
+ // Verify that data was correctly read/written from/into the block.
+ const auto verify_is_equal = [&](const float* output_data) {
+ for (int i = 0; i < output_tensor_dims[0]; ++i) {
+ for (int j = 0; j < output_tensor_dims[1]; ++j) {
+ for (int k = 0; k < output_tensor_dims[2]; ++k) {
+ for (int l = 0; l < output_tensor_dims[3]; ++l) {
+ for (int m = 0; m < output_tensor_dims[4]; ++m) {
+ const Index output_offset =
+ i * output_tensor_strides[0] + j * output_tensor_strides[1] +
+ k * output_tensor_strides[2] + l * output_tensor_strides[3] +
+ m * output_tensor_strides[4];
+ const Index input_offset =
+ i % input_tensor_dims[0] * input_tensor_strides[0] +
+ j % input_tensor_dims[1] * input_tensor_strides[1] +
+ k % input_tensor_dims[2] * input_tensor_strides[2] +
+ l % input_tensor_dims[3] * input_tensor_strides[3] +
+ m % input_tensor_dims[4] * input_tensor_strides[4];
+ VERIFY_IS_EQUAL(output_data[output_offset],
+ input_data[input_offset]);
+ }
+ }
+ }
+ }
+ }
+ };
+
+ {
+ auto* output_data = new float[output_tensor_dims.TotalSize()];
+ TensorBlock read_block(0, output_tensor_dims, output_tensor_strides,
+ input_tensor_strides_with_zeros, output_data);
+ TensorBlockReader::Run(&read_block, input_data);
+ verify_is_equal(output_data);
+ delete[] output_data;
+ }
+
+ {
+ auto* output_data = new float[output_tensor_dims.TotalSize()];
+ TensorBlock write_block(0, output_tensor_dims,
+ input_tensor_strides_with_zeros,
+ output_tensor_strides, input_data);
+ TensorBlockWriter::Run(write_block, output_data);
+ verify_is_equal(output_data);
+ delete[] output_data;
+ }
+
+ delete[] input_data;
+}
+
+template <int Layout>
+static void test_block_io_squeeze_ones() {
+ typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
+ typedef internal::TensorBlockReader<float, Index, 5, Layout>
+ TensorBlockReader;
+ typedef internal::TensorBlockWriter<float, Index, 5, Layout>
+ TensorBlockWriter;
+
+ // Total size > 1.
+ {
+ DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
+ const auto total_size = block_sizes.TotalSize();
+
+ // Create a random input tensor.
+ auto* input_data = GenerateRandomData<float>(total_size);
+ DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+
+ {
+ auto* output_data = new float[block_sizes.TotalSize()];
+ TensorBlock read_block(0, block_sizes, strides, strides, output_data);
+ TensorBlockReader::Run(&read_block, input_data);
+ for (int i = 0; i < total_size; ++i) {
+ VERIFY_IS_EQUAL(output_data[i], input_data[i]);
+ }
+ delete[] output_data;
+ }
+
+ {
+ auto* output_data = new float[block_sizes.TotalSize()];
+ TensorBlock write_block(0, block_sizes, strides, strides, input_data);
+ TensorBlockWriter::Run(write_block, output_data);
+ for (int i = 0; i < total_size; ++i) {
+ VERIFY_IS_EQUAL(output_data[i], input_data[i]);
+ }
+ delete[] output_data;
+ }
+ }
+
+ // Total size == 1.
+ {
+ DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
+ const auto total_size = block_sizes.TotalSize();
+
+ // Create a random input tensor.
+ auto* input_data = GenerateRandomData<float>(total_size);
+ DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+
+ {
+ auto* output_data = new float[block_sizes.TotalSize()];
+ TensorBlock read_block(0, block_sizes, strides, strides, output_data);
+ TensorBlockReader::Run(&read_block, input_data);
+ for (int i = 0; i < total_size; ++i) {
+ VERIFY_IS_EQUAL(output_data[i], input_data[i]);
+ }
+ delete[] output_data;
+ }
+
+ {
+ auto* output_data = new float[block_sizes.TotalSize()];
+ TensorBlock write_block(0, block_sizes, strides, strides, input_data);
+ TensorBlockWriter::Run(write_block, output_data);
+ for (int i = 0; i < total_size; ++i) {
+ VERIFY_IS_EQUAL(output_data[i], input_data[i]);
+ }
+ delete[] output_data;
+ }
+ }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_cwise_binary_io_basic() {
+ typedef internal::scalar_sum_op<T> BinaryFunctor;
+ typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
+ Layout>
+ TensorBlockCwiseBinaryIO;
+
+ DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
+ DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));
+
+ const auto total_size = block_sizes.TotalSize();
+
+ // Create random input tensors.
+ T* left_data = GenerateRandomData<T>(total_size);
+ T* right_data = GenerateRandomData<T>(total_size);
+
+ T* output_data = new T[total_size];
+ BinaryFunctor functor;
+ TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
+ strides, left_data, strides, right_data);
+ for (int i = 0; i < total_size; ++i) {
+ VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
+ }
+
+ delete[] left_data;
+ delete[] right_data;
+ delete[] output_data;
+}
+
+template <int Layout>
+static void test_block_cwise_binary_io_squeeze_ones() {
+ typedef internal::scalar_sum_op<float> BinaryFunctor;
+ typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
+ Layout>
+ TensorBlockCwiseBinaryIO;
+
+ DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
+ DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+
+ const auto total_size = block_sizes.TotalSize();
+
+ // Create random input tensors.
+ auto* left_data = GenerateRandomData<float>(total_size);
+ auto* right_data = GenerateRandomData<float>(total_size);
+
+ auto* output_data = new float[total_size];
+ BinaryFunctor functor;
+ TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
+ strides, left_data, strides, right_data);
+ for (int i = 0; i < total_size; ++i) {
+ VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
+ }
+
+ delete[] left_data;
+ delete[] right_data;
+ delete[] output_data;
+}
+
+template <int Layout>
+static void test_block_cwise_binary_io_zero_strides() {
+ typedef internal::scalar_sum_op<float> BinaryFunctor;
+ typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
+ Layout>
+ TensorBlockCwiseBinaryIO;
+
+ DSizes<Index, 5> rnd_dims = RandomDims<5>();
+
+ DSizes<Index, 5> left_sizes = rnd_dims;
+ left_sizes[0] = 1;
+ left_sizes[2] = 1;
+ left_sizes[4] = 1;
+
+ DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
+ left_strides[0] = 0;
+ left_strides[2] = 0;
+ left_strides[4] = 0;
+
+ DSizes<Index, 5> right_sizes = rnd_dims;
+ right_sizes[1] = 1;
+ right_sizes[3] = 1;
+
+ DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
+ right_strides[1] = 0;
+ right_strides[3] = 0;
+
+ // Generate random data.
+ auto* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
+ auto* right_data = GenerateRandomData<float>(right_sizes.TotalSize());
+
+ DSizes<Index, 5> output_sizes = rnd_dims;
+ DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));
+
+ const auto output_total_size = output_sizes.TotalSize();
+ auto* output_data = new float[output_total_size];
+
+ BinaryFunctor functor;
+ TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
+ output_data, left_strides, left_data,
+ right_strides, right_data);
+ for (int i = 0; i < rnd_dims[0]; ++i) {
+ for (int j = 0; j < rnd_dims[1]; ++j) {
+ for (int k = 0; k < rnd_dims[2]; ++k) {
+ for (int l = 0; l < rnd_dims[3]; ++l) {
+ for (int m = 0; m < rnd_dims[4]; ++m) {
+ Index output_index = i * output_strides[0] + j * output_strides[1] +
+ k * output_strides[2] + l * output_strides[3] +
+ m * output_strides[4];
+ Index left_index = i * left_strides[0] + j * left_strides[1] +
+ k * left_strides[2] + l * left_strides[3] +
+ m * left_strides[4];
+ Index right_index = i * right_strides[0] + j * right_strides[1] +
+ k * right_strides[2] + l * right_strides[3] +
+ m * right_strides[4];
+ VERIFY_IS_EQUAL(
+ output_data[output_index],
+ functor(left_data[left_index], right_data[right_index]));
+ }
+ }
+ }
+ }
+ }
+
+ delete[] left_data;
+ delete[] right_data;
+ delete[] output_data;
+}
+
+template <int Layout>
+static void test_uniform_block_shape()
+{
+ using T = int;
+ typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
+ typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
+
+ {
+ // Test shape 'UniformAllDims' with a uniform 'max_coeff_count'.
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 5 * 5 * 5 * 5 * 5;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ for (int i = 0; i < 5; ++i) {
+ VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'UniformAllDims' with a larger 'max_coeff_count' which spills
+ // partially into the first inner-most dimension.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 7 * 5 * 5 * 5 * 5;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
+ for (int i = 1; i < 5; ++i) {
+ VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 5 * 5 * 5 * 5 * 6;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
+ for (int i = 3; i >= 0; --i) {
+ VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'UniformAllDims' with a larger 'max_coeff_count' which spills
+ // fully into the first inner-most dimension.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 11 * 5 * 5 * 5 * 5;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+ for (int i = 1; i < 5; ++i) {
+ VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 5 * 5 * 5 * 5 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ for (int i = 3; i >= 0; --i) {
+ VERIFY_IS_EQUAL(5, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'UniformAllDims' with a larger 'max_coeff_count' which spills
+ // fully into the first few inner-most dimensions.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+ const size_t max_coeff_count = 7 * 5 * 6 * 7 * 5;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+ VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[3]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[4]);
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+ const size_t max_coeff_count = 5 * 5 * 5 * 6 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ VERIFY_IS_EQUAL(6, block.block_sizes()[3]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[0]);
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'UniformAllDims' with full allocation to all dims.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+ const size_t max_coeff_count = 7 * 5 * 6 * 17 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+ VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+ VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+ const size_t max_coeff_count = 7 * 5 * 6 * 9 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ VERIFY_IS_EQUAL(9, block.block_sizes()[3]);
+ VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[0]);
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+}
+
+template <int Layout>
+static void test_skewed_inner_dim_block_shape()
+{
+ using T = int;
+ typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
+ typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
+
+ // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 10 * 1 * 1 * 1 * 1;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(10, block.block_sizes()[0]);
+ for (int i = 1; i < 5; ++i) {
+ VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 1 * 1 * 1 * 1 * 6;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(6, block.block_sizes()[4]);
+ for (int i = 3; i >= 0; --i) {
+ VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 11 * 1 * 1 * 1 * 1;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+ for (int i = 1; i < 5; ++i) {
+ VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 1 * 1 * 1 * 1 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ for (int i = 3; i >= 0; --i) {
+ VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+ // and partial allocation to second inner-dim.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 11 * 3 * 1 * 1 * 1;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+ VERIFY_IS_EQUAL(3, block.block_sizes()[1]);
+ for (int i = 2; i < 5; ++i) {
+ VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 1 * 1 * 1 * 15 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ VERIFY_IS_EQUAL(15, block.block_sizes()[3]);
+ for (int i = 2; i >= 0; --i) {
+ VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+ // and partial allocation to third inner-dim.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 11 * 5 * 5 * 1 * 1;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
+ for (int i = 3; i < 5; ++i) {
+ VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 1 * 1 * 5 * 17 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[2]);
+ for (int i = 1; i >= 0; --i) {
+ VERIFY_IS_EQUAL(1, block.block_sizes()[i]);
+ }
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'SkewedInnerDims' with full allocation to all dims.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+ VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+ VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7;
+ TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims,
+ max_coeff_count);
+ TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr);
+ VERIFY_IS_EQUAL(7, block.block_sizes()[4]);
+ VERIFY_IS_EQUAL(17, block.block_sizes()[3]);
+ VERIFY_IS_EQUAL(6, block.block_sizes()[2]);
+ VERIFY_IS_EQUAL(5, block.block_sizes()[1]);
+ VERIFY_IS_EQUAL(11, block.block_sizes()[0]);
+ VERIFY(block.block_sizes().TotalSize() <= max_coeff_count);
+ }
+}
+
+template <int Layout>
+static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
+{
+ using T = int;
+
+ // Test blocking of tensors with zero dimensions:
+ // - we must not crash on asserts and divisions by zero
+ // - we must not return a block with zero dimensions
+ // (recipe for overflows/underflows, divisions by zero and NaNs later)
+ // - total block count must be zero
+ {
+ typedef internal::TensorBlockMapper<T, Index, 1, Layout> TensorBlockMapper;
+ DSizes<Index, 1> dims(0);
+ for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+ TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
+ VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0);
+ VERIFY(block_mapper.block_dims_total_size() >= 1);
+ }
+ }
+
+ {
+ typedef internal::TensorBlockMapper<T, Index, 2, Layout> TensorBlockMapper;
+ for (int dim1 = 0; dim1 < 3; ++dim1) {
+ for (int dim2 = 0; dim2 < 3; ++dim2) {
+ DSizes<Index, 2> dims(dim1, dim2);
+ for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+ TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count);
+ if (dim1 * dim2 == 0) {
+ VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0);
+ }
+ VERIFY(block_mapper.block_dims_total_size() >= 1);
+ }
+ }
+ }
+ }
+}
+
+#define TEST_LAYOUTS(NAME) \
+ CALL_SUBTEST(NAME<ColMajor>()); \
+ CALL_SUBTEST(NAME<RowMajor>())
+
+#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME) \
+ CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))
+
+#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
+ CALL_SUBTEST(NAME<ColMajor>(ARG)); \
+ CALL_SUBTEST(NAME<RowMajor>(ARG))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
+ TEST_LAYOUTS(test_block_mapper_sanity);
+ TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
+ TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element);
+ TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target);
+ TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target);
+ TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions);
+ TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
+ TEST_LAYOUTS(test_block_io_zero_stride);
+ TEST_LAYOUTS(test_block_io_squeeze_ones);
+ TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
+ TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
+ TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
+ TEST_LAYOUTS(test_uniform_block_shape);
+ TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
+ TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+ TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
+}
+
+#undef TEST_LAYOUTS
+#undef TEST_LAYOUTS_AND_DIMS
+#undef TEST_LAYOUTS_WITH_ARG
\ No newline at end of file
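For orientation in the new file: the object under test is internal::TensorBlockMapper, which partitions a tensor's index space into blocks of at most max_coeff_count coefficients, either uniformly across all dimensions (kUniformAllDims) or skewed toward the inner-most ones (kSkewedInnerDims). A condensed sketch of the usage pattern the tests exercise, with types and signatures exactly as used in this file (internal, unstable API):

    typedef internal::TensorBlockMapper<float, Index, 2, ColMajor> Mapper;
    DSizes<Index, 2> dims(100, 100);
    Mapper mapper(dims, internal::TensorBlockShapeType::kUniformAllDims,
                  /*max_coeff_count=*/100);
    float* scratch = new float[mapper.block_dims_total_size()];
    for (Index i = 0; i < mapper.total_block_count(); ++i) {
      // With a uniform shape and a budget of 100 coeffs this yields 10x10 blocks.
      auto block = mapper.GetBlockForIndex(i, scratch);
      // block.block_sizes(), block.block_strides(), block.first_coeff_index()
      // describe where the block sits inside the full tensor.
    }
    delete[] scratch;
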
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 21fdfca22..20f84b8e0 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -137,7 +137,7 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_broadcast_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
}
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index 5c0ea5889..2f8ab6afd 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -180,8 +180,119 @@ static void test_fixed_size_broadcasting()
#endif
}
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n()
+{
+ Tensor<float, 4, DataLayout> tensor(1,13,5,7);
+ tensor.setRandom();
+ array<ptrdiff_t, 4> broadcasts;
+ broadcasts[0] = 9;
+ broadcasts[1] = 1;
+ broadcasts[2] = 1;
+ broadcasts[3] = 1;
+ Tensor<float, 4, DataLayout> broadcast;
+ broadcast = tensor.broadcast(broadcasts);
+
+ VERIFY_IS_EQUAL(broadcast.dimension(0), 9);
+ VERIFY_IS_EQUAL(broadcast.dimension(1), 13);
+ VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+ VERIFY_IS_EQUAL(broadcast.dimension(3), 7);
+
+ for (int i = 0; i < 9; ++i) {
+ for (int j = 0; j < 13; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l));
+ }
+ }
+ }
+ }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_n_by_one()
+{
+ Tensor<float, 4, DataLayout> tensor(7,3,5,1);
+ tensor.setRandom();
+ array<ptrdiff_t, 4> broadcasts;
+ broadcasts[0] = 1;
+ broadcasts[1] = 1;
+ broadcasts[2] = 1;
+ broadcasts[3] = 19;
+ Tensor<float, 4, DataLayout> broadcast;
+ broadcast = tensor.broadcast(broadcasts);
+
+ VERIFY_IS_EQUAL(broadcast.dimension(0), 7);
+ VERIFY_IS_EQUAL(broadcast.dimension(1), 3);
+ VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+ VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 19; ++l) {
+ VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l));
+ }
+ }
+ }
+ }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_1d()
+{
+ Tensor<float, 3, DataLayout> tensor(1,7,1);
+ tensor.setRandom();
+ array<ptrdiff_t, 3> broadcasts;
+ broadcasts[0] = 5;
+ broadcasts[1] = 1;
+ broadcasts[2] = 13;
+ Tensor<float, 3, DataLayout> broadcasted;
+ broadcasted = tensor.broadcast(broadcasts);
+
+ VERIFY_IS_EQUAL(broadcasted.dimension(0), 5);
+ VERIFY_IS_EQUAL(broadcasted.dimension(1), 7);
+ VERIFY_IS_EQUAL(broadcasted.dimension(2), 13);
+
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ for (int k = 0; k < 13; ++k) {
+ VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k));
+ }
+ }
+ }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_2d()
+{
+ Tensor<float, 4, DataLayout> tensor(1,7,13,1);
+ tensor.setRandom();
+ array<ptrdiff_t, 4> broadcasts;
+ broadcasts[0] = 5;
+ broadcasts[1] = 1;
+ broadcasts[2] = 1;
+ broadcasts[3] = 19;
+ Tensor<float, 4, DataLayout> broadcast;
+ broadcast = tensor.broadcast(broadcasts);
+
+ VERIFY_IS_EQUAL(broadcast.dimension(0), 5);
+ VERIFY_IS_EQUAL(broadcast.dimension(1), 7);
+ VERIFY_IS_EQUAL(broadcast.dimension(2), 13);
+ VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ for (int k = 0; k < 13; ++k) {
+ for (int l = 0; l < 19; ++l) {
+ VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l));
+ }
+ }
+ }
+ }
+}
-void test_cxx11_tensor_broadcasting()
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting)
{
CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
@@ -191,4 +302,12 @@ void test_cxx11_tensor_broadcasting()
CALL_SUBTEST(test_static_broadcasting<RowMajor>());
CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n<RowMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>());
}
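
The subtests above exercise broadcasting along size-1 dimensions, where every read of the output maps back to index 0 on the broadcast axis. As a minimal standalone sketch of the semantics being verified (not part of the patch; names are illustrative):

    // Broadcasting replicates each dimension by its factor; reads wrap
    // around the source extent, so a size-1 axis always maps to index 0.
    #include <unsupported/Eigen/CXX11/Tensor>
    using Eigen::Tensor;

    void broadcast_sketch() {
      Tensor<float, 2> t(1, 3);
      t.setValues({{1.f, 2.f, 3.f}});
      Eigen::array<ptrdiff_t, 2> b{{4, 1}};  // replicate the size-1 row axis 4x
      Tensor<float, 2> r = t.broadcast(b);   // 4x3; r(i, j) == t(i % 1, j % 3) == t(0, j)
    }
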
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index 400a31d09..db2975783 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -257,7 +257,7 @@ static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
}
-void test_cxx11_tensor_builtins_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
QueueInterface queueInterface(device);
Eigen::SyclDevice sycl_device(&queueInterface);
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
index 816e03220..97923d15f 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
@@ -9,7 +9,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
@@ -18,8 +18,8 @@
using Eigen::Tensor;
-void test_cuda_conversion() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -72,8 +72,8 @@ void test_fallback_conversion() {
}
-void test_cxx11_tensor_cast_float16_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu)
{
- CALL_SUBTEST(test_cuda_conversion());
+ CALL_SUBTEST(test_gpu_conversion());
CALL_SUBTEST(test_fallback_conversion());
}
diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
index 3c6d0d2ff..c4fe9a798 100644
--- a/unsupported/test/cxx11_tensor_casts.cpp
+++ b/unsupported/test/cxx11_tensor_casts.cpp
@@ -105,7 +105,7 @@ static void test_small_to_big_type_cast()
}
-void test_cxx11_tensor_casts()
+EIGEN_DECLARE_TEST(cxx11_tensor_casts)
{
CALL_SUBTEST(test_simple_cast());
CALL_SUBTEST(test_vectorized_cast());
diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
index 89cf5c7b7..922274462 100644
--- a/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -410,7 +410,7 @@ static void test_chip_raw_data_row_major()
VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0));
}
-void test_cxx11_tensor_chipping()
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping)
{
CALL_SUBTEST(test_simple_chip<ColMajor>());
CALL_SUBTEST(test_simple_chip<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
index 39e4f0a7f..a91efe00c 100644
--- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -15,7 +15,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_chipping_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -614,7 +614,7 @@ template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_d
test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_chipping_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
index b1ff8aecb..1a18e07cc 100644
--- a/unsupported/test/cxx11_tensor_comparisons.cpp
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -77,7 +77,7 @@ static void test_equality()
}
-void test_cxx11_tensor_comparisons()
+EIGEN_DECLARE_TEST(cxx11_tensor_comparisons)
{
CALL_SUBTEST(test_orderings());
CALL_SUBTEST(test_equality());
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
index aac780905..f2a2a6cfa 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
@@ -8,7 +8,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+
#define EIGEN_USE_GPU
#include "main.h"
@@ -28,7 +28,7 @@ void test_cuda_complex_cwise_ops() {
cudaMalloc((void**)(&d_in2), complex_bytes);
cudaMalloc((void**)(&d_out), complex_bytes);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -48,11 +48,13 @@ void test_cuda_complex_cwise_ops() {
Add = 0,
Sub,
Mul,
- Div
+ Div,
+ Neg,
+ NbOps
};
Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
- for (int op = Add; op <= Div; op++) {
+ for (int op = Add; op < NbOps; op++) {
std::complex<T> expected;
switch (static_cast<CwiseOp>(op)) {
case Add:
@@ -71,6 +73,10 @@ void test_cuda_complex_cwise_ops() {
gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
expected = a / b;
break;
+ case Neg:
+ gpu_out.device(gpu_device) = -gpu_in1;
+ expected = -a;
+ break;
}
assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
gpu_device.stream()) == cudaSuccess);
@@ -87,7 +93,7 @@ void test_cuda_complex_cwise_ops() {
}
-void test_cxx11_tensor_complex_cwise_ops()
+EIGEN_DECLARE_TEST(cxx11_tensor_complex_cwise_ops_gpu)
{
CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
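
The hunk above also shows the trailing-sentinel enum idiom: Neg is inserted before a final NbOps enumerator and the loop bound becomes op < NbOps, so future ops extend the loop without the bound being updated by hand. In isolation:

    // Adding a new op before NbOps automatically grows the dispatch loop.
    enum CwiseOp { Add = 0, Sub, Mul, Div, Neg, NbOps };
    for (int op = Add; op < NbOps; op++) {
      // switch (static_cast<CwiseOp>(op)) { ... }
    }
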
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu
index a52350f85..f8b8ae704 100644
--- a/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_gpu.cu
@@ -8,7 +8,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex
+
#define EIGEN_USE_GPU
#include "main.h"
@@ -34,7 +34,7 @@ void test_cuda_nullary() {
cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -70,7 +70,7 @@ void test_cuda_nullary() {
static void test_cuda_sum_reductions() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
@@ -106,7 +106,7 @@ static void test_cuda_sum_reductions() {
static void test_cuda_mean_reductions() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
@@ -142,7 +142,7 @@ static void test_cuda_mean_reductions() {
static void test_cuda_product_reductions() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
@@ -177,7 +177,7 @@ static void test_cuda_product_reductions() {
}
-void test_cxx11_tensor_complex()
+EIGEN_DECLARE_TEST(cxx11_tensor_complex_gpu)
{
CALL_SUBTEST(test_cuda_nullary());
CALL_SUBTEST(test_cuda_sum_reductions());
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index 03ef12e63..9189a609b 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -123,7 +123,7 @@ static void test_concatenation_as_lvalue()
}
-void test_cxx11_tensor_concatenation()
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation)
{
CALL_SUBTEST(test_dimension_failures<ColMajor>());
CALL_SUBTEST(test_dimension_failures<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
index e3023a368..765991b35 100644
--- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -173,7 +173,7 @@ template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(
test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_concatenation_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorConcat_perDevice<float>(device));
}
diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp
index ad9c9da39..9d806ee3c 100644
--- a/unsupported/test/cxx11_tensor_const.cpp
+++ b/unsupported/test/cxx11_tensor_const.cpp
@@ -55,7 +55,7 @@ static void test_assign_of_const_tensor()
}
-void test_cxx11_tensor_const()
+EIGEN_DECLARE_TEST(cxx11_tensor_const)
{
CALL_SUBTEST(test_simple_assign());
CALL_SUBTEST(test_assign_of_const_tensor());
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu
index 3621e2aa6..575bdc1f9 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_gpu.cu
@@ -10,19 +10,20 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
-void test_cuda_contraction(int m_size, int k_size, int n_size)
+void test_gpu_contraction(int m_size, int k_size, int n_size)
{
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
@@ -45,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
float* d_t_right;
float* d_t_result;
- cudaMalloc((void**)(&d_t_left), t_left_bytes);
- cudaMalloc((void**)(&d_t_right), t_right_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_left), t_left_bytes);
+ gpuMalloc((void**)(&d_t_right), t_right_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -66,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
- cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue;
@@ -79,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
assert(false);
}
- cudaFree((void*)d_t_left);
- cudaFree((void*)d_t_right);
- cudaFree((void*)d_t_result);
+ gpuFree((void*)d_t_left);
+ gpuFree((void*)d_t_right);
+ gpuFree((void*)d_t_result);
}
@@ -109,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size)
float* d_t_right;
float* d_t_result;
- cudaMalloc((void**)(&d_t_left), t_left_bytes);
- cudaMalloc((void**)(&d_t_right), t_right_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_left), t_left_bytes);
+ gpuMalloc((void**)(&d_t_right), t_right_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -129,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
- cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
std::cout << "mismatch detected: " << t_result()
@@ -137,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size)
assert(false);
}
- cudaFree((void*)d_t_left);
- cudaFree((void*)d_t_right);
- cudaFree((void*)d_t_result);
+ gpuFree((void*)d_t_left);
+ gpuFree((void*)d_t_right);
+ gpuFree((void*)d_t_result);
}
template<int DataLayout>
-void test_cuda_contraction_m() {
+void test_gpu_contraction_m() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(k, 128, 128);
- test_cuda_contraction<RowMajor>(k, 128, 128);
+ test_gpu_contraction<ColMajor>(k, 128, 128);
+ test_gpu_contraction<RowMajor>(k, 128, 128);
}
}
template<int DataLayout>
-void test_cuda_contraction_k() {
+void test_gpu_contraction_k() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(128, k, 128);
- test_cuda_contraction<RowMajor>(128, k, 128);
+ test_gpu_contraction<ColMajor>(128, k, 128);
+ test_gpu_contraction<RowMajor>(128, k, 128);
}
}
template<int DataLayout>
-void test_cuda_contraction_n() {
+void test_gpu_contraction_n() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(128, 128, k);
- test_cuda_contraction<RowMajor>(128, 128, k);
+ test_gpu_contraction<ColMajor>(128, 128, k);
+ test_gpu_contraction<RowMajor>(128, 128, k);
}
}
template<int DataLayout>
-void test_cuda_contraction_sizes() {
+void test_gpu_contraction_sizes() {
int m_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257 , 511,
512, 513, 1023, 1024, 1025};
@@ -186,29 +187,32 @@ void test_cuda_contraction_sizes() {
for (int i = 0; i < 15; i++) {
for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) {
- test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+ test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
}
}
}
}
-void test_cxx11_tensor_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)
{
- CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
- CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
- CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
- CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+ CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
- CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
- CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+ CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
+ CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
- CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
- CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+ CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
+ CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
- CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
- CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// disable these subtests for HIP
+ CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
+ CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
+#endif
}
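
This file now includes TensorGpuHipCudaDefines.h and calls the gpu*-prefixed API instead of cuda*, so the same test source builds against both CUDA and HIP. Roughly, the header provides mappings of the following shape (an illustrative sketch, not the header's full contents):

    #if defined(EIGEN_USE_HIP)
      #define gpuMalloc              hipMalloc
      #define gpuMemcpy              hipMemcpy
      #define gpuFree                hipFree
      #define gpuSuccess             hipSuccess
      #define gpuMemcpyHostToDevice  hipMemcpyHostToDevice
      #define gpuMemcpyDeviceToHost  hipMemcpyDeviceToHost
    #else  // CUDA
      #define gpuMalloc              cudaMalloc
      #define gpuMemcpy              cudaMemcpy
      #define gpuFree                cudaFree
      #define gpuSuccess             cudaSuccess
      #define gpuMemcpyHostToDevice  cudaMemcpyHostToDevice
      #define gpuMemcpyDeviceToHost  cudaMemcpyDeviceToHost
    #endif
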
diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp
index 5bace66c5..c8e86e69f 100644
--- a/unsupported/test/cxx11_tensor_contract_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -283,7 +283,7 @@ template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s
}
-void test_cxx11_tensor_contract_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorContractionPerDevice(device));
}
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index ace97057f..d4cfbd0da 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -510,7 +510,56 @@ static void test_const_inputs()
VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
}
-void test_cxx11_tensor_contraction()
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+ template <typename Index, typename Scalar>
+ EIGEN_ALWAYS_INLINE void operator()(
+ const OutputKernel::OutputMapper<Index, Scalar>& output_mapper,
+ const TensorContractionParams&, Index, Index, Index num_rows,
+ Index num_cols) const {
+ for (int i = 0; i < num_rows; ++i) {
+ for (int j = 0; j < num_cols; ++j) {
+ output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+ }
+ }
+ }
+};
+
+template <int DataLayout>
+static void test_large_contraction_with_output_kernel() {
+ Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+ Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+ Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 1500, 248);
+ MapXf m_right(t_right.data(), 248, 1400);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+ // compute results by separate methods
+ t_result = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+ m_result = m_left * m_right;
+
+ for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
{
CALL_SUBTEST(test_evals<ColMajor>());
CALL_SUBTEST(test_evals<RowMajor>());
@@ -542,4 +591,6 @@ void test_cxx11_tensor_contraction()
CALL_SUBTEST(test_tensor_product<RowMajor>());
CALL_SUBTEST(test_const_inputs<ColMajor>());
CALL_SUBTEST(test_const_inputs<RowMajor>());
+ CALL_SUBTEST(test_large_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST(test_large_contraction_with_output_kernel<RowMajor>());
}
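
test_large_contraction_with_output_kernel leans on the fact that contracting the rank-4 and rank-5 tensors over the index pairs (2,0) and (3,1) collapses the 8x31 contraction axes into a single inner dimension, so the result equals one flat matrix product. With the dimensions from the test (MapXf is the test's Map typedef):

    // 30*50 = 1500 rows, 8*31 = 248 inner dimension, 7*20*10 = 1400 columns.
    MapXf m_left(t_left.data(), 1500, 248);
    MapXf m_right(t_right.data(), 248, 1400);
    // Expected: t_result.data()[i] == sqrt((m_left * m_right).data()[i]),
    // since SqrtOutputKernel applies sqrt to every finished output block.
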
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
index e3d4675eb..01bc77bc1 100644
--- a/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -136,7 +136,7 @@ static void test_strides() {
input(12)*kernel(2)));
}
-void test_cxx11_tensor_convolution()
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution)
{
CALL_SUBTEST(test_evals<ColMajor>());
CALL_SUBTEST(test_evals<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
index a4226a63a..3954c8a28 100644
--- a/unsupported/test/cxx11_tensor_convolution_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -462,7 +462,7 @@ template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s
test_strides<float, RowMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_convolution_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorConvolutionPerDevice(device));
}
diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp
index 4528cc176..b5dbc97bd 100644
--- a/unsupported/test/cxx11_tensor_custom_index.cpp
+++ b/unsupported/test/cxx11_tensor_custom_index.cpp
@@ -88,7 +88,7 @@ static void test_sizes_as_index()
}
-void test_cxx11_tensor_custom_index() {
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_index) {
test_map_as_index<ColMajor>();
test_map_as_index<RowMajor>();
test_matrix_as_index<ColMajor>();
diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp
index 8baa477cc..875ea57d2 100644
--- a/unsupported/test/cxx11_tensor_custom_op.cpp
+++ b/unsupported/test/cxx11_tensor_custom_op.cpp
@@ -104,7 +104,7 @@ static void test_custom_binary_op()
}
-void test_cxx11_tensor_custom_op()
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op)
{
CALL_SUBTEST(test_custom_unary_op());
CALL_SUBTEST(test_custom_binary_op());
diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
index 9ff287fff..cc3b02448 100644
--- a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -158,7 +158,7 @@ template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev
test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_custom_op_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(custom_op_perDevice<float>(device));
}
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index 7c14bc187..c9f78d2d3 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -9,13 +9,14 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
using Eigen::RowMajor;
@@ -66,22 +67,22 @@ struct CPUContext {
// Context for evaluation on GPU
struct GPUContext {
GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
- assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
+ assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
float kernel_1d_val[] = {3.14f, 2.7f};
- assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+ assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
- assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
+ assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
- assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+ assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
- assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
+ assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
- assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+ assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
}
~GPUContext() {
- assert(cudaFree(kernel_1d_) == cudaSuccess);
- assert(cudaFree(kernel_2d_) == cudaSuccess);
- assert(cudaFree(kernel_3d_) == cudaSuccess);
+ assert(gpuFree(kernel_1d_) == gpuSuccess);
+ assert(gpuFree(kernel_2d_) == gpuSuccess);
+ assert(gpuFree(kernel_3d_) == gpuSuccess);
}
const Eigen::GpuDevice& device() const { return gpu_device_; }
@@ -102,7 +103,7 @@ struct GPUContext {
float* kernel_2d_;
float* kernel_3d_;
- Eigen::CudaStreamDevice stream_;
+ Eigen::GpuStreamDevice stream_;
Eigen::GpuDevice gpu_device_;
};
@@ -281,12 +282,12 @@ void test_gpu() {
float* d_in1;
float* d_in2;
float* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_in2), in2_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_in2), in2_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
@@ -294,7 +295,7 @@ void test_gpu() {
GPUContext context(gpu_in1, gpu_in2, gpu_out);
test_contextual_eval(&context);
- assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+ assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
@@ -304,7 +305,7 @@ void test_gpu() {
}
test_forced_contextual_eval(&context);
- assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+ assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
@@ -314,7 +315,7 @@ void test_gpu() {
}
test_compound_assignment(&context);
- assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+ assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
@@ -324,7 +325,7 @@ void test_gpu() {
}
test_contraction(&context);
- assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+ assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 40; ++j) {
const float result = out(i,j,0);
@@ -339,8 +340,8 @@ void test_gpu() {
}
test_1d_convolution(&context);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 70; ++k) {
@@ -350,8 +351,8 @@ void test_gpu() {
}
test_2d_convolution(&context);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
@@ -363,9 +364,13 @@ void test_gpu() {
}
}
+#if !defined(EIGEN_USE_HIP)
+// disable this test on the HIP platform
+// 3D tensor convolutions seem to hang on the HIP platform
+
test_3d_convolution(&context);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 39; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
@@ -378,10 +383,13 @@ void test_gpu() {
}
}
}
+
+#endif
+
}
-void test_cxx11_tensor_device()
+EIGEN_DECLARE_TEST(cxx11_tensor_device)
{
CALL_SUBTEST_1(test_cpu());
CALL_SUBTEST_2(test_gpu());
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 3ecc68df0..5095cb078 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -70,7 +70,7 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
//test_device_exceptions<DataType, ColMajor>(sycl_device);
}
-void test_cxx11_tensor_device_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_device_test_per_device<float>(device));
}
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 16f168ed4..10364d4b4 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -60,7 +60,7 @@ static void test_rank_zero()
VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
}
-void test_cxx11_tensor_dimension()
+EIGEN_DECLARE_TEST(cxx11_tensor_dimension)
{
CALL_SUBTEST(test_dynamic_size());
CALL_SUBTEST(test_fixed_size());
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
index d7eea42d7..fd889c46c 100644
--- a/unsupported/test/cxx11_tensor_empty.cpp
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -33,7 +33,7 @@ static void test_empty_fixed_size_tensor()
}
-void test_cxx11_tensor_empty()
+EIGEN_DECLARE_TEST(cxx11_tensor_empty)
{
CALL_SUBTEST(test_empty_tensor());
CALL_SUBTEST(test_empty_fixed_size_tensor());
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
new file mode 100644
index 000000000..274f901ce
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -0,0 +1,87 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+
+// A set of tests to verify that different TensorExecutor strategies yield the
+// same results for all ops that support tiled execution.
+
+template <typename Device, bool Vectorizable, bool Tileable, int Layout>
+static void test_execute_binary_expr(Device d) {
+ // Pick a large enough tensor size to bypass small tensor block evaluation
+ // optimization.
+ int d0 = internal::random<int>(100, 200);
+ int d1 = internal::random<int>(100, 200);
+ int d2 = internal::random<int>(100, 200);
+
+ static constexpr int Options = 0;
+ using IndexType = int;
+
+ Tensor<float, 3, Options, IndexType> lhs(d0, d1, d2);
+ Tensor<float, 3, Options, IndexType> rhs(d0, d1, d2);
+ Tensor<float, 3, Options, IndexType> dst(d0, d1, d2);
+
+ lhs.setRandom();
+ rhs.setRandom();
+
+ const auto expr = lhs + rhs;
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (int i = 0; i < d0; ++i) {
+ for (int j = 0; j < d1; ++j) {
+ for (int k = 0; k < d2; ++k) {
+ float sum = lhs(i, j, k) + rhs(i, j, k);
+ VERIFY_IS_EQUAL(sum, dst(i, j, k));
+ }
+ }
+ }
+}
+
+#define CALL_SUBTEST_COMBINATIONS(NAME) \
+ CALL_SUBTEST((NAME<DefaultDevice, false, false, ColMajor>(default_device))); \
+ CALL_SUBTEST((NAME<DefaultDevice, false, true, ColMajor>(default_device))); \
+ CALL_SUBTEST((NAME<DefaultDevice, true, false, ColMajor>(default_device))); \
+ CALL_SUBTEST((NAME<DefaultDevice, true, true, ColMajor>(default_device))); \
+ CALL_SUBTEST((NAME<DefaultDevice, false, false, RowMajor>(default_device))); \
+ CALL_SUBTEST((NAME<DefaultDevice, false, true, RowMajor>(default_device))); \
+ CALL_SUBTEST((NAME<DefaultDevice, true, false, RowMajor>(default_device))); \
+ CALL_SUBTEST((NAME<DefaultDevice, true, true, RowMajor>(default_device))); \
+ CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, ColMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, ColMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, ColMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, ColMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, RowMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, RowMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, RowMajor>(tp_device))); \
+ CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, RowMajor>(tp_device)))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
+ Eigen::DefaultDevice default_device;
+
+ const auto num_threads = internal::random<int>(1, 24);
+ Eigen::ThreadPool tp(num_threads);
+ Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
+
+ CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr);
+}
+
+#undef CALL_SUBTEST_COMBINATIONS
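
CALL_SUBTEST_COMBINATIONS crosses each test over {DefaultDevice, ThreadPoolDevice} x {vectorized, scalar} x {tiled, linear} x {ColMajor, RowMajor}, 16 instantiations in total. Extending the file only requires another function with the same template signature; a hypothetical example (test_execute_unary_expr is not part of the patch):

    template <typename Device, bool Vectorizable, bool Tileable, int Layout>
    static void test_execute_unary_expr(Device d) {
      Tensor<float, 3, 0, int> src(50, 50, 50);
      Tensor<float, 3, 0, int> dst(50, 50, 50);
      src.setRandom();

      const auto expr = src.square();
      using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
      using Executor =
          internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
      Executor::run(Assign(dst, expr), d);

      for (int i = 0; i < dst.size(); ++i) {
        VERIFY_IS_APPROX(dst.data()[i], src.data()[i] * src.data()[i]);
      }
    }
    // ...then add CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr); to the
    // EIGEN_DECLARE_TEST body.
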
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 129b4e659..30924b6b6 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -340,13 +340,33 @@ void test_minmax_nan_propagation_templ() {
}
}
+static void test_clip()
+{
+ Tensor<float, 1> vec(6);
+ vec(0) = 4.0;
+ vec(1) = 8.0;
+ vec(2) = 15.0;
+ vec(3) = 16.0;
+ vec(4) = 23.0;
+ vec(5) = 42.0;
+
+ float kMin = 20;
+ float kMax = 30;
+
+ Tensor<float, 1> vec_clipped(6);
+ vec_clipped = vec.clip(kMin, kMax);
+ for (int i = 0; i < 6; ++i) {
+ VERIFY_IS_EQUAL(vec_clipped(i), numext::mini(numext::maxi(vec(i), kMin), kMax));
+ }
+}
+
static void test_minmax_nan_propagation()
{
test_minmax_nan_propagation_templ<float>();
test_minmax_nan_propagation_templ<double>();
}
-void test_cxx11_tensor_expr()
+EIGEN_DECLARE_TEST(cxx11_tensor_expr)
{
CALL_SUBTEST(test_1d());
CALL_SUBTEST(test_2d());
@@ -356,5 +376,6 @@ void test_cxx11_tensor_expr()
CALL_SUBTEST(test_functors());
CALL_SUBTEST(test_type_casting());
CALL_SUBTEST(test_select());
+ CALL_SUBTEST(test_clip());
CALL_SUBTEST(test_minmax_nan_propagation());
}
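
test_clip pins down the semantics of the new clip op: each coefficient is clamped into [kMin, kMax], exactly numext::mini(numext::maxi(x, kMin), kMax) per coefficient. A short usage sketch:

    Tensor<float, 1> v(3);
    v.setValues({10.f, 25.f, 40.f});
    Tensor<float, 1> c = v.clip(20.f, 30.f);
    // c(i) == numext::mini(numext::maxi(v(i), 20.f), 30.f)  ->  {20, 25, 30}
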
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
index a55369477..4e4c9c4ec 100644
--- a/unsupported/test/cxx11_tensor_fft.cpp
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -228,10 +228,13 @@ template <typename RealScalar>
static void test_fft_non_power_of_2_round_trip(int exponent) {
int n = (1 << exponent) + 1;
- Eigen::DSizes<long, 1> dimensions;
+ // The dimension type needs to be at least 8 bytes long for the
+ // Tensor constructor to work. On Windows, long is only 4 bytes long,
+ // so use std::int64_t here to force the use of an 8-byte integer type.
+ Eigen::DSizes<std::int64_t, 1> dimensions;
dimensions[0] = n;
- const DSizes<long, 1> arr = dimensions;
- Tensor<RealScalar, 1, ColMajor, long> input;
+ const DSizes<std::int64_t, 1> arr = dimensions;
+ Tensor<RealScalar, 1, ColMajor, std::int64_t> input;
input.resize(arr);
input.setRandom();
@@ -242,7 +245,7 @@ static void test_fft_non_power_of_2_round_trip(int exponent) {
Tensor<std::complex<RealScalar>, 1, ColMajor> forward =
input.template fft<BothParts, FFT_FORWARD>(fft);
- Tensor<RealScalar, 1, ColMajor, long> output =
+ Tensor<RealScalar, 1, ColMajor, std::int64_t> output =
forward.template fft<RealPart, FFT_REVERSE>(fft);
for (int i = 0; i < n; ++i) {
@@ -250,7 +253,7 @@ static void test_fft_non_power_of_2_round_trip(int exponent) {
}
}
-void test_cxx11_tensor_fft() {
+EIGEN_DECLARE_TEST(cxx11_tensor_fft) {
test_fft_complex_input_golden();
test_fft_real_input_golden();
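
The std::int64_t change matters because Windows is LLP64: long stays 32 bits there, while LP64 platforms (Linux, macOS) make it 64 bits, so a fixed-width type is the portable way to guarantee an 8-byte tensor index. A short sketch of the pattern, assuming float data:

    #include <cstdint>
    static_assert(sizeof(std::int64_t) == 8, "tensor index type must be 8 bytes");
    Eigen::DSizes<std::int64_t, 1> dims;
    dims[0] = (1 << exponent) + 1;
    Tensor<float, 1, Eigen::ColMajor, std::int64_t> input;
    input.resize(dims);
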
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index e6274f8eb..456ce6bea 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -250,7 +250,7 @@ static void test_array()
}
}
-void test_cxx11_tensor_fixed_size()
+EIGEN_DECLARE_TEST(cxx11_tensor_fixed_size)
{
CALL_SUBTEST(test_0d());
CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index 45d7345e9..f76e2ea97 100644
--- a/unsupported/test/cxx11_tensor_forced_eval.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -72,7 +72,7 @@ static void test_const()
}
-void test_cxx11_tensor_forced_eval()
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval)
{
CALL_SUBTEST(test_simple());
CALL_SUBTEST(test_const());
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index a21514d56..74d38a644 100644
--- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -69,7 +69,7 @@ template <typename DataType, typename Dev_selector> void tensorForced_evalperDev
test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_forced_eval_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
}
diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp
index dcb928714..ee5e29b77 100644
--- a/unsupported/test/cxx11_tensor_generator.cpp
+++ b/unsupported/test/cxx11_tensor_generator.cpp
@@ -80,7 +80,7 @@ static void test_gaussian()
}
-void test_cxx11_tensor_generator()
+EIGEN_DECLARE_TEST(cxx11_tensor_generator)
{
CALL_SUBTEST(test_1D<ColMajor>());
CALL_SUBTEST(test_1D<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp
index f551c8d0c..fb6e3d9d0 100644
--- a/unsupported/test/cxx11_tensor_generator_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_generator_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
static const float error_threshold =1e-8f;
@@ -139,7 +139,7 @@ template<typename DataType, typename dev_Selector> void sycl_generator_test_per_
test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_generator_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_generator_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_generator_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 9584a539f..14fc0bd04 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -9,15 +9,17 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
using Eigen::Tensor;
-void test_cuda_nullary() {
+void test_gpu_nullary() {
Tensor<float, 1, 0, int> in1(2);
Tensor<float, 1, 0, int> in2(2);
in1.setRandom();
@@ -27,12 +29,12 @@ void test_cuda_nullary() {
float* d_in1;
float* d_in2;
- cudaMalloc((void**)(&d_in1), tensor_bytes);
- cudaMalloc((void**)(&d_in2), tensor_bytes);
- cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+ gpuMalloc((void**)(&d_in1), tensor_bytes);
+ gpuMalloc((void**)(&d_in2), tensor_bytes);
+ gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -46,23 +48,23 @@ void test_cuda_nullary() {
Tensor<float, 1, 0, int> new1(2);
Tensor<float, 1, 0, int> new2(2);
- assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
- gpu_device.stream()) == cudaSuccess);
- assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
- gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(new1(i), 3.14f);
VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
}
- cudaFree(d_in1);
- cudaFree(d_in2);
+ gpuFree(d_in1);
+ gpuFree(d_in2);
}
-void test_cuda_elementwise_small() {
+void test_gpu_elementwise_small() {
Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -76,14 +78,14 @@ void test_cuda_elementwise_small() {
float* d_in1;
float* d_in2;
float* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_in2), in2_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_in2), in2_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
@@ -95,9 +97,9 @@ void test_cuda_elementwise_small() {
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
- gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(
@@ -105,12 +107,12 @@ void test_cuda_elementwise_small() {
in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
}
- cudaFree(d_in1);
- cudaFree(d_in2);
- cudaFree(d_out);
+ gpuFree(d_in1);
+ gpuFree(d_in2);
+ gpuFree(d_out);
}
-void test_cuda_elementwise()
+void test_gpu_elementwise()
{
Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
@@ -129,16 +131,16 @@ void test_cuda_elementwise()
float* d_in2;
float* d_in3;
float* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_in2), in2_bytes);
- cudaMalloc((void**)(&d_in3), in3_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_in2), in2_bytes);
+ gpuMalloc((void**)(&d_in3), in3_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
@@ -148,8 +150,8 @@ void test_cuda_elementwise()
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) {
@@ -159,13 +161,13 @@ void test_cuda_elementwise()
}
}
- cudaFree(d_in1);
- cudaFree(d_in2);
- cudaFree(d_in3);
- cudaFree(d_out);
+ gpuFree(d_in1);
+ gpuFree(d_in2);
+ gpuFree(d_in3);
+ gpuFree(d_out);
}
-void test_cuda_props() {
+void test_gpu_props() {
Tensor<float, 1> in1(200);
Tensor<bool, 1> out(200);
in1.setRandom();
@@ -175,12 +177,12 @@ void test_cuda_props() {
float* d_in1;
bool* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
@@ -190,19 +192,19 @@ void test_cuda_props() {
gpu_out.device(gpu_device) = (gpu_in1.isnan)();
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
- gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 200; ++i) {
VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
}
- cudaFree(d_in1);
- cudaFree(d_out);
+ gpuFree(d_in1);
+ gpuFree(d_out);
}
-void test_cuda_reduction()
+void test_gpu_reduction()
{
Tensor<float, 4> in1(72,53,97,113);
Tensor<float, 2> out(72,97);
@@ -213,12 +215,12 @@ void test_cuda_reduction()
float* d_in1;
float* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
@@ -230,8 +232,8 @@ void test_cuda_reduction()
gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
@@ -246,12 +248,12 @@ void test_cuda_reduction()
}
}
- cudaFree(d_in1);
- cudaFree(d_out);
+ gpuFree(d_in1);
+ gpuFree(d_out);
}
template<int DataLayout>
-void test_cuda_contraction()
+void test_gpu_contraction()
{
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
@@ -271,14 +273,14 @@ void test_cuda_contraction()
float* d_t_right;
float* d_t_result;
- cudaMalloc((void**)(&d_t_left), t_left_bytes);
- cudaMalloc((void**)(&d_t_right), t_right_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_left), t_left_bytes);
+ gpuMalloc((void**)(&d_t_right), t_right_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
@@ -298,7 +300,7 @@ void test_cuda_contraction()
m_result = m_left * m_right;
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
- cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
@@ -307,13 +309,13 @@ void test_cuda_contraction()
}
}
- cudaFree(d_t_left);
- cudaFree(d_t_right);
- cudaFree(d_t_result);
+ gpuFree(d_t_left);
+ gpuFree(d_t_right);
+ gpuFree(d_t_result);
}
template<int DataLayout>
-void test_cuda_convolution_1d()
+void test_gpu_convolution_1d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 1, DataLayout> kernel(4);
@@ -328,14 +330,14 @@ void test_cuda_convolution_1d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
@@ -345,8 +347,8 @@ void test_cuda_convolution_1d()
Eigen::array<Eigen::DenseIndex, 1> dims(1);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 34; ++j) {
@@ -361,12 +363,12 @@ void test_cuda_convolution_1d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
-void test_cuda_convolution_inner_dim_col_major_1d()
+void test_gpu_convolution_inner_dim_col_major_1d()
{
Tensor<float, 4, ColMajor> input(74,9,11,7);
Tensor<float, 1, ColMajor> kernel(4);
@@ -381,14 +383,14 @@ void test_cuda_convolution_inner_dim_col_major_1d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
@@ -398,8 +400,8 @@ void test_cuda_convolution_inner_dim_col_major_1d()
Eigen::array<Eigen::DenseIndex, 1> dims(0);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 71; ++i) {
for (int j = 0; j < 9; ++j) {
@@ -414,12 +416,12 @@ void test_cuda_convolution_inner_dim_col_major_1d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
-void test_cuda_convolution_inner_dim_row_major_1d()
+void test_gpu_convolution_inner_dim_row_major_1d()
{
Tensor<float, 4, RowMajor> input(7,9,11,74);
Tensor<float, 1, RowMajor> kernel(4);
@@ -434,14 +436,14 @@ void test_cuda_convolution_inner_dim_row_major_1d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
@@ -451,8 +453,8 @@ void test_cuda_convolution_inner_dim_row_major_1d()
Eigen::array<Eigen::DenseIndex, 1> dims(3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 7; ++i) {
for (int j = 0; j < 9; ++j) {
@@ -467,13 +469,13 @@ void test_cuda_convolution_inner_dim_row_major_1d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
template<int DataLayout>
-void test_cuda_convolution_2d()
+void test_gpu_convolution_2d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 2, DataLayout> kernel(3,4);
@@ -488,14 +490,14 @@ void test_cuda_convolution_2d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
@@ -505,8 +507,8 @@ void test_cuda_convolution_2d()
Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 35; ++j) {
@@ -531,13 +533,13 @@ void test_cuda_convolution_2d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
template<int DataLayout>
-void test_cuda_convolution_3d()
+void test_gpu_convolution_3d()
{
Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
Tensor<float, 3, DataLayout> kernel(3,4,2);
@@ -552,14 +554,14 @@ void test_cuda_convolution_3d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
@@ -569,8 +571,8 @@ void test_cuda_convolution_3d()
Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 35; ++j) {
@@ -609,14 +611,14 @@ void test_cuda_convolution_3d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_lgamma(const Scalar stddev)
+void test_gpu_lgamma(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
@@ -628,12 +630,12 @@ void test_cuda_lgamma(const Scalar stddev)
Scalar* d_in;
Scalar* d_out;
- cudaMalloc((void**)(&d_in), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -641,8 +643,8 @@ void test_cuda_lgamma(const Scalar stddev)
gpu_out.device(gpu_device) = gpu_in.lgamma();
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
@@ -650,12 +652,12 @@ void test_cuda_lgamma(const Scalar stddev)
}
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_digamma()
+void test_gpu_digamma()
{
Tensor<Scalar, 1> in(7);
Tensor<Scalar, 1> out(7);
@@ -682,12 +684,12 @@ void test_cuda_digamma()
Scalar* d_in;
Scalar* d_out;
- cudaMalloc((void**)(&d_in), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
@@ -695,8 +697,8 @@ void test_cuda_digamma()
gpu_out.device(gpu_device) = gpu_in.digamma();
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 5; ++i) {
VERIFY_IS_APPROX(out(i), expected_out(i));
@@ -705,12 +707,12 @@ void test_cuda_digamma()
VERIFY_IS_EQUAL(out(i), expected_out(i));
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_zeta()
+void test_gpu_zeta()
{
Tensor<Scalar, 1> in_x(6);
Tensor<Scalar, 1> in_q(6);
@@ -744,14 +746,14 @@ void test_cuda_zeta()
Scalar* d_in_x;
Scalar* d_in_q;
Scalar* d_out;
- cudaMalloc((void**)(&d_in_x), bytes);
- cudaMalloc((void**)(&d_in_q), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in_x), bytes);
+ gpuMalloc((void**)(&d_in_q), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
@@ -760,8 +762,8 @@ void test_cuda_zeta()
gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(out(0), expected_out(0));
VERIFY((std::isnan)(out(3)));
@@ -772,13 +774,13 @@ void test_cuda_zeta()
}
}
- cudaFree(d_in_x);
- cudaFree(d_in_q);
- cudaFree(d_out);
+ gpuFree(d_in_x);
+ gpuFree(d_in_q);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_polygamma()
+void test_gpu_polygamma()
{
Tensor<Scalar, 1> in_x(7);
Tensor<Scalar, 1> in_n(7);
@@ -815,14 +817,14 @@ void test_cuda_polygamma()
Scalar* d_in_x;
Scalar* d_in_n;
Scalar* d_out;
- cudaMalloc((void**)(&d_in_x), bytes);
- cudaMalloc((void**)(&d_in_n), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in_x), bytes);
+ gpuMalloc((void**)(&d_in_n), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
@@ -831,20 +833,20 @@ void test_cuda_polygamma()
gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 7; ++i) {
VERIFY_IS_APPROX(out(i), expected_out(i));
}
- cudaFree(d_in_x);
- cudaFree(d_in_n);
- cudaFree(d_out);
+ gpuFree(d_in_x);
+ gpuFree(d_in_n);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_igamma()
+void test_gpu_igamma()
{
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
@@ -880,14 +882,14 @@ void test_cuda_igamma()
Scalar* d_a;
Scalar* d_x;
Scalar* d_out;
- assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
- assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
- assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+ assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
+ assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
+ assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
- cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
@@ -896,8 +898,8 @@ void test_cuda_igamma()
gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
@@ -909,13 +911,13 @@ void test_cuda_igamma()
}
}
- cudaFree(d_a);
- cudaFree(d_x);
- cudaFree(d_out);
+ gpuFree(d_a);
+ gpuFree(d_x);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_igammac()
+void test_gpu_igammac()
{
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
@@ -950,14 +952,14 @@ void test_cuda_igammac()
Scalar* d_a;
Scalar* d_x;
Scalar* d_out;
- cudaMalloc((void**)(&d_a), bytes);
- cudaMalloc((void**)(&d_x), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_a), bytes);
+ gpuMalloc((void**)(&d_x), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
@@ -966,8 +968,8 @@ void test_cuda_igammac()
gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
@@ -979,13 +981,13 @@ void test_cuda_igammac()
}
}
- cudaFree(d_a);
- cudaFree(d_x);
- cudaFree(d_out);
+ gpuFree(d_a);
+ gpuFree(d_x);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_erf(const Scalar stddev)
+void test_gpu_erf(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
@@ -997,12 +999,12 @@ void test_cuda_erf(const Scalar stddev)
Scalar* d_in;
Scalar* d_out;
- assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
- assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+ assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
+ assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
- cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1010,8 +1012,8 @@ void test_cuda_erf(const Scalar stddev)
gpu_out.device(gpu_device) = gpu_in.erf();
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
@@ -1019,12 +1021,12 @@ void test_cuda_erf(const Scalar stddev)
}
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_erfc(const Scalar stddev)
+void test_gpu_erfc(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
@@ -1036,12 +1038,12 @@ void test_cuda_erfc(const Scalar stddev)
Scalar* d_in;
Scalar* d_out;
- cudaMalloc((void**)(&d_in), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1049,8 +1051,8 @@ void test_cuda_erfc(const Scalar stddev)
gpu_out.device(gpu_device) = gpu_in.erfc();
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
@@ -1058,12 +1060,12 @@ void test_cuda_erfc(const Scalar stddev)
}
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_betainc()
+void test_gpu_betainc()
{
Tensor<Scalar, 1> in_x(125);
Tensor<Scalar, 1> in_a(125);
@@ -1172,16 +1174,16 @@ void test_cuda_betainc()
Scalar* d_in_a;
Scalar* d_in_b;
Scalar* d_out;
- cudaMalloc((void**)(&d_in_x), bytes);
- cudaMalloc((void**)(&d_in_a), bytes);
- cudaMalloc((void**)(&d_in_b), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in_x), bytes);
+ gpuMalloc((void**)(&d_in_a), bytes);
+ gpuMalloc((void**)(&d_in_b), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
@@ -1191,8 +1193,8 @@ void test_cuda_betainc()
gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 1; i < 125; ++i) {
if ((std::isnan)(expected_out(i))) {
@@ -1202,83 +1204,370 @@ void test_cuda_betainc()
}
}
- cudaFree(d_in_x);
- cudaFree(d_in_a);
- cudaFree(d_in_b);
- cudaFree(d_out);
+ gpuFree(d_in_x);
+ gpuFree(d_in_a);
+ gpuFree(d_in_b);
+ gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i0e()
+{
+ Tensor<Scalar, 1> in_x(21);
+ Tensor<Scalar, 1> out(21);
+ Tensor<Scalar, 1> expected_out(21);
+ out.setZero();
+
+ Array<Scalar, 1, Dynamic> in_x_array(21);
+ Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+ in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+ -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
+ 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+ 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+ 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+ 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+ 0.0897803118848;
+
+ for (int i = 0; i < 21; ++i) {
+ in_x(i) = in_x_array(i);
+ expected_out(i) = expected_out_array(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+ gpu_out.device(gpu_device) = gpu_in.i0e();
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (int i = 0; i < 21; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ gpuFree(d_in);
+ gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i1e()
+{
+ Tensor<Scalar, 1> in_x(21);
+ Tensor<Scalar, 1> out(21);
+ Tensor<Scalar, 1> expected_out(21);
+ out.setZero();
+
+ Array<Scalar, 1, Dynamic> in_x_array(21);
+ Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+ in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+ -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
+ -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+ -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+ 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+ 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+ 0.0875062221833;
+
+ for (int i = 0; i < 21; ++i) {
+ in_x(i) = in_x_array(i);
+ expected_out(i) = expected_out_array(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+ gpu_out.device(gpu_device) = gpu_in.i1e();
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (int i = 0; i < 21; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ gpuFree(d_in);
+ gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_igamma_der_a()
+{
+ Tensor<Scalar, 1> in_x(30);
+ Tensor<Scalar, 1> in_a(30);
+ Tensor<Scalar, 1> out(30);
+ Tensor<Scalar, 1> expected_out(30);
+ out.setZero();
+
+ Array<Scalar, 1, Dynamic> in_a_array(30);
+ Array<Scalar, 1, Dynamic> in_x_array(30);
+ Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+ // See special_functions.cpp for the Python code that generates the test data.
+
+ in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+ 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+ in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+ 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+ 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+ 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+ 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+ 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+ 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+ expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
+ -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
+ -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
+ -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
+ -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
+ -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
+ -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
+ -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+ for (int i = 0; i < 30; ++i) {
+ in_x(i) = in_x_array(i);
+ in_a(i) = in_a_array(i);
+ expected_out(i) = expected_out_array(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_a;
+ Scalar* d_x;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_a), bytes);
+ gpuMalloc((void**)(&d_x), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+ gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (int i = 0; i < 30; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ gpuFree(d_a);
+ gpuFree(d_x);
+ gpuFree(d_out);
}
+template <typename Scalar>
+void test_gpu_gamma_sample_der_alpha()
+{
+ Tensor<Scalar, 1> in_alpha(30);
+ Tensor<Scalar, 1> in_sample(30);
+ Tensor<Scalar, 1> out(30);
+ Tensor<Scalar, 1> expected_out(30);
+ out.setZero();
+
+ Array<Scalar, 1, Dynamic> in_alpha_array(30);
+ Array<Scalar, 1, Dynamic> in_sample_array(30);
+ Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+ // See special_functions.cpp for the Python code that generates the test data.
+
+ in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
+ 100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+ in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+ 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+ 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+ 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+ 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+ 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+ 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+ expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+ 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+ 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+ 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+ 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+ 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+ 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+ 1.00106492525, 0.97734200649, 1.02198794179;
+
+ for (int i = 0; i < 30; ++i) {
+ in_alpha(i) = in_alpha_array(i);
+ in_sample(i) = in_sample_array(i);
+ expected_out(i) = expected_out_array(i);
+ }
-void test_cxx11_tensor_cuda()
+ std::size_t bytes = in_alpha.size() * sizeof(Scalar);
+
+ Scalar* d_alpha;
+ Scalar* d_sample;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_alpha), bytes);
+ gpuMalloc((void**)(&d_sample), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+ gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (int i = 0; i < 30; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ gpuFree(d_alpha);
+ gpuFree(d_sample);
+ gpuFree(d_out);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
{
- CALL_SUBTEST_1(test_cuda_nullary());
- CALL_SUBTEST_1(test_cuda_elementwise_small());
- CALL_SUBTEST_1(test_cuda_elementwise());
- CALL_SUBTEST_1(test_cuda_props());
- CALL_SUBTEST_1(test_cuda_reduction());
- CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
- CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
- CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
- CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
+ CALL_SUBTEST_1(test_gpu_nullary());
+ CALL_SUBTEST_1(test_gpu_elementwise_small());
+ CALL_SUBTEST_1(test_gpu_elementwise());
+ CALL_SUBTEST_1(test_gpu_props());
+ CALL_SUBTEST_1(test_gpu_reduction());
+ CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
+ CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
+ CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
+ CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// Disable these tests on HIP for now:
+// they hang and need to be investigated and fixed.
+ CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
+#endif
#if __cplusplus > 199711L
// std::erf, std::erfc, and so on were only added in C++11. We use them
// as a golden reference to validate the results produced by Eigen. Therefore
// we can only run these tests if we use a C++11 compiler.
- CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
- CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
- CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
- CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
-
- CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
- CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
- CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
- CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
-
- CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
- CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
- CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
- CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
-
- CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
- // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
- CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
- CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
- CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
-
- CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
- CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
- CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
- CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
-
- CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
- // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
- CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
- CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
- CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
-
- CALL_SUBTEST_5(test_cuda_digamma<float>());
- CALL_SUBTEST_5(test_cuda_digamma<double>());
-
- CALL_SUBTEST_5(test_cuda_polygamma<float>());
- CALL_SUBTEST_5(test_cuda_polygamma<double>());
-
- CALL_SUBTEST_5(test_cuda_zeta<float>());
- CALL_SUBTEST_5(test_cuda_zeta<double>());
-
- CALL_SUBTEST_5(test_cuda_igamma<float>());
- CALL_SUBTEST_5(test_cuda_igammac<float>());
-
- CALL_SUBTEST_5(test_cuda_igamma<double>());
- CALL_SUBTEST_5(test_cuda_igammac<double>());
-
- CALL_SUBTEST_6(test_cuda_betainc<float>());
- CALL_SUBTEST_6(test_cuda_betainc<double>());
+ CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
+ CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
+ CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
+ CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));
+
+ CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
+ CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
+ CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
+ CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));
+
+ CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
+ CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
+ CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
+ CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));
+
+ CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
+ // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
+ CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs
+ CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
+ CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));
+
+ CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
+ CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
+ CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
+ CALL_SUBTEST_4(test_gpu_erf<double>(0.001));
+
+ CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
+ // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
+ CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs
+ CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
+ CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));
+
+#if !defined(EIGEN_USE_HIP)
+// Disable these tests on HIP for now.
+ CALL_SUBTEST_5(test_gpu_digamma<float>());
+ CALL_SUBTEST_5(test_gpu_digamma<double>());
+
+ CALL_SUBTEST_5(test_gpu_polygamma<float>());
+ CALL_SUBTEST_5(test_gpu_polygamma<double>());
+
+ CALL_SUBTEST_5(test_gpu_zeta<float>());
+ CALL_SUBTEST_5(test_gpu_zeta<double>());
+#endif
+
+ CALL_SUBTEST_5(test_gpu_igamma<float>());
+ CALL_SUBTEST_5(test_gpu_igammac<float>());
+
+ CALL_SUBTEST_5(test_gpu_igamma<double>());
+ CALL_SUBTEST_5(test_gpu_igammac<double>());
+
+#if !defined(EIGEN_USE_HIP)
+// Disable these tests on HIP for now.
+ CALL_SUBTEST_6(test_gpu_betainc<float>());
+ CALL_SUBTEST_6(test_gpu_betainc<double>());
+
+ CALL_SUBTEST_6(test_gpu_i0e<float>());
+ CALL_SUBTEST_6(test_gpu_i0e<double>());
+
+ CALL_SUBTEST_6(test_gpu_i1e<float>());
+ CALL_SUBTEST_6(test_gpu_i1e<double>());
+
+ CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
+ CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());
+
+ CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
+ CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
+#endif
+
#endif
}
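
The file above now compiles unchanged against either backend because the test no longer names the CUDA runtime directly: every gpu-prefixed call is an alias resolved at preprocessing time. The real mapping lives in Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h (included explicitly in cxx11_tensor_random_gpu.cu further down); a minimal sketch of its shape, covering only the aliases these tests use and not quoting the actual header, is:

    // Sketch only: the alias names match the calls in the tests above;
    // the real header defines more than is shown here.
    #if defined(EIGEN_USE_HIP)
      #include <hip/hip_runtime.h>
      #define gpuMalloc             hipMalloc
      #define gpuMemcpy             hipMemcpy
      #define gpuMemcpyAsync        hipMemcpyAsync
      #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
      #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
      #define gpuStreamSynchronize  hipStreamSynchronize
      #define gpuFree               hipFree
      #define gpuSuccess            hipSuccess
    #else  // CUDA path
      #define gpuMalloc             cudaMalloc
      #define gpuMemcpy             cudaMemcpy
      #define gpuMemcpyAsync        cudaMemcpyAsync
      #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
      #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
      #define gpuStreamSynchronize  cudaStreamSynchronize
      #define gpuFree               cudaFree
      #define gpuSuccess            cudaSuccess
    #endif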
diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp
index 5fd88fa6c..c20edd9ac 100644
--- a/unsupported/test/cxx11_tensor_ifft.cpp
+++ b/unsupported/test/cxx11_tensor_ifft.cpp
@@ -131,7 +131,7 @@ static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3)
}
}
-void test_cxx11_tensor_ifft() {
+EIGEN_DECLARE_TEST(cxx11_tensor_ifft) {
CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4));
CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16));
CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32));
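
From here on, most files change in exactly one way: the hand-written entry point void test_<filename>() (plus, in the SYCL files, a matching #define EIGEN_TEST_FUNC <filename> so the harness could find it) is replaced by the EIGEN_DECLARE_TEST macro from test/main.h, which takes the test name as its argument and makes the separate define redundant. A hypothetical migrated file, using only constructs that appear in this patch, would look like:

    #include "main.h"

    static void test_addition()
    {
      VERIFY_IS_EQUAL(1 + 1, 2);  // illustrative check, not a real Eigen test
    }

    EIGEN_DECLARE_TEST(cxx11_tensor_example)  // hypothetical test name
    {
      CALL_SUBTEST(test_addition());
    }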
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
index 105d32fb4..862f1f7f0 100644
--- a/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -797,7 +797,7 @@ void test_imagenet_patches()
}
}
-void test_cxx11_tensor_image_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch)
{
CALL_SUBTEST_1(test_simple_patch());
CALL_SUBTEST_2(test_patch_no_extra_dim());
diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
index eea18ec70..c1828a0ec 100644
--- a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_image_patch_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -1084,7 +1084,7 @@ test_patch_padding_same_sycl<DataType, int64_t>(sycl_device);
test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device);
test_imagenet_patches_sycl<DataType, int64_t>(sycl_device);
}
-void test_cxx11_tensor_image_patch_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
index 4cf5df666..e81fa5e40 100644
--- a/unsupported/test/cxx11_tensor_index_list.cpp
+++ b/unsupported/test/cxx11_tensor_index_list.cpp
@@ -373,7 +373,7 @@ static void test_dim_check()
#endif
-void test_cxx11_tensor_index_list()
+EIGEN_DECLARE_TEST(cxx11_tensor_index_list)
{
#ifdef EIGEN_HAS_INDEX_LIST
CALL_SUBTEST(test_static_index_list());
diff --git a/unsupported/test/cxx11_tensor_inflation.cpp b/unsupported/test/cxx11_tensor_inflation.cpp
index 4997935e9..75089e856 100644
--- a/unsupported/test/cxx11_tensor_inflation.cpp
+++ b/unsupported/test/cxx11_tensor_inflation.cpp
@@ -74,7 +74,7 @@ static void test_simple_inflation()
}
}
-void test_cxx11_tensor_inflation()
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation)
{
CALL_SUBTEST(test_simple_inflation<ColMajor>());
CALL_SUBTEST(test_simple_inflation<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
index f2f87f7ed..521ae0cc3 100644
--- a/unsupported/test/cxx11_tensor_inflation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_inflation_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -22,10 +22,10 @@
using Eigen::Tensor;
-// Inflation Defenition for each dimention the inflated val would be
+// Inflation definition: for each dimension, the inflated size would be
// ((dim-1)*stride[dim] + 1)
-// for 1 dimnention vector of size 3 with value (4,4,4) with the inflated stride value of 3 would be changed to
+// For a 1-dimensional vector of size 3 with values (4,4,4), an inflation stride of 3 changes it to a
// tensor of size (2*3) + 1 = 7 with the values
// (4, 0, 0, 4, 0, 0, 4).
@@ -128,7 +128,7 @@ template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_
test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_inflation_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
index 8e2b70b75..d18a05ec4 100644
--- a/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -135,7 +135,7 @@ void test_specific() {
VERIFY_IS_EQUAL(result, result_op);
}
-void test_cxx11_tensor_intdiv()
+EIGEN_DECLARE_TEST(cxx11_tensor_intdiv)
{
CALL_SUBTEST_1(test_signed_32bit());
CALL_SUBTEST_2(test_unsigned_32bit());
diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
index 489960529..2c638f9bf 100644
--- a/unsupported/test/cxx11_tensor_io.cpp
+++ b/unsupported/test/cxx11_tensor_io.cpp
@@ -119,7 +119,7 @@ static void test_output_const()
}
-void test_cxx11_tensor_io()
+EIGEN_DECLARE_TEST(cxx11_tensor_io)
{
CALL_SUBTEST(test_output_0d<ColMajor>());
CALL_SUBTEST(test_output_0d<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp
index ae297a9da..efb333360 100644
--- a/unsupported/test/cxx11_tensor_layout_swap.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap.cpp
@@ -54,7 +54,7 @@ static void test_swap_as_lvalue()
}
-void test_cxx11_tensor_layout_swap()
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap)
{
CALL_SUBTEST(test_simple_swap());
CALL_SUBTEST(test_swap_as_lvalue());
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
index 9e8db8b4b..9546b911c 100644
--- a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_layout_swap_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -118,7 +118,7 @@ template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_
test_simple_swap_sycl<DataType, int64_t>(sycl_device);
test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device);
}
-void test_cxx11_tensor_layout_swap_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp
index 071f5b406..6ba9a212d 100644
--- a/unsupported/test/cxx11_tensor_lvalue.cpp
+++ b/unsupported/test/cxx11_tensor_lvalue.cpp
@@ -36,7 +36,7 @@ static void test_compound_assignment()
}
-void test_cxx11_tensor_lvalue()
+EIGEN_DECLARE_TEST(cxx11_tensor_lvalue)
{
CALL_SUBTEST(test_compound_assignment());
}
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
index 3db0ee7c0..ce608aca7 100644
--- a/unsupported/test/cxx11_tensor_map.cpp
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -265,7 +265,7 @@ static void test_casting()
VERIFY_IS_EQUAL(sum1, 861);
}
-void test_cxx11_tensor_map()
+EIGEN_DECLARE_TEST(cxx11_tensor_map)
{
CALL_SUBTEST(test_0d());
CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp
index 61c742a16..82a1a26d8 100644
--- a/unsupported/test/cxx11_tensor_math.cpp
+++ b/unsupported/test/cxx11_tensor_math.cpp
@@ -39,7 +39,7 @@ static void test_sigmoid()
}
-void test_cxx11_tensor_math()
+EIGEN_DECLARE_TEST(cxx11_tensor_math)
{
CALL_SUBTEST(test_tanh());
CALL_SUBTEST(test_sigmoid());
diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
index 4fba6fdd1..ee2616fd7 100644
--- a/unsupported/test/cxx11_tensor_mixed_indices.cpp
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -47,7 +47,7 @@ static void test_simple()
}
-void test_cxx11_tensor_mixed_indices()
+EIGEN_DECLARE_TEST(cxx11_tensor_mixed_indices)
{
CALL_SUBTEST(test_simple());
}
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index f7de43110..6365cd89a 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -459,7 +459,7 @@ static void test_composition()
}
-void test_cxx11_tensor_morphing()
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing)
{
CALL_SUBTEST_1(test_simple_reshape<void>());
CALL_SUBTEST_1(test_reshape_in_expr<void>());
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index 9b521bc6b..93dabe3ec 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -15,7 +15,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -240,7 +240,7 @@ template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_d
test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_morphing_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_move.cpp b/unsupported/test/cxx11_tensor_move.cpp
new file mode 100644
index 000000000..0ab2b7786
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_move.cpp
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Viktor Csomor <viktor.csomor@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+#include <utility>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void calc_indices(int i, int& x, int& y, int& z)
+{
+ x = i / 4;
+ y = (i % 4) / 2;
+ z = i % 2;
+}
+
+static void test_move()
+{
+ int x;
+ int y;
+ int z;
+
+ Tensor<int,3> tensor1(2, 2, 2);
+ Tensor<int,3,RowMajor> tensor2(2, 2, 2);
+
+ for (int i = 0; i < 8; i++)
+ {
+ calc_indices(i, x, y, z);
+ tensor1(x,y,z) = i;
+ tensor2(x,y,z) = 2 * i;
+ }
+
+ // Invokes the move constructor.
+ Tensor<int,3> moved_tensor1 = std::move(tensor1);
+ Tensor<int,3,RowMajor> moved_tensor2 = std::move(tensor2);
+
+ VERIFY_IS_EQUAL(tensor1.size(), 0);
+ VERIFY_IS_EQUAL(tensor2.size(), 0);
+
+ for (int i = 0; i < 8; i++)
+ {
+ calc_indices(i, x, y, z);
+ VERIFY_IS_EQUAL(moved_tensor1(x,y,z), i);
+ VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 2 * i);
+ }
+
+ Tensor<int,3> moved_tensor3(2,2,2);
+ Tensor<int,3,RowMajor> moved_tensor4(2,2,2);
+
+ moved_tensor3.setZero();
+ moved_tensor4.setZero();
+
+ // Invokes the move assignment operator.
+ moved_tensor3 = std::move(moved_tensor1);
+ moved_tensor4 = std::move(moved_tensor2);
+
+ VERIFY_IS_EQUAL(moved_tensor1.size(), 8);
+ VERIFY_IS_EQUAL(moved_tensor2.size(), 8);
+
+ for (int i = 0; i < 8; i++)
+ {
+ calc_indices(i, x, y, z);
+ VERIFY_IS_EQUAL(moved_tensor1(x,y,z), 0);
+ VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 0);
+ VERIFY_IS_EQUAL(moved_tensor3(x,y,z), i);
+ VERIFY_IS_EQUAL(moved_tensor4(x,y,z), 2 * i);
+ }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_move)
+{
+ CALL_SUBTEST(test_move());
+}
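
The new file checks two things: move construction leaves the source tensor with size 0, and move assignment, judging by the assertions (moved_tensor1 ends up with size 8 holding the zeroed contents of moved_tensor3), exchanges storage with the target rather than freeing it. A hedged usage sketch of the practical payoff, assuming the move support this file tests; none of the names below are from the patch:

    #include <utility>
    #include <vector>
    #include <Eigen/CXX11/Tensor>

    std::vector<Eigen::Tensor<int, 3> > make_batch()
    {
      std::vector<Eigen::Tensor<int, 3> > batch;
      Eigen::Tensor<int, 3> t(2, 2, 2);
      t.setZero();
      batch.push_back(std::move(t));  // buffer ownership moves; no elementwise copy
      return batch;
    }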
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index 183ef02c1..f63fee013 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -65,7 +65,7 @@ static void test_notification_multiple()
VERIFY_IS_EQUAL(counter, 4);
}
-void test_cxx11_tensor_notification()
+EIGEN_DECLARE_TEST(cxx11_tensor_notification)
{
CALL_SUBTEST(test_notification_single());
CALL_SUBTEST(test_notification_multiple());
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
index e9d1b2d3c..99e18076a 100644
--- a/unsupported/test/cxx11_tensor_of_complex.cpp
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -94,7 +94,7 @@ static void test_contractions()
}
-void test_cxx11_tensor_of_complex()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_complex)
{
CALL_SUBTEST(test_additions());
CALL_SUBTEST(test_abs());
diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp
index f179a0c21..344d678ef 100644
--- a/unsupported/test/cxx11_tensor_of_const_values.cpp
+++ b/unsupported/test/cxx11_tensor_of_const_values.cpp
@@ -97,7 +97,7 @@ static void test_plus_equal()
}
-void test_cxx11_tensor_of_const_values()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_const_values)
{
CALL_SUBTEST(test_assign());
CALL_SUBTEST(test_plus());
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index 167b75d25..4d74e6138 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -9,7 +9,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
@@ -20,8 +20,8 @@
using Eigen::Tensor;
template<typename>
-void test_cuda_numext() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_numext() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -57,11 +57,11 @@ void test_cuda_numext() {
}
-#ifdef EIGEN_HAS_CUDA_FP16
+#ifdef EIGEN_HAS_GPU_FP16
template<typename>
-void test_cuda_conversion() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -95,8 +95,8 @@ void test_cuda_conversion() {
}
template<typename>
-void test_cuda_unary() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_unary() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -132,8 +132,8 @@ void test_cuda_unary() {
}
template<typename>
-void test_cuda_elementwise() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_elementwise() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -174,8 +174,8 @@ void test_cuda_elementwise() {
}
template<typename>
-void test_cuda_trancendental() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_trancendental() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -247,7 +247,7 @@ void test_cuda_trancendental() {
}
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
- if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
+ if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
else
VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
@@ -268,8 +268,8 @@ void test_cuda_trancendental() {
}
template<typename>
-void test_cuda_contractions() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_contractions() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int rows = 23;
int cols = 23;
@@ -319,12 +319,12 @@ void test_cuda_contractions() {
}
template<typename>
-void test_cuda_reductions(int size1, int size2, int redux) {
+void test_gpu_reductions(int size1, int size2, int redux) {
std::cout << "Reducing " << size1 << " by " << size2
<< " tensor along dim " << redux << std::endl;
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = size1*size2;
int result_size = (redux == 1 ? size1 : size2);
@@ -368,20 +368,20 @@ void test_cuda_reductions(int size1, int size2, int redux) {
}
template<typename>
-void test_cuda_reductions() {
- test_cuda_reductions<void>(13, 13, 0);
- test_cuda_reductions<void>(13, 13, 1);
+void test_gpu_reductions() {
+ test_gpu_reductions<void>(13, 13, 0);
+ test_gpu_reductions<void>(13, 13, 1);
- test_cuda_reductions<void>(35, 36, 0);
- test_cuda_reductions<void>(35, 36, 1);
+ test_gpu_reductions<void>(35, 36, 0);
+ test_gpu_reductions<void>(35, 36, 1);
- test_cuda_reductions<void>(36, 35, 0);
- test_cuda_reductions<void>(36, 35, 1);
+ test_gpu_reductions<void>(36, 35, 0);
+ test_gpu_reductions<void>(36, 35, 1);
}
template<typename>
-void test_cuda_full_reductions() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_full_reductions() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int size = 13;
int num_elem = size*size;
@@ -429,9 +429,9 @@ void test_cuda_full_reductions() {
}
template<typename>
-void test_cuda_forced_evals() {
+void test_gpu_forced_evals() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -479,20 +479,20 @@ void test_cuda_forced_evals() {
#endif
-void test_cxx11_tensor_of_float16_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu)
{
- CALL_SUBTEST_1(test_cuda_numext<void>());
-
-#ifdef EIGEN_HAS_CUDA_FP16
- CALL_SUBTEST_1(test_cuda_conversion<void>());
- CALL_SUBTEST_1(test_cuda_unary<void>());
- CALL_SUBTEST_1(test_cuda_elementwise<void>());
- CALL_SUBTEST_1(test_cuda_trancendental<void>());
- CALL_SUBTEST_2(test_cuda_contractions<void>());
- CALL_SUBTEST_3(test_cuda_reductions<void>());
- CALL_SUBTEST_4(test_cuda_full_reductions<void>());
- CALL_SUBTEST_5(test_cuda_forced_evals<void>());
+ CALL_SUBTEST_1(test_gpu_numext<void>());
+
+#ifdef EIGEN_HAS_GPU_FP16
+ CALL_SUBTEST_1(test_gpu_conversion<void>());
+ CALL_SUBTEST_1(test_gpu_unary<void>());
+ CALL_SUBTEST_1(test_gpu_elementwise<void>());
+ CALL_SUBTEST_1(test_gpu_trancendental<void>());
+ CALL_SUBTEST_2(test_gpu_contractions<void>());
+ CALL_SUBTEST_3(test_gpu_reductions<void>());
+ CALL_SUBTEST_4(test_gpu_full_reductions<void>());
+ CALL_SUBTEST_5(test_gpu_forced_evals<void>());
#else
- std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
+ std::cout << "Half floats are not supported by this GPU: skipping the test" << std::endl;
#endif
}
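
One detail worth noting in the file above: when validating half-precision log against the float reference, the test adds 0.1 to both sides before comparing whenever the input is within 0.05 of 1. Near x = 1, log(x) is close to 0, so the relative check in VERIFY_IS_APPROX is ill-conditioned; the shift turns it into an effectively absolute tolerance. A standalone sketch of the idea, with illustrative values rather than ones taken from the test:

    #include <cmath>
    // Suppose the half-precision kernel returned 0.0197 for log(1.02f):
    float full_prec = std::log(1.02f);  // ~0.0198, the "golden" float reference
    float half_prec = 0.0197f;          // assumed device result, cast back to float
    // Comparing two near-zero values with a relative tolerance is fragile;
    // shifting both by 0.1 lets a small absolute error pass as intended:
    VERIFY_IS_APPROX(full_prec + 0.1f, half_prec + 0.1f);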
diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
index 4ef9aed91..159656276 100644
--- a/unsupported/test/cxx11_tensor_of_strings.cpp
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -141,7 +141,7 @@ static void test_initialization()
}
-void test_cxx11_tensor_of_strings()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_strings)
{
// Beware: none of this is likely to ever work on a GPU.
CALL_SUBTEST(test_assign());
diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp
index ffa19896e..b8a329deb 100644
--- a/unsupported/test/cxx11_tensor_padding.cpp
+++ b/unsupported/test/cxx11_tensor_padding.cpp
@@ -84,7 +84,7 @@ static void test_padded_expr()
}
}
-void test_cxx11_tensor_padding()
+EIGEN_DECLARE_TEST(cxx11_tensor_padding)
{
CALL_SUBTEST(test_simple_padding<ColMajor>());
CALL_SUBTEST(test_simple_padding<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
index dc748b73e..727a9ffd7 100644
--- a/unsupported/test/cxx11_tensor_padding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -15,7 +15,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -149,7 +149,7 @@ template<typename DataType, typename dev_Selector> void sycl_padding_test_per_de
test_padded_expr<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_padding_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp
index 434359730..498ab8ca7 100644
--- a/unsupported/test/cxx11_tensor_patch.cpp
+++ b/unsupported/test/cxx11_tensor_patch.cpp
@@ -164,7 +164,7 @@ static void test_simple_patch()
}
}
-void test_cxx11_tensor_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_patch)
{
CALL_SUBTEST(test_simple_patch<ColMajor>());
CALL_SUBTEST(test_simple_patch<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
index 88a29cb31..7f92bec78 100644
--- a/unsupported/test/cxx11_tensor_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_patch_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -241,7 +241,7 @@ template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_p
test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_patch_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
index 0f3dc5787..4740d5811 100644
--- a/unsupported/test/cxx11_tensor_random.cpp
+++ b/unsupported/test/cxx11_tensor_random.cpp
@@ -70,7 +70,7 @@ static void test_custom()
}
}
-void test_cxx11_tensor_random()
+EIGEN_DECLARE_TEST(cxx11_tensor_random)
{
CALL_SUBTEST(test_default());
CALL_SUBTEST(test_normal());
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_gpu.cu
index fa1a46732..090986ebc 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_gpu.cu
@@ -9,15 +9,16 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
-void test_cuda_random_uniform()
+void test_gpu_random_uniform()
{
Tensor<float, 2> out(72,97);
out.setZero();
@@ -25,24 +26,24 @@ void test_cuda_random_uniform()
std::size_t out_bytes = out.size() * sizeof(float);
float* d_out;
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
gpu_out.device(gpu_device) = gpu_out.random();
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
- // For now we just check thes code doesn't crash.
+ // For now we just check this code doesn't crash.
// TODO: come up with a valid test of randomness
}
-void test_cuda_random_normal()
+void test_gpu_random_normal()
{
Tensor<float, 2> out(72,97);
out.setZero();
@@ -50,9 +51,9 @@ void test_cuda_random_normal()
std::size_t out_bytes = out.size() * sizeof(float);
float* d_out;
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
@@ -60,8 +61,8 @@ void test_cuda_random_normal()
Eigen::internal::NormalRandomGenerator<float> gen(true);
gpu_out.device(gpu_device) = gpu_out.random(gen);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
}
static void test_complex()
@@ -77,9 +78,9 @@ static void test_complex()
}
-void test_cxx11_tensor_random_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
{
- CALL_SUBTEST(test_cuda_random_uniform());
- CALL_SUBTEST(test_cuda_random_normal());
+ CALL_SUBTEST(test_gpu_random_uniform());
+ CALL_SUBTEST(test_gpu_random_normal());
CALL_SUBTEST(test_complex());
}
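The rename from *_cuda.cu to *_gpu.cu goes together with the newly included TensorGpuHipCudaDefines.h, which lets one test source target both CUDA and HIP. A sketch of the indirection the gpu* calls above now rely on (illustrative subset; the actual header maps many more symbols):

    // Illustrative sketch, not a verbatim copy of TensorGpuHipCudaDefines.h.
    #if defined(EIGEN_USE_HIP)
      #define gpuMalloc              hipMalloc
      #define gpuMemcpy              hipMemcpy
      #define gpuMemcpyAsync         hipMemcpyAsync
      #define gpuMemcpyDeviceToHost  hipMemcpyDeviceToHost
      #define gpuStreamSynchronize   hipStreamSynchronize
      #define gpuFree                hipFree
      #define gpuSuccess             hipSuccess
    #else  // CUDA
      #define gpuMalloc              cudaMalloc
      #define gpuMemcpy              cudaMemcpy
      #define gpuMemcpyAsync         cudaMemcpyAsync
      #define gpuMemcpyDeviceToHost  cudaMemcpyDeviceToHost
      #define gpuStreamSynchronize   cudaStreamSynchronize
      #define gpuFree                cudaFree
      #define gpuSuccess             cudaSuccess
    #endif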
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index 1490ec3da..ff8e18c07 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -53,20 +53,20 @@ static void test_trivial_reductions() {
}
}
-template <int DataLayout>
+template <typename Scalar,int DataLayout>
static void test_simple_reductions() {
- Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+ Tensor<Scalar, 4, DataLayout> tensor(2, 3, 5, 7);
tensor.setRandom();
array<ptrdiff_t, 2> reduction_axis2;
reduction_axis2[0] = 1;
reduction_axis2[1] = 3;
- Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis2);
+ Tensor<Scalar, 2, DataLayout> result = tensor.sum(reduction_axis2);
VERIFY_IS_EQUAL(result.dimension(0), 2);
VERIFY_IS_EQUAL(result.dimension(1), 5);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 5; ++j) {
- float sum = 0.0f;
+ Scalar sum = Scalar(0.0f);
for (int k = 0; k < 3; ++k) {
for (int l = 0; l < 7; ++l) {
sum += tensor(i, k, j, l);
@@ -77,7 +77,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> sum1 = tensor.sum();
+ Tensor<Scalar, 0, DataLayout> sum1 = tensor.sum();
VERIFY_IS_EQUAL(sum1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -85,7 +85,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
VERIFY_IS_EQUAL(sum2.rank(), 0);
VERIFY_IS_APPROX(sum1(), sum2());
@@ -98,7 +98,7 @@ static void test_simple_reductions() {
VERIFY_IS_EQUAL(result.dimension(1), 7);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 7; ++j) {
- float prod = 1.0f;
+ Scalar prod = Scalar(1.0f);
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 5; ++l) {
prod *= tensor(k, i, l, j);
@@ -109,7 +109,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> prod1 = tensor.prod();
+ Tensor<Scalar, 0, DataLayout> prod1 = tensor.prod();
VERIFY_IS_EQUAL(prod1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -117,7 +117,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
VERIFY_IS_EQUAL(prod2.rank(), 0);
VERIFY_IS_APPROX(prod1(), prod2());
@@ -130,7 +130,7 @@ static void test_simple_reductions() {
VERIFY_IS_EQUAL(result.dimension(1), 7);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 7; ++j) {
- float max_val = std::numeric_limits<float>::lowest();
+ Scalar max_val = std::numeric_limits<Scalar>::lowest();
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 5; ++l) {
max_val = (std::max)(max_val, tensor(k, i, l, j));
@@ -141,7 +141,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> max1 = tensor.maximum();
+ Tensor<Scalar, 0, DataLayout> max1 = tensor.maximum();
VERIFY_IS_EQUAL(max1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -149,7 +149,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
VERIFY_IS_EQUAL(max2.rank(), 0);
VERIFY_IS_APPROX(max1(), max2());
@@ -162,7 +162,7 @@ static void test_simple_reductions() {
VERIFY_IS_EQUAL(result.dimension(1), 7);
for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 7; ++j) {
- float min_val = (std::numeric_limits<float>::max)();
+ Scalar min_val = (std::numeric_limits<Scalar>::max)();
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 3; ++l) {
min_val = (std::min)(min_val, tensor(k, l, i, j));
@@ -173,7 +173,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> min1 = tensor.minimum();
+ Tensor<Scalar, 0, DataLayout> min1 = tensor.minimum();
VERIFY_IS_EQUAL(min1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -181,7 +181,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
VERIFY_IS_EQUAL(min2.rank(), 0);
VERIFY_IS_APPROX(min1(), min2());
@@ -194,7 +194,7 @@ static void test_simple_reductions() {
VERIFY_IS_EQUAL(result.dimension(1), 7);
for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 7; ++j) {
- float sum = 0.0f;
+ Scalar sum = Scalar(0.0f);
int count = 0;
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 3; ++l) {
@@ -207,7 +207,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> mean1 = tensor.mean();
+ Tensor<Scalar, 0, DataLayout> mean1 = tensor.mean();
VERIFY_IS_EQUAL(mean1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -215,7 +215,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
VERIFY_IS_EQUAL(mean2.rank(), 0);
VERIFY_IS_APPROX(mean1(), mean2());
@@ -484,11 +484,12 @@ static void test_reduce_middle_dims() {
}
}
-void test_cxx11_tensor_reduction() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
CALL_SUBTEST(test_trivial_reductions<ColMajor>());
CALL_SUBTEST(test_trivial_reductions<RowMajor>());
- CALL_SUBTEST(test_simple_reductions<ColMajor>());
- CALL_SUBTEST(test_simple_reductions<RowMajor>());
+ CALL_SUBTEST(( test_simple_reductions<float,ColMajor>() ));
+ CALL_SUBTEST(( test_simple_reductions<float,RowMajor>() ));
+ CALL_SUBTEST(( test_simple_reductions<Eigen::half,ColMajor>() ));
CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
CALL_SUBTEST(test_full_reductions<ColMajor>());
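Two details in this hunk are easy to miss. First, test_simple_reductions is now templated on the scalar type, which is what allows the same checks to run for Eigen::half. Second, the calls gained an extra pair of parentheses: the preprocessor splits macro arguments on top-level commas, so test_simple_reductions<float,ColMajor>() would otherwise be parsed as two arguments to CALL_SUBTEST. A self-contained illustration of the comma problem (names are made up for the demo):

    #include <cassert>

    #define CALL(X) assert((X) == 3)

    template <typename A, typename B> int f() { return 3; }

    int main() {
      // CALL(f<int, long>());   // error: macro CALL passed 2 arguments, takes 1
      CALL((f<int, long>()));    // OK: parentheses make it a single argument
      return 0;
    }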
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu
index ec0669704..122ac946b 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu
@@ -9,7 +9,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
+
#define EIGEN_USE_GPU
#include "main.h"
@@ -19,7 +19,7 @@
template<typename Type, int DataLayout>
static void test_full_reductions() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
@@ -67,7 +67,7 @@ static void test_first_dim_reductions() {
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
// Create device
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice dev(&stream);
// Create data(T)
@@ -107,7 +107,7 @@ static void test_last_dim_reductions() {
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
// Create device
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice dev(&stream);
// Create data
@@ -134,7 +134,7 @@ static void test_last_dim_reductions() {
}
-void test_cxx11_tensor_reduction_cuda() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) {
CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index 440d48bca..f526299c6 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -174,7 +174,7 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::
test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_reduction_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_reduction_test_per_device<float>(device));
}
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
index c8f105e3d..7dbd0478c 100644
--- a/unsupported/test/cxx11_tensor_ref.cpp
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -235,7 +235,7 @@ static void test_nested_ops_with_ref()
}
-void test_cxx11_tensor_ref()
+EIGEN_DECLARE_TEST(cxx11_tensor_ref)
{
CALL_SUBTEST(test_simple_lvalue_ref());
CALL_SUBTEST(test_simple_rvalue_ref());
diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp
index b35b8d29e..5e44ec007 100644
--- a/unsupported/test/cxx11_tensor_reverse.cpp
+++ b/unsupported/test/cxx11_tensor_reverse.cpp
@@ -179,7 +179,7 @@ static void test_expr_reverse(bool LValue)
}
-void test_cxx11_tensor_reverse()
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse)
{
CALL_SUBTEST(test_simple_reverse<ColMajor>());
CALL_SUBTEST(test_simple_reverse<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
index 2f5484484..77c2235d1 100644
--- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -214,7 +214,7 @@ template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::de
test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
}
-void test_cxx11_tensor_reverse_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_reverse_test_per_device<float>(device));
}
diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp
index 2c26151ab..83b592384 100644
--- a/unsupported/test/cxx11_tensor_roundings.cpp
+++ b/unsupported/test/cxx11_tensor_roundings.cpp
@@ -54,7 +54,7 @@ static void test_float_ceiling()
}
}
-void test_cxx11_tensor_roundings()
+EIGEN_DECLARE_TEST(cxx11_tensor_roundings)
{
CALL_SUBTEST(test_float_rounding());
CALL_SUBTEST(test_float_ceiling());
diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp
index af59aa3ef..dccee9e84 100644
--- a/unsupported/test/cxx11_tensor_scan.cpp
+++ b/unsupported/test/cxx11_tensor_scan.cpp
@@ -98,7 +98,7 @@ static void test_tensor_maps() {
}
}
-void test_cxx11_tensor_scan() {
+EIGEN_DECLARE_TEST(cxx11_tensor_scan) {
CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));
CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu
index 1d4edef11..770a144f1 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_gpu.cu
@@ -9,19 +9,20 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
-void test_cuda_cumsum(int m_size, int k_size, int n_size)
+void test_gpu_cumsum(int m_size, int k_size, int n_size)
{
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
@@ -36,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
float* d_t_input;
float* d_t_result;
- cudaMalloc((void**)(&d_t_input), t_input_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_input), t_input_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
@@ -52,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
t_result = t_input.cumsum(1);
- cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue;
@@ -65,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
assert(false);
}
- cudaFree((void*)d_t_input);
- cudaFree((void*)d_t_result);
+ gpuFree((void*)d_t_input);
+ gpuFree((void*)d_t_result);
}
-void test_cxx11_tensor_scan_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu)
{
- CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
- CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));
}
diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
index d11444a14..ab19b6e6b 100644
--- a/unsupported/test/cxx11_tensor_shuffling.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -215,7 +215,7 @@ static void test_shuffle_unshuffle()
}
-void test_cxx11_tensor_shuffling()
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling)
{
CALL_SUBTEST(test_simple_shuffling<ColMajor>());
CALL_SUBTEST(test_simple_shuffling<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
index c88db7c72..0e8cc3bd2 100644
--- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -15,7 +15,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -111,7 +111,7 @@ template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_
test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_shuffling_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
index 5a0d339ef..6d70f5435 100644
--- a/unsupported/test/cxx11_tensor_simple.cpp
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -316,7 +316,7 @@ static void test_resize()
VERIFY_IS_EQUAL(epsilon.size(), 3*5*7);
}
-void test_cxx11_tensor_simple()
+EIGEN_DECLARE_TEST(cxx11_tensor_simple)
{
CALL_SUBTEST(test_0d());
CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp
index 935b908cc..aefdfa9b4 100644
--- a/unsupported/test/cxx11_tensor_striding.cpp
+++ b/unsupported/test/cxx11_tensor_striding.cpp
@@ -110,7 +110,7 @@ static void test_striding_as_lvalue()
}
-void test_cxx11_tensor_striding()
+EIGEN_DECLARE_TEST(cxx11_tensor_striding)
{
CALL_SUBTEST(test_simple_striding<ColMajor>());
CALL_SUBTEST(test_simple_striding<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp
index 603c3746f..d3b1fa77c 100644
--- a/unsupported/test/cxx11_tensor_striding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -196,7 +196,7 @@ template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_striding_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_striding_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorStridingPerDevice(device));
}
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
index 2f56eb495..2ca5c47db 100644
--- a/unsupported/test/cxx11_tensor_sugar.cpp
+++ b/unsupported/test/cxx11_tensor_sugar.cpp
@@ -73,7 +73,7 @@ static void test_scalar_sugar_sub_div() {
}
}
-void test_cxx11_tensor_sugar()
+EIGEN_DECLARE_TEST(cxx11_tensor_sugar)
{
CALL_SUBTEST(test_comparison_sugar());
CALL_SUBTEST(test_scalar_sugar_add_mul());
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index 5cd0f4c71..9357bed02 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -15,7 +15,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -269,7 +269,7 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
}
-void test_cxx11_tensor_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
}
diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp
index d680e9b3b..fed269a9a 100644
--- a/unsupported/test/cxx11_tensor_symmetry.cpp
+++ b/unsupported/test/cxx11_tensor_symmetry.cpp
@@ -801,7 +801,7 @@ static void test_tensor_randacc()
}
}
-void test_cxx11_tensor_symmetry()
+EIGEN_DECLARE_TEST(cxx11_tensor_symmetry)
{
CALL_SUBTEST(test_symgroups_static());
CALL_SUBTEST(test_symgroups_dynamic());
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index 2ef665f30..20a197f2b 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -232,6 +232,60 @@ void test_multithread_contraction_agrees_with_singlethread() {
}
}
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+ template <typename Index, typename Scalar>
+ EIGEN_ALWAYS_INLINE void operator()(
+ const OutputKernel::OutputMapper<Index, Scalar>& output_mapper,
+ const TensorContractionParams&, Index, Index, Index num_rows,
+ Index num_cols) const {
+ for (int i = 0; i < num_rows; ++i) {
+ for (int j = 0; j < num_cols; ++j) {
+ output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+ }
+ }
+ }
+};
+
+template <int DataLayout>
+static void test_multithread_contraction_with_output_kernel() {
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(2, 11);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+ Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+ Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 1500, 248);
+ MapXf m_right(t_right.data(), 248, 1400);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+ // compute results by separate methods
+ t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+ m_result = m_left * m_right;
+
+ for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
template<int DataLayout>
void test_full_contraction() {
@@ -345,7 +399,7 @@ void test_multithread_shuffle()
}
-void test_cxx11_tensor_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
{
CALL_SUBTEST_1(test_multithread_elementwise());
CALL_SUBTEST_1(test_multithread_compound_assignment());
@@ -355,6 +409,8 @@ void test_cxx11_tensor_thread_pool()
CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
// Exercise various cases that have been problematic in the past.
CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
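The new test_multithread_contraction_with_output_kernel exercises the optional third argument of Tensor::contract(): a functor invoked on each finished block of the result, so the output can be post-processed without a second pass over memory. A usage sketch with the same call signature as the SqrtOutputKernel above (ScaleOutputKernel is a made-up name for illustration, usable in the same test-file context):

    // Hypothetical functor mirroring SqrtOutputKernel's signature; scales
    // every element of a finished output block in place.
    struct ScaleOutputKernel {
      float factor;
      template <typename Index, typename Scalar>
      EIGEN_ALWAYS_INLINE void operator()(
          const OutputKernel::OutputMapper<Index, Scalar>& output_mapper,
          const TensorContractionParams&, Index, Index, Index num_rows,
          Index num_cols) const {
        for (Index i = 0; i < num_rows; ++i)
          for (Index j = 0; j < num_cols; ++j)
            output_mapper(i, j) *= factor;
      }
    };

    // t_result.device(device) = t_left.contract(t_right, dims, ScaleOutputKernel{0.5f});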
diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp
index 340d1211c..1579bc1eb 100644
--- a/unsupported/test/cxx11_tensor_trace.cpp
+++ b/unsupported/test/cxx11_tensor_trace.cpp
@@ -159,7 +159,7 @@ static void test_trace_in_expr() {
}
-void test_cxx11_tensor_trace() {
+EIGEN_DECLARE_TEST(cxx11_tensor_trace) {
CALL_SUBTEST(test_0D_trace<ColMajor>());
CALL_SUBTEST(test_0D_trace<RowMajor>());
CALL_SUBTEST(test_all_dimensions_trace<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
index d2a1e8673..07691df98 100644
--- a/unsupported/test/cxx11_tensor_uint128.cpp
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -144,7 +144,7 @@ void test_misc2() {
#endif
-void test_cxx11_tensor_uint128()
+EIGEN_DECLARE_TEST(cxx11_tensor_uint128)
{
#ifdef EIGEN_NO_INT128
// Skip the test on compilers that don't support 128bit integers natively
diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
index ca6840f3b..3aa363eb5 100644
--- a/unsupported/test/cxx11_tensor_volume_patch.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -105,7 +105,7 @@ static void test_entire_volume_patch()
}
}
-void test_cxx11_tensor_volume_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch)
{
CALL_SUBTEST(test_single_voxel_patch());
CALL_SUBTEST(test_entire_volume_patch());
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
index 039715abc..ca7994fd9 100644
--- a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -13,7 +13,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_volume_patch_sycl
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@@ -214,7 +214,7 @@ std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>(
test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device);
test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
}
-void test_cxx11_tensor_volume_patch_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
{
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
diff --git a/unsupported/test/dgmres.cpp b/unsupported/test/dgmres.cpp
index 2b11807c8..04f5ad670 100644
--- a/unsupported/test/dgmres.cpp
+++ b/unsupported/test/dgmres.cpp
@@ -24,7 +24,7 @@ template<typename T> void test_dgmres_T()
//CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor) );
}
-void test_dgmres()
+EIGEN_DECLARE_TEST(dgmres)
{
CALL_SUBTEST_1(test_dgmres_T<double>());
CALL_SUBTEST_2(test_dgmres_T<std::complex<double> >());
diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
index 866db8e86..688594eff 100644
--- a/unsupported/test/forward_adolc.cpp
+++ b/unsupported/test/forward_adolc.cpp
@@ -119,7 +119,7 @@ template<typename Func> void adolc_forward_jacobian(const Func& f)
VERIFY_IS_APPROX(j, jref);
}
-void test_forward_adolc()
+EIGEN_DECLARE_TEST(forward_adolc)
{
adtl::setNumDir(NUMBER_DIRECTIONS);
@@ -132,7 +132,7 @@ void test_forward_adolc()
}
{
- // simple instanciation tests
+ // simple instantiation tests
Matrix<adtl::adouble,2,1> x;
foo(x);
Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);;
diff --git a/unsupported/test/gmres.cpp b/unsupported/test/gmres.cpp
index f2969116b..8d2254b5b 100644
--- a/unsupported/test/gmres.cpp
+++ b/unsupported/test/gmres.cpp
@@ -24,7 +24,7 @@ template<typename T> void test_gmres_T()
//CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor) );
}
-void test_gmres()
+EIGEN_DECLARE_TEST(gmres)
{
CALL_SUBTEST_1(test_gmres_T<double>());
CALL_SUBTEST_2(test_gmres_T<std::complex<double> >());
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index e770049e5..4f143b6de 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -83,7 +83,7 @@ void check_sparse_kronecker_product(const MatrixType& ab)
}
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
{
// DM = dense matrix; SM = sparse matrix
@@ -240,7 +240,7 @@ void test_kronecker_product()
#include "main.h"
#include <Eigen/KroneckerProduct>
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
{
MatrixXd a(2,2), b(3,3), c;
a.setRandom();
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 64f168c16..7f9a81cd3 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -1445,7 +1445,7 @@ void testNistEckerle4(void)
VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
}
-void test_levenberg_marquardt()
+EIGEN_DECLARE_TEST(levenberg_marquardt)
{
// Tests using the examples provided by (c)minpack
CALL_SUBTEST(testLmder1());
diff --git a/unsupported/test/matrix_exponential.cpp b/unsupported/test/matrix_exponential.cpp
index 50dec083d..b032cbf1d 100644
--- a/unsupported/test/matrix_exponential.cpp
+++ b/unsupported/test/matrix_exponential.cpp
@@ -119,7 +119,7 @@ void randomTest(const MatrixType& m, double tol)
}
}
-void test_matrix_exponential()
+EIGEN_DECLARE_TEST(matrix_exponential)
{
CALL_SUBTEST_2(test2dRotation<double>(1e-13));
CALL_SUBTEST_1(test2dRotation<float>(2e-5)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 7c9b68a3c..93fb71430 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -181,7 +181,7 @@ void testMatrixType(const MatrixType& m)
}
}
-void test_matrix_function()
+EIGEN_DECLARE_TEST(matrix_function)
{
CALL_SUBTEST_1(testMatrixType(Matrix<float,1,1>()));
CALL_SUBTEST_2(testMatrixType(Matrix3cf()));
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index 7ccfacfdf..fa52d256e 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -150,7 +150,7 @@ typedef Matrix<double,3,3,RowMajor> Matrix3dRowMajor;
typedef Matrix<long double,3,3> Matrix3e;
typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
-void test_matrix_power()
+EIGEN_DECLARE_TEST(matrix_power)
{
CALL_SUBTEST_2(test2dRotation<double>(1e-13));
CALL_SUBTEST_1(test2dRotation<float>(2e-5)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64
diff --git a/unsupported/test/matrix_square_root.cpp b/unsupported/test/matrix_square_root.cpp
index ea541e1ea..034f29217 100644
--- a/unsupported/test/matrix_square_root.cpp
+++ b/unsupported/test/matrix_square_root.cpp
@@ -18,7 +18,7 @@ void testMatrixSqrt(const MatrixType& m)
VERIFY_IS_APPROX(sqrtA * sqrtA, A);
}
-void test_matrix_square_root()
+EIGEN_DECLARE_TEST(matrix_square_root)
{
for (int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf()));
diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp
index 8b300b78a..2eb40fef6 100644
--- a/unsupported/test/minres.cpp
+++ b/unsupported/test/minres.cpp
@@ -36,7 +36,7 @@ template<typename T> void test_minres_T()
}
-void test_minres()
+EIGEN_DECLARE_TEST(minres)
{
CALL_SUBTEST_1(test_minres_T<double>());
// CALL_SUBTEST_2(test_minres_T<std::compex<double> >());
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
index 685e7ea45..4a25e993c 100644
--- a/unsupported/test/mpreal_support.cpp
+++ b/unsupported/test/mpreal_support.cpp
@@ -7,7 +7,7 @@
using namespace mpfr;
using namespace Eigen;
-void test_mpreal_support()
+EIGEN_DECLARE_TEST(mpreal_support)
{
// set precision to 256 bits (double has only 53 bits)
mpreal::set_default_prec(256);
diff --git a/unsupported/test/openglsupport.cpp b/unsupported/test/openglsupport.cpp
index 706a816f7..460830086 100644
--- a/unsupported/test/openglsupport.cpp
+++ b/unsupported/test/openglsupport.cpp
@@ -106,7 +106,7 @@ GLint createShader(const char* vtx, const char* frg)
return prg_id;
}
-void test_openglsupport()
+EIGEN_DECLARE_TEST(openglsupport)
{
int argc = 0;
glutInit(&argc, 0);
diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp
index 7ad4aa69d..65efea0cb 100644
--- a/unsupported/test/polynomialsolver.cpp
+++ b/unsupported/test/polynomialsolver.cpp
@@ -191,7 +191,7 @@ void polynomialsolver(int deg)
realRoots );
}
-void test_polynomialsolver()
+EIGEN_DECLARE_TEST(polynomialsolver)
{
for(int i = 0; i < g_repeat; i++)
{
diff --git a/unsupported/test/polynomialutils.cpp b/unsupported/test/polynomialutils.cpp
index 5fc968402..8ff451996 100644
--- a/unsupported/test/polynomialutils.cpp
+++ b/unsupported/test/polynomialutils.cpp
@@ -101,7 +101,7 @@ template<typename _Scalar> void CauchyBounds_scalar()
internal::random<int>(18,26) )) );
}
-void test_polynomialutils()
+EIGEN_DECLARE_TEST(polynomialutils)
{
for(int i = 0; i < g_repeat; i++)
{
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index 4f6723d6d..4ac53a9a7 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -8,7 +8,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-// import basic and product tests for deprectaed DynamicSparseMatrix
+// import basic and product tests for deprecated DynamicSparseMatrix
#define EIGEN_NO_DEPRECATED_WARNING
#include "sparse_basic.cpp"
#include "sparse_product.cpp"
@@ -142,7 +142,7 @@ void check_marketio()
VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2));
}
-void test_sparse_extra()
+EIGEN_DECLARE_TEST(sparse_extra)
{
for(int i = 0; i < g_repeat; i++) {
int s = Eigen::internal::random<int>(1,50);
diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
index 057fb3e92..50dae6c93 100644
--- a/unsupported/test/special_functions.cpp
+++ b/unsupported/test/special_functions.cpp
@@ -335,10 +335,144 @@ template<typename ArrayType> void array_special_functions()
ArrayType test = betainc(a, b + one, x) + eps;
verify_component_wise(test, expected););
}
-#endif
+#endif // EIGEN_HAS_C99_MATH
+
+ // Test Bessel function i0e. Reference results obtained with SciPy.
+ {
+ ArrayType x(21);
+ ArrayType expected(21);
+ ArrayType res(21);
+
+ x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+ 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected << 0.0897803118848, 0.0947062952128, 0.100544127361,
+ 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+ 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+ 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+ 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+ 0.0897803118848;
+
+ CALL_SUBTEST(res = i0e(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function i1e. Reference results obtained with SciPy.
+ {
+ ArrayType x(21);
+ ArrayType expected(21);
+ ArrayType res(21);
+
+ x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+ 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected << -0.0875062221833, -0.092036796872, -0.0973496147565,
+ -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+ -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+ 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+ 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+ 0.0875062221833;
+
+ CALL_SUBTEST(res = i1e(x);
+ verify_component_wise(res, expected););
+ }
+
+ /* Code to generate the data for the following two test cases.
+ N = 5
+ np.random.seed(3)
+
+ a = np.logspace(-2, 3, 6)
+ a = np.ravel(np.tile(np.reshape(a, [-1, 1]), [1, N]))
+ x = np.random.gamma(a, 1.0)
+ x = np.maximum(x, np.finfo(np.float32).tiny)
+
+ def igamma(a, x):
+ return mpmath.gammainc(a, 0, x, regularized=True)
+
+ def igamma_der_a(a, x):
+ res = mpmath.diff(lambda a_prime: igamma(a_prime, x), a)
+ return np.float64(res)
+
+ def gamma_sample_der_alpha(a, x):
+ igamma_x = igamma(a, x)
+ def igammainv_of_igamma(a_prime):
+ return mpmath.findroot(lambda x_prime: igamma(a_prime, x_prime) -
+ igamma_x, x, solver='newton')
+ return np.float64(mpmath.diff(igammainv_of_igamma, a))
+
+ v_igamma_der_a = np.vectorize(igamma_der_a)(a, x)
+ v_gamma_sample_der_alpha = np.vectorize(gamma_sample_der_alpha)(a, x)
+ */
+
+#if EIGEN_HAS_C99_MATH
+ // Test igamma_der_a
+ {
+ ArrayType a(30);
+ ArrayType x(30);
+ ArrayType res(30);
+ ArrayType v(30);
+
+ a << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+ 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+ x << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+ 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+ 0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+ 0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+ 0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+ 10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+ 86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+ 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+ v << -32.7256441441, -36.4394150514, -9.66467612263, -36.4394150514,
+ -36.4394150514, -1.0891900302, -2.66351229645, -2.48666868596,
+ -0.929700494428, -3.56327722764, -0.455320135314, -0.391437214323,
+ -0.491352055991, -0.350454834292, -0.471773162921, -0.104084440522,
+ -0.0723646747909, -0.0992828975532, -0.121638215446, -0.122619605294,
+ -0.0317670267286, -0.0359974812869, -0.0154359225363, -0.0375775365921,
+ -0.00794899153653, -0.00777303219211, -0.00796085782042,
+ -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+ CALL_SUBTEST(res = igamma_der_a(a, x); verify_component_wise(res, v););
+ }
+
+ // Test gamma_sample_der_alpha
+ {
+ ArrayType alpha(30);
+ ArrayType sample(30);
+ ArrayType res(30);
+ ArrayType v(30);
+
+ alpha << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+ 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+ sample << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+ 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+ 0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+ 0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+ 0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+ 10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+ 86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+ 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+ v << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+ 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+ 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+ 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+ 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+ 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+ 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+ 1.00106492525, 0.97734200649, 1.02198794179;
+
+ CALL_SUBTEST(res = gamma_sample_der_alpha(alpha, sample);
+ verify_component_wise(res, v););
+ }
+#endif // EIGEN_HAS_C99_MATH
}
-void test_special_functions()
+EIGEN_DECLARE_TEST(special_functions)
{
CALL_SUBTEST_1(array_special_functions<ArrayXf>());
CALL_SUBTEST_2(array_special_functions<ArrayXd>());
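For reference, i0e and i1e are the exponentially scaled modified Bessel functions, i0e(x) = exp(-|x|) * I0(x) and i1e(x) = exp(-|x|) * I1(x), which is why the expected values in the tables above stay bounded as |x| grows; likewise, igamma_der_a is the derivative in a of the regularized lower incomplete gamma function, as the embedded mpmath snippet shows. A standalone cross-check of the Bessel values, under the assumption that C++17's std::cyl_bessel_i is available (the special math functions are not shipped by every standard library):

    #include <cassert>
    #include <cmath>

    int main() {
      // i0e(2) and i1e(2) against the reference table used by the test above.
      double x = 2.0;
      double i0e = std::exp(-std::abs(x)) * std::cyl_bessel_i(0.0, std::abs(x));
      double i1e = std::exp(-std::abs(x)) * std::cyl_bessel_i(1.0, std::abs(x));
      assert(std::abs(i0e - 0.308508322554) < 1e-10);
      assert(std::abs(i1e - 0.215269289249) < 1e-10);
      return 0;
    }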
diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp
index 3be020434..88ec87b97 100644
--- a/unsupported/test/splines.cpp
+++ b/unsupported/test/splines.cpp
@@ -268,7 +268,7 @@ void check_global_interpolation_with_derivatives2d()
}
}
-void test_splines()
+EIGEN_DECLARE_TEST(splines)
{
for (int i = 0; i < g_repeat; ++i)
{