From c0f2cb016e60b7dbde1d5946f42234a709a711f9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 28 Apr 2014 10:32:27 -0700 Subject: Extended support for Tensors: * Added ability to map a region of the memory to a tensor * Added basic support for unary and binary coefficient wise expressions, such as addition or square root * Provided an emulation layer to make it possible to compile the code with compilers (such as nvcc) that don't support cxx11. --- Eigen/src/Core/util/Macros.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index bfd6ba7de..3a928001e 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -121,6 +121,11 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support variadic templates? +#if __cplusplus > 199711L +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: -- cgit v1.2.3 From 29aebf96e62f4fb5e4b1f3fb475e299df2e7a02e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 20:18:44 -0700 Subject: Created the pblend packet primitive and implemented it using SSE and AVX instructions. --- Eigen/src/Core/GenericPacketMath.h | 14 ++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 15 +++++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 8 ++++++- Eigen/src/Core/arch/SSE/PacketMath.h | 41 +++++++++++++++++++++++++++++++++--- test/packetmath.cpp | 16 ++++++++++++++ 5 files changed, 90 insertions(+), 4 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 98313c68f..0869dd49f 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -54,6 +54,7 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, @@ -429,6 +430,19 @@ ptranspose(PacketBlock& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } +/*************************************************************************** + * Selector, i.e. vector of N boolean values used to select (i.e. blend) + * words from 2 packets. +***************************************************************************/ +template struct Selector { + bool select[N]; +}; + +template EIGEN_DEVICE_FUNC inline Packet +pblend(const Selector::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + return ifPacket.select[0] ? 
thenPacket : elsePacket; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 8b8307d75..688ff91e4 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -59,6 +59,7 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 0, HasSqrt = 0 + HasBlend = 1, }; }; template<> struct packet_traits : default_packet_traits @@ -73,6 +74,7 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 0 + HasBlend = 1, }; }; @@ -557,6 +559,19 @@ ptranspose(PacketBlock& kernel) { kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 758183c18..0bc03cf9e 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -44,7 +44,8 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; #endif @@ -472,6 +473,11 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 6912f3bc3..1124b24df 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -108,7 +108,8 @@ template<> struct packet_traits : default_packet_traits HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -123,7 +124,8 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; #endif @@ -135,7 +137,9 @@ template<> struct packet_traits : default_packet_traits // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + + HasBlend = 1 }; }; @@ -809,6 +813,37 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = 
_mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} + } // end namespace internal } // end namespace Eigen diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9dab07522..663ab886d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -261,6 +261,22 @@ template void packetmath() VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose"); } } + + if (internal::packet_traits::HasBlend) { + Packet thenPacket = internal::pload(data1); + Packet elsePacket = internal::pload(data2); + EIGEN_ALIGN_DEFAULT internal::Selector selector; + for (int i = 0; i < PacketSize; ++i) { + selector.select[i] = i; + } + + Packet blend = internal::pblend(selector, thenPacket, elsePacket); + EIGEN_ALIGN_DEFAULT Scalar result[size]; + internal::pstore(result, blend); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue)); + } + } } template void packetmath_real() -- cgit v1.2.3 From 8c8ae2d8193809744f5952713287639817e2b442 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 7 Jun 2014 11:24:38 -0700 Subject: Fixed a typo --- Eigen/src/Core/arch/AVX/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 688ff91e4..74d3746d9 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -58,8 +58,8 @@ template<> struct packet_traits : default_packet_traits HasCos = 0, HasLog = 0, HasExp = 0, - HasSqrt = 0 - HasBlend = 1, + HasSqrt = 0, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -73,8 +73,8 @@ template<> struct packet_traits : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0 - HasBlend = 1, + HasExp = 0, + HasBlend = 1 }; }; -- cgit v1.2.3 From 5cc23199be743d0d1be85d709eb366e67e87a262 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 10:30:44 -0700 Subject: More tests to validate the const-correctness of the tensor code. 
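For reference, a minimal sketch of the usage pattern these tests exercise: mapping existing memory as a tensor of const scalars and consuming it on the right-hand side of an assignment. This sketch is illustrative only, not part of the patch; it assumes the unsupported CXX11 Tensor module is on the include path (the include line and the variable names below are made up for the example).

    #include <Eigen/CXX11/Tensor>

    void read_through_const_map() {
      Eigen::Tensor<float, 3> source(2, 3, 7);
      source.setRandom();

      // View the same buffer as a tensor of *const* floats: it can be read and
      // used in expressions, but not written through the map.
      Eigen::TensorMap<Eigen::Tensor<const float, 3> > view(source.data(), 2, 3, 7);

      Eigen::Tensor<float, 3> copy(2, 3, 7);
      copy = view;  // assignment from a const-qualified tensor expression

      // copy(i, j, k) now holds the same value as source(i, j, k) for all indices.
    }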
--- Eigen/src/Core/GenericPacketMath.h | 2 ++ Eigen/src/Core/util/XprHelper.h | 8 +++++++ unsupported/test/CMakeLists.txt | 3 ++- unsupported/test/cxx11_tensor_const.cpp | 39 +++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 unsupported/test/cxx11_tensor_const.cpp (limited to 'Eigen') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6ec29d0fd..e6fea5bba 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -95,6 +95,8 @@ template struct packet_traits : default_packet_traits }; }; +template struct packet_traits : packet_traits { }; + /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 7c77b2263..67ca49754 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -415,6 +415,14 @@ template struct promote_storage_type { typedef A ret; }; +template struct promote_storage_type +{ + typedef A ret; +}; +template struct promote_storage_type +{ + typedef A ret; +}; /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type. * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e83d8b54e..a47c7bc74 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,7 +106,8 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") -# ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_const "-std=c++0x") + ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp new file mode 100644 index 000000000..0ffb02afd --- /dev/null +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -0,0 +1,39 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include +using Eigen::Tensor; + + + + +static void test_simple_assign() +{ + Tensor random(2,3,7); + random.setRandom(); + + TensorMap > constant(random.data(), 2, 3, 7); + Tensor result(2,3,7); + result = constant; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k)); + } + } + } +} + +void test_cxx11_tensor_const() +{ + CALL_SUBTEST(test_simple_assign()); +} -- cgit v1.2.3 From b7271dffb5b1ceeee4c8bd99402ff89dcce58d74 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 16:51:57 -0700 Subject: Generalized the gebp apis --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 423 +++++++++++---------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 80 ++-- .../Core/products/GeneralMatrixMatrixTriangular.h | 54 +-- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 51 ++- Eigen/src/Core/products/TriangularMatrixMatrix.h | 65 ++-- Eigen/src/Core/products/TriangularSolverMatrix.h | 49 ++- Eigen/src/Core/util/BlasUtil.h | 106 +++++- unsupported/test/CMakeLists.txt | 2 +- 8 files changed, 473 insertions(+), 357 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 7da52c2e8..090c8f4e6 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -667,7 +667,7 @@ protected: * |real |cplx | no vectorization yet, would require to pack A with duplication * |cplx |real | easy vectorization */ -template +template struct gebp_kernel { typedef gebp_traits Traits; @@ -676,14 +676,15 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; - + typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; - + + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, @@ -693,14 +694,16 @@ struct gebp_kernel }; EIGEN_DONT_INLINE - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; -template +template EIGEN_DONT_INLINE -void gebp_kernel - ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, +void gebp_kernel + ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; @@ -743,15 +746,15 @@ void gebp_kernel traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - 
internal::prefetch(r0); - internal::prefetch(r1); - internal::prefetch(r2); - internal::prefetch(r3); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(0); + r1.prefetch(0); + r2.prefetch(0); + r3.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -813,48 +816,48 @@ void gebp_kernel ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r0+2*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r1+0*Traits::ResPacketSize); - R1 = ploadu(r1+1*Traits::ResPacketSize); - R2 = ploadu(r1+2*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r1.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); - pstoreu(r1+0*Traits::ResPacketSize, R0); - pstoreu(r1+1*Traits::ResPacketSize, R1); - pstoreu(r1+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r2+1*Traits::ResPacketSize); - R2 = ploadu(r2+2*Traits::ResPacketSize); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r2.loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r2+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r3+0*Traits::ResPacketSize); - R1 = ploadu(r3+1*Traits::ResPacketSize); - R2 = ploadu(r3+2*Traits::ResPacketSize); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r3.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); - pstoreu(r3+0*Traits::ResPacketSize, R0); - pstoreu(r3+1*Traits::ResPacketSize, R1); - pstoreu(r3+2*Traits::ResPacketSize, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(2 * Traits::ResPacketSize, R2); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 traits.initAcc(C4); traits.initAcc(C8); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -912,19 +916,19 @@ void gebp_kernel ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - R0 = 
ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r0+2*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - traits.acc(C8 , alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); } } } - + //---------- Process 2 * LhsProgress rows at once ---------- if(mr>=2*Traits::LhsProgress) { @@ -946,15 +950,15 @@ void gebp_kernel traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -978,7 +982,7 @@ void gebp_kernel traits.madd(A1, B2, C6, B2); \ traits.madd(A0, B3, C3, T0); \ traits.madd(A1, B3, C7, B3) - + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1002,37 +1006,37 @@ void gebp_kernel blA += 2*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1, R2, R3; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r1+0*Traits::ResPacketSize); - R3 = ploadu(r1+1*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(0 * Traits::ResPacketSize); + R3 = r1.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); traits.acc(C5, alphav, R3); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r1+0*Traits::ResPacketSize, R2); - pstoreu(r1+1*Traits::ResPacketSize, R3); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r2+1*Traits::ResPacketSize); - R2 = ploadu(r3+0*Traits::ResPacketSize); - R3 = ploadu(r3+1*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(0 * Traits::ResPacketSize, R2); + r1.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(0 * Traits::ResPacketSize); + R3 = r3.loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); traits.acc(C7, alphav, R3); - pstoreu(r2+0*Traits::ResPacketSize, R0); - 
pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r3+0*Traits::ResPacketSize, R2); - pstoreu(r3+1*Traits::ResPacketSize, R3); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 traits.initAcc(C0); traits.initAcc(C4); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - internal::prefetch(r0+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1089,12 +1093,12 @@ void gebp_kernel ResPacket R0, R1; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); } } } @@ -1120,15 +1124,15 @@ void gebp_kernel traits.initAcc(C2); traits.initAcc(C3); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -1171,25 +1175,25 @@ void gebp_kernel blA += 1*LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r1+0*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r1+0*Traits::ResPacketSize, R1); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r3+0*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(0 * Traits::ResPacketSize, R1); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(0 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C3, alphav, R1); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r3+0*Traits::ResPacketSize, R1); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(0 * Traits::ResPacketSize, R1); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 AccPacket C0; traits.initAcc(C0); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1241,9 +1245,9 @@ void gebp_kernel #undef EIGEN_GEBGP_ONESTEP ResPacket R0; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); 
+ R0 = r0.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); - pstoreu(r0+0*Traits::ResPacketSize, R0); + r0.storePacket(0 * Traits::ResPacketSize, R0); } } } @@ -1259,7 +1263,7 @@ void gebp_kernel const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - + if( (SwappedTraits::LhsProgress % 4)==0 ) { // NOTE The following piece of code wont work for 512 bit registers @@ -1268,32 +1272,32 @@ void gebp_kernel straits.initAcc(C1); straits.initAcc(C2); straits.initAcc(C3); - + const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); const Index endk = (depth/spk)*spk; const Index endk4 = (depth/(spk*4))*(spk*4); - + Index k=0; for(; k { SLhsPacket A0; SRhsPacket B_0; - + straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); straits.madd(A0,B_0,C0,B_0); - + blB += SwappedTraits::LhsProgress; blA += spk; } @@ -1317,10 +1321,10 @@ void gebp_kernel typedef typename conditional::half,SLhsPacket>::type SLhsPacketHalf; typedef typename conditional::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional::half,SAccPacket>::type SAccPacketHalf; - - SResPacketHalf R = pgather(&res[j2*resStride + i], resStride); + + SResPacketHalf R = res.template gatherPacket(i, j2); SResPacketHalf alphav = pset1(alpha); - + if(depth-endk>0) { // We have to handle the last row of the rhs which corresponds to a half-packet @@ -1336,14 +1340,14 @@ void gebp_kernel { straits.acc(predux4(C0), alphav, R); } - pscatter(&res[j2*resStride + i], R, resStride); + res.scatterPacket(i, j2, R); } else { - SResPacket R = pgather(&res[j2*resStride + i], resStride); + SResPacket R = res.template gatherPacket(i, j2); SResPacket alphav = pset1(alpha); straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); + res.scatterPacket(i, j2, R); } } else // scalar path @@ -1355,25 +1359,25 @@ void gebp_kernel { LhsScalar A0; RhsScalar B_0, B_1; - + A0 = blA[k]; - + B_0 = blB[0]; B_1 = blB[1]; MADD(cj,A0,B_0,C0, B_0); MADD(cj,A0,B_1,C1, B_1); - + B_0 = blB[2]; B_1 = blB[3]; MADD(cj,A0,B_0,C2, B_0); MADD(cj,A0,B_1,C3, B_1); - + blB += 4; } - res[(j2+0)*resStride + i] += alpha*C0; - res[(j2+1)*resStride + i] += alpha*C1; - res[(j2+2)*resStride + i] += alpha*C2; - res[(j2+3)*resStride + i] += alpha*C3; + res(i, j2 + 0) += alpha * C0; + res(i, j2 + 1) += alpha * C1; + res(i, j2 + 2) += alpha * C2; + res(i, j2 + 3) += alpha * C3; } } } @@ -1394,7 +1398,7 @@ void gebp_kernel RhsScalar B_0 = blB[k]; MADD(cj, A0, B_0, C0, B_0); } - res[(j2+0)*resStride + i] += alpha*C0; + res(i, j2) += alpha * C0; } } } @@ -1417,15 +1421,16 @@ void gebp_kernel // // 32 33 34 35 ... // 36 36 38 39 ... 
-template -struct gemm_pack_lhs +template +struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; @@ -1436,30 +1441,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; - + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 : Pack2>1 ? (rows/Pack2)*Pack2 : 0; - + Index i=0; - + // Pack 3 packets if(Pack1>=3*PacketSize) { for(; i(&lhs(i+0*PacketSize, k)); - B = ploadu(&lhs(i+1*PacketSize, k)); - C = ploadu(&lhs(i+2*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); + C = lhs.loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1473,12 +1477,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(&lhs(i+0*PacketSize, k)); - B = ploadu(&lhs(i+1*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1491,11 +1495,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(&lhs(i+0*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } @@ -1508,11 +1512,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_lhs +template +struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; @@ -1543,13 +1548,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 
0; - + // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - + int pack = Pack1; Index i = 0; while(pack>0) @@ -1569,7 +1573,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } @@ -1594,15 +1598,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_rhs +template +struct gemm_pack_rhs { typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1685,27 +1690,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) { for(Index j2=packet_cols8; j2 kernel; - kernel.packet[0] = ploadu(&b0[k]); - kernel.packet[1] = ploadu(&b1[k]); - kernel.packet[2] = ploadu(&b2[k]); - kernel.packet[3] = ploadu(&b3[k]); + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1] = dm1.loadPacket(k); + kernel.packet[2] = dm2.loadPacket(k); + kernel.packet[3] = dm3.loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); @@ -1716,10 +1721,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs -struct gemm_pack_rhs +template +struct gemm_pack_rhs { typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1762,7 +1768,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - + // if(nr>=8) // { // for(Index j2=0; j2(&rhs[k*rhsStride + j2]); + Packet A = rhs.loadPacket(k, j2); pstoreu(blockB+count, cj.pconj(A)); count += PacketSize; } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - blockB[count+2] = cj(b0[2]); - blockB[count+3] = cj(b0[3]); + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); count += 4; } } @@ -1825,10 +1831,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs::ReturnType ResScal static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* res, Index resStride, + ResScalar* _res, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) { - const_blas_data_mapper lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gebp_kernel gebp; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; #ifdef EIGEN_HAS_OPENMP if(info) @@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth, // In order to reduce the chance that a thread has to wait for the other, // let's start by packing B'. - pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc); + pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc); // Pack A_k to A' in a parallel fashion: // each thread packs the sub block A_k,i to A'_i where i is the thread id. @@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth, // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; - - pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length); + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; @@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth, // At this point we have to make sure that A'_i has been updated by the thread i, // we use testAndSetOrdered to mimic a volatile access. // However, no need to wait for the B' part which has been updated by the current thread! 
- if(shift>0) - while(info[i].sync!=k) {} - gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); + if (shift>0) { + while(info[i].sync!=k) { + } + } + + gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } // Then keep going as usual with the remaining B' @@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth, const Index actual_nc = (std::min)(j+nc,cols)-j; // pack B_k,j to B' - pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc); + pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc); // C_j += A' * B' - gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha); + gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha); } // Release all the sub blocks A'_i of A' for the current thread, @@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); // For each horizontal panel of the rhs, and corresponding panel of the lhs... - for(Index k2=0; k2 Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) - // Note that this panel will be read as many times as the number of blocks in the rhs's - // horizontal panel which is, in practice, a very low number. - pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows); + const Index actual_mc = (std::min)(i2+mc,rows)-i2; - // For each kc x nc block of the rhs's horizontal panel... - for(Index j2=0; j2 Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc); + + // For each kc x nc block of the rhs's horizontal panel... 
+ for(Index j2=0; j2m_nc; computeProductBlockingSizes(this->m_kc, this->m_mc, n); } - + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 225b994d1..daa8a1d8a 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -58,13 +58,17 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) { - const_blas_data_mapper lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction @@ -75,10 +79,10 @@ struct general_matrix_matrix_triangular_product pack_lhs; - gemm_pack_rhs pack_rhs; - gebp_kernel gebp; + + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; tribb_kernel sybb; for(Index k2=0; k2 processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); + - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); } } } @@ -129,13 +134,16 @@ struct tribb_kernel { typedef gebp_traits Traits; typedef typename Traits::ResScalar ResScalar; - + enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - gebp_kernel gebp_kernel; + typedef blas_data_mapper ResMapper; + ResMapper res(_res, resStride); + gebp_kernel gebp_kernel; + Matrix buffer; // let's process the block per panel of actual_mc x BlockSize, @@ -146,7 +154,7 @@ struct tribb_kernel const RhsScalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, -1, -1, 0, 0); // selfadjoint 
micro block @@ -154,12 +162,12 @@ struct tribb_kernel Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1 lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper LhsTransposeMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction @@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; + gebp_kernel gebp_kernel; symm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_rhs pack_rhs; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2 transposed packed copy @@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix() - (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs() + (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -414,15 +420,18 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix lhs(_lhs,lhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction @@ -432,8 +441,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2& blocking) { // strip zeros @@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? 
k2>0 : k2 skip it @@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() - (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix& blocking) { // strip zeros @@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gemm_pack_rhs pack_rhs_panel; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k20; @@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix& blocking) { Index cols = otherSize; - const_blas_data_mapper tri(_tri,triStride); - blas_data_mapper other(_other,otherStride); + + typedef const_blas_data_mapper TriMapper; + typedef blas_data_mapper OtherMapper; + TriMapper tri(_tri, triStride); + OtherMapper other(_other, otherStride); typedef gebp_traits Traits; + enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower @@ -71,9 +75,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; - gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache // coherence when accessing the rhs elements @@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; - pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { - pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? 
k2 : k2-kc), actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); + gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); } } } @@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix& blocking) { Index rows = otherSize; - const_blas_data_mapper rhs(_tri,triStride); - blas_data_mapper lhs(_other,otherStride); + + typedef blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + LhsMapper lhs(_other, otherStride); + RhsMapper rhs(_tri, triStride); typedef gebp_traits Traits; enum { @@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; - gebp_kernel gebp_kernel; - gemm_pack_rhs pack_rhs; - gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gebp_kernel gebp_kernel; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k20) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs); // triangular packing (we only pack the panels off the diagonal, // neglecting the blocks overlapping the diagonal @@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { - gebp_kernel(&lhs(i2,absolute_j2), otherStride, + gebp_kernel(lhs.getSubMapper(i2,absolute_j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, Scalar(-1), @@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) - gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 0d8e2705a..25a62d528 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -18,13 +18,13 @@ namespace Eigen { namespace internal { // forward declarations -template +template struct gebp_kernel; -template +template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -117,32 +117,96 @@ template struct get_factor::R static EIGEN_STRONG_INLINE typename NumTraits::Real run(const Scalar& x) { return numext::real(x); } }; + +template +class MatrixLinearMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt(m_data + i); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt(m_data + i); + } + + EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + pstoret(m_data + i, p); + } + + protected: + Scalar *m_data; +}; + // Lightweight helper class to access matrix coefficients. 
-// Yes, this is somehow redundant with Map<>, but this version is much much lighter, -// and so I hope better compilation performance (time and code quality). -template -class blas_data_mapper -{ +template +class blas_data_mapper { public: - blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j) - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef MatrixLinearMapper LinearMapper; + + EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + + EIGEN_ALWAYS_INLINE blas_data_mapper + getSubMapper(Index i, Index j) const { + return blas_data_mapper(&operator()(i, j), m_stride); + } + + EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j)); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + + template + EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const { + pscatter(&operator()(i, j), p, m_stride); + } + + template + EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather(&operator()(i, j), m_stride); + } + protected: - Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; }; // lightweight helper class to access matrix coefficients (const version) template -class const_blas_data_mapper -{ +class const_blas_data_mapper : public blas_data_mapper { public: - const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const - { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - protected: - const Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper(data, stride) {} + + EIGEN_ALWAYS_INLINE const_blas_data_mapper getSubMapper(Index i, Index j) const { + return const_blas_data_mapper(&(this->operator()(i, j)), this->m_stride); + } }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 5d8913dd8..75423f516 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -48,7 +48,7 @@ if(MPFR_FOUND) include_directories(${MPFR_INCLUDES} ./mpreal) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) - ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) +# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() -- cgit v1.2.3 From 95a430a2ca8489a85d0a12ffa66d260011c11745 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:45:19 -0700 Subject: Vector primitives for CUDA --- Eigen/Core | 5 + Eigen/src/Core/arch/CUDA/MathFunctions.h | 75 +++++++++ Eigen/src/Core/arch/CUDA/PacketMath.h | 260 +++++++++++++++++++++++++++++++ 3 files changed, 340 insertions(+) create mode 100644 Eigen/src/Core/arch/CUDA/MathFunctions.h create mode 100644 Eigen/src/Core/arch/CUDA/PacketMath.h (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index 776b7faf3..537ac16b2 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -294,6 +294,11 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/Complex.h" #endif +#if defined EIGEN_VECTORIZE_CUDA + #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/MathFunctions.h" +#endif + #include "src/Core/arch/Default/Settings.h" #include "src/Core/functors/BinaryFunctors.h" diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h new file mode 100644 index 000000000..e7305c01e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -0,0 +1,75 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> EIGEN_STRONG_INLINE +float4 plog(const float4& a) +{ + return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 plog(const double2& a) +{ + return make_double2(log(a.x), log(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 pexp(const float4& a) +{ + return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 pexp(const double2& a) +{ + return make_double2(exp(a.x), exp(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 psqrt(const float4& a) +{ + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 psqrt(const double2& a) +{ + return make_double2(sqrt(a.x), sqrt(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 prsqrt(const float4& a) +{ + return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 prsqrt(const double2& a) +{ + return make_double2(rsqrt(a.x), rsqrt(a.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 000000000..5b0abe2e6 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,260 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
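// Illustrative usage sketch, not part of this patch: once the float4/double2
// primitives guarded below are available, device code can drive them through
// the generic packet API. This grid-stride kernel scales an array four floats
// at a time; it assumes nvcc, a 16-byte aligned "data" pointer and a "size"
// that is a multiple of 4. The kernel name is hypothetical.
__global__ void packet_scale_kernel(float* data, int size, float factor)
{
  const float4 f = Eigen::internal::pset1<float4>(factor);
  const int stride = blockDim.x * gridDim.x * 4;
  for (int i = (blockIdx.x * blockDim.x + threadIdx.x) * 4; i < size; i += stride) {
    const float4 p = Eigen::internal::pload<float4>(data + i);      // aligned packet load
    Eigen::internal::pstore(data + i, Eigen::internal::pmul(p, f)); // data[i..i+3] *= factor
  }
}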
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + + +template<> struct packet_traits : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +#ifdef __CUDA_ARCH__ +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { + return __ldg((const float4*)from); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { + return __ldg((const double2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { + return make_double2(__ldg(from+0), __ldg(from+1)); +} +#endif + +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, int stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, int stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + 
+ tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H -- cgit v1.2.3 From bbce6fa65d8a196f05e0428d014e0e3865e202f3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:55:35 -0700 Subject: define EIGEN_VECTORIZE_CUDA when compiling with nvcc --- Eigen/Core | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index 537ac16b2..acdeca5f4 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -178,6 +178,11 @@ #endif #endif +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif -- cgit v1.2.3 From 99d75235a9567865d2c070a2840d54c8a5ad0f43 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 13 Oct 2014 17:02:09 -0700 Subject: Misc improvements and cleanups --- Eigen/src/Core/GenericPacketMath.h | 15 +- unsupported/Eigen/CXX11/Tensor | 4 + .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 5 + .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 101 ++++++++- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 35 ++++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 73 ++++--- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 20 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 36 +++- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 26 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 22 +- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 61 ++++-- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 32 +-- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_assign.cpp | 35 +++- unsupported/test/cxx11_tensor_convolution.cpp | 70 +++++++ unsupported/test/cxx11_tensor_device.cpp | 27 +++ unsupported/test/cxx11_tensor_morphing.cpp | 5 +- unsupported/test/cxx11_tensor_of_complex.cpp | 64 ++++++ unsupported/test/cxx11_tensor_thread_pool.cpp | 232 ++++++++++++++++++++- 29 files changed, 780 insertions(+), 141 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_of_complex.cpp (limited to 'Eigen') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index e6fea5bba..3ef3475c7 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -359,7 +359,7 @@ pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. 
* If LoadMode equals #Aligned, \a from must be 16 bytes aligned */ template -inline Packet ploadt(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits::type* from) { if(LoadMode == Aligned) return pload(from); @@ -370,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits::type* from) /** \internal copy the packet \a from to \a *to. * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */ template -inline void pstoret(Scalar* to, const Packet& from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) { if(LoadMode == Aligned) pstore(to, from); @@ -378,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from) pstoreu(to, from); } +/** \internal \returns a packet version of \a *from. + * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the + * hardware if available to speedup the loading of data that won't be modified + * by the current computation. + */ +template +inline Packet ploadt_ro(const typename unpacket_traits::type* from) +{ + return ploadt(from); +} + /** \internal default implementation of palign() allowing partial specialization */ template struct palign_impl diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 0dac95e45..2137f4276 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -30,6 +30,10 @@ #include #include +#ifdef EIGEN_USE_THREADS +#include +#endif + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) #include #endif diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 227522ecb..e30eb6ad8 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -66,6 +66,11 @@ template constexpr inline T& array_ template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } +template constexpr inline T& array_get(std::vector& a) { return a[I]; } +template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } +template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } + + #undef STD_GET_ARR_HACK template struct array_size; diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 4c6b95773..e45d0a3b1 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -48,7 +48,8 @@ template class array { values[2] = v3; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) { + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v1; values[1] = v2; @@ -56,7 +57,8 @@ template class array { values[3] = v4; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) { + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v1; values[1] = v2; @@ -64,6 +66,43 @@ template class array { values[3] = v4; values[4] = v5; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, 
const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES array(std::initializer_list l) { @@ -93,9 +132,11 @@ template struct type_list { struct null_type { }; -template +template struct make_type_list { - typedef typename make_type_list::type tailresult; + typedef typename make_type_list::type tailresult; typedef type_list type; }; @@ -150,6 +191,23 @@ template struct gen_numeric_list_repeated { typedef typename make_type_list, type2val, type2val, type2val, type2val >::type type; }; +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val, type2val >::type type; +}; + template struct get; @@ -174,6 +232,7 @@ template <> struct arg_prod { static const int value = 1; }; + template array repeat(t v) { array array; @@ -190,6 +249,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_l return get >::value; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { + return arg_prod::value; +}; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; @@ -201,6 +265,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { return 0; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; @@ -210,12 +282,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { return a[I]; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector& a) { + return a[I]; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector& a) { + return a[I]; +} +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; template struct array_size; template struct array_size > { static const size_t value = N; }; - +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; struct sum_op { template static inline bool run(A a, B b) { return a + 
b; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 3bfe80c9e..e973c00d3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -131,8 +131,8 @@ struct TensorEvaluator, Device> m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 27c10f64f..6018ecc66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -30,6 +30,12 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; + // Dimensions + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return derived().dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(derived().dimensions()); } + // Nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> @@ -187,7 +193,7 @@ class TensorBase } // Contractions. - typedef std::pair DimensionPair; + typedef Eigen::IndexPair DimensionPair; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorContractionOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 3b2a9c8b9..0e55d4de1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -48,7 +48,7 @@ struct nested, 1, typename eval -class TensorBroadcastingOp : public TensorBase, WriteAccessors> +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -91,7 +91,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, }; -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -141,7 +141,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -161,7 +161,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { return m_impl.template packet(inputIndex); } else { - EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type 
values[packetSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < packetSize; ++i) { values[i] = coeff(originalIndex+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4a5fd9c79..34bdd5309 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -872,11 +872,19 @@ struct TensorEvaluator + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + private: // No assignment (copies are needed by the kernels) TensorEvaluator& operator = (const TensorEvaluator&); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 75519c9f5..649bdb308 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -38,6 +38,18 @@ template class TensorDevice { return *this; } + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const DeviceType& m_device; ExpressionType& m_expression; @@ -58,6 +70,18 @@ template class TensorDevice + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const ThreadPoolDevice& m_device; ExpressionType& m_expression; @@ -79,6 +103,17 @@ template class TensorDevice return *this; } + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const GpuDevice& m_device; ExpressionType m_expression; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index fad342eab..5a6ff70e9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -37,23 +37,41 @@ struct DefaultDevice { // Multiple cpu cores // We should really use a thread pool here but first we need to find a portable thread pool library. 
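// Illustrative usage sketch, not part of this patch: how user code drives an
// expression through a device object, including the TensorDevice::operator+=
// overloads added earlier in this commit. Assumes the CXX11 Tensor module is
// included, EIGEN_USE_THREADS is defined, and both tensors have identical
// dimensions; the function name and thread count are hypothetical.
inline void example_threaded_accumulate(Eigen::Tensor<float, 3>& acc,
                                        const Eigen::Tensor<float, 3>& delta)
{
  Eigen::ThreadPoolDevice device(4);   // evaluate using 4 threads
  acc.device(device) = delta;          // plain device-side assignment
  acc.device(device) += delta * 0.5f;  // builds a coefficient-wise sum and runs it
                                       // through the multi-threaded TensorExecutor
}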
#ifdef EIGEN_USE_THREADS + +typedef std::future Future; + struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } - size_t numThreads() const { return num_threads_; } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { ::memcpy(dst, src, n); } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + template + EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const { + return std::async(std::launch::async, f, args...); + } + template + EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const { + std::async(std::launch::async, f, args...); + } + private: // todo: NUMA, ... size_t num_threads_; @@ -63,41 +81,34 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU -static int m_numMultiProcessors = 0; -static int m_maxThreadsPerBlock = 0; -static int m_maxThreadsPerMultiProcessor = 0; +static cudaDeviceProp m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess); + m_devicePropInitialized = true; + } +} static inline int getNumCudaMultiProcessors() { - if (m_numMultiProcessors == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_numMultiProcessors = deviceProp.multiProcessorCount; - } - return m_numMultiProcessors; + initializeDeviceProp(); + return m_deviceProperties.multiProcessorCount; } static inline int maxCudaThreadsPerBlock() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - } - return m_maxThreadsPerBlock; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerBlock; } static inline int maxCudaThreadsPerMultiProcessor() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - } - return m_maxThreadsPerMultiProcessor; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerMultiProcessor; +} +static inline int sharedMemPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.sharedMemPerBlock; } + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } @@ -141,8 +152,8 @@ struct GpuDevice { #endif } - EIGEN_STRONG_INLINE size_t numThreads() const { - // Fixme: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME return 32; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 732c6b344..2dd8e274b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,7 +29,7 @@ namespace Eigen { * \sa Tensor */ -// Can't use std::pairs on cuda devices +// Can't use std::pair on cuda devices template struct IndexPair { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 587cbd5ca..ce9d73578 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { m_buffer[i] = m_impl.coeff(i); } - EIGEN_STRONG_INLINE void evalPacket(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 0f969036c..e324ba8d2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -65,13 +65,13 @@ struct TensorEvaluator return m_data[index]; } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_data + index); } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const Packet& x) { return internal::pstoret(m_data + index, x); @@ -113,13 +113,17 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else return m_data[index]; +#endif } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_data + index); + return internal::ploadt_ro(m_data + index); } const Scalar* data() const { return m_data; } @@ -166,7 +170,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(index); } @@ -219,7 +223,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); } @@ -278,7 +282,7 @@ struct TensorEvaluator - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); } @@ -340,7 +344,7 @@ struct 
TensorEvaluator return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); } template - PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { static const int PacketSize = internal::unpacket_traits::size; internal::Selector select; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 10f5a5ee7..01fa04c64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H -#ifdef EIGEN_USE_THREADS -#include -#endif - namespace Eigen { /** \class TensorExecutor @@ -62,7 +58,7 @@ class TensorExecutor { const Index size = array_prod(evaluator.dimensions()); static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; + const Index VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); @@ -131,10 +127,10 @@ class TensorExecutor const Index numblocks = size / blocksize; Index i = 0; - std::vector > results; + std::vector results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); + results.push_back(device.enqueue(&EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -154,11 +150,31 @@ class TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template -__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) { +__global__ void +__launch_bounds__(1024) +EigenMetaKernel(Evaluator eval, unsigned int size) { + const int first_index = blockIdx.x * blockDim.x + threadIdx.x; const int step_size = blockDim.x * gridDim.x; - for (int i = first_index; i < size; i += step_size) { - eval.evalScalar(i); + + if (!Evaluator::PacketAccess || !Evaluator::IsAligned) { + // Use the scalar path + for (int i = first_index; i < size; i += step_size) { + eval.evalScalar(i); + } + } + else { + // Use the vector path + const int PacketSize = unpacket_traits::size; + const int vectorized_step_size = step_size * PacketSize; + const int vectorized_size = (size / PacketSize) * PacketSize; + int i = first_index * PacketSize; + for ( ; i < vectorized_size; i += vectorized_step_size) { + eval.evalPacket(i); + } + for ( ; i < size; i += step_size) { + eval.evalScalar(i); + } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 4d7f9e1fd..a753c5a48 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief The fixed sized version of the tensor class. 
* - * The fixes sized equivalent of + * The fixed sized equivalent of * Eigen::Tensor t(3, 5, 7); * is * Eigen::TensorFixedSize> t; @@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase::size > 1), }; typedef Dimensions_ Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index cf97031be..2714117ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -31,30 +31,34 @@ namespace internal { template struct TensorIntDivisor { public: - TensorIntDivisor() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { multiplier = 0; shift1 = 0; shift2 = 0; } // Must have 1 <= divider <= 2^31-1 - TensorIntDivisor(const T divider) { - static const int N = 32; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = 32; eigen_assert(divider > 0); eigen_assert(divider <= (1<<(N-1)) - 1); // fast ln2 +#ifndef __CUDA_ARCH__ const int leading_zeros = __builtin_clz(divider); - const int l = N - (leading_zeros+1); - - multiplier = (static_cast(1) << (N+l)) / divider - (static_cast(1) << N) + 1; - shift1 = (std::min)(1, l); - shift2 = (std::max)(0, l-1); +#else + const int leading_zeros = __clz(divider); +#endif + const int log_div = N - (leading_zeros+1); + + multiplier = (static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1; + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; } // Must have 0 <= numerator <= 2^32-1 - T divide(const T numerator) const { - static const int N = 32; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + const int N = 32; eigen_assert(numerator >= 0); eigen_assert(numerator <= (1ull< -static T operator / (const T& numerator, const TensorIntDivisor& divisor) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { return divisor.divide(numerator); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 04849dd9f..2c0d2cd0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -42,26 +42,25 @@ template class TensorMap : public Tensor static const int Options = Options_; - static const std::size_t NumIndices = PlainObjectType::NumIndices; + static const Index NumIndices = PlainObjectType::NumIndices; typedef typename PlainObjectType::Dimensions Dimensions; - enum { - IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned), - PacketAccess = true, + IsAligned = ((int(Options_)&Aligned)==Aligned), + PacketAccess = (internal::packet_traits::size > 1), }; #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
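// (The assert below is relaxed in this commit to also accept NumIndices == Dynamic.)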
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif @@ -176,12 +175,13 @@ template class TensorMap : public Tensor template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 7da89458f..8da6e0f26 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -144,7 +144,7 @@ struct TensorEvaluator, Device template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -206,7 +206,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f7e7fc107..7e0063626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; enum { - IsAligned = true, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), }; @@ -194,7 +194,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; enum { - IsAligned = true, + IsAligned = false, PacketAccess = 
(internal::packet_traits::size > 1), }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 0c4f8a3d6..aaec39756 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -30,11 +30,11 @@ namespace Eigen { * * \sa Tensor */ -template class TensorStorage; +template class TensorStorage; // Pure fixed-size storage -template +template class TensorStorage { private: @@ -62,7 +62,7 @@ class TensorStorage // pure-dynamic, but without specification of all dimensions explicitly -template +template class TensorStorage : public TensorStorage::type> { @@ -79,7 +79,7 @@ class TensorStorage }; // pure dynamic -template +template class TensorStorage::type> { T *m_data; @@ -140,6 +140,7 @@ class TensorStorage, 1, typename eval -class TensorStridingOp : public TensorBase, WriteAccessors> +class TensorStridingOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -109,28 +109,23 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } else { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - } - } - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] *= op.strides()[i]; + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; } - // typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -150,16 +145,44 @@ struct TensorEvaluator, Device> return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + 
inputIndices[1] += indices[1] * m_inputStrides[0]; + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } Scalar* data() const { return NULL; } protected: - // Strides m_strides; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 40f805741..5940a8cf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -70,14 +70,18 @@ struct traits > }; -template -struct traits > +template +struct traits > : public traits { typedef traits BaseTraits; typedef typename BaseTraits::Scalar Scalar; typedef typename BaseTraits::StorageKind StorageKind; typedef typename BaseTraits::Index Index; + enum { + Options = Options_, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; }; @@ -105,16 +109,16 @@ struct eval, Eigen::Dense> typedef const TensorFixedSize& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; template @@ -141,16 +145,16 @@ struct nested, 1, typename e typedef const TensorFixedSize& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; } // end namespace internal diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d6c435947..a7ef2b402 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,6 +110,7 @@ if(EIGEN_TEST_CXX11) # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") ei_add_test(cxx11_tensor_const "-std=c++0x") ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_of_complex "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index f2b126413..0ac3f9bf9 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -253,6 +253,39 @@ static void test_auto_resize() } +static void test_compound_assign() +{ + Tensor start_tensor(10); + Tensor offset_tensor(10); + start_tensor.setRandom(); + offset_tensor.setRandom(); + + Tensor tensor = start_tensor; + tensor += offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i)); + } + + tensor = start_tensor; + tensor -= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i)); + } + + tensor 
= start_tensor; + tensor *= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i)); + } + + tensor = start_tensor; + tensor /= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i)); + } +} + + void test_cxx11_tensor_assign() { CALL_SUBTEST(test_1d()); @@ -260,5 +293,5 @@ void test_cxx11_tensor_assign() CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_same_type()); CALL_SUBTEST(test_auto_resize()); - + CALL_SUBTEST(test_compound_assign()); } diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index bafe73edd..4672db463 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -64,8 +64,78 @@ static void test_expr() } +static void test_modes() { + Tensor input(3); + Tensor kernel(3); + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + const Eigen::array dims{{0}}; + Eigen::array, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(0, 0); + Tensor valid(1); + valid = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(1, 1); + Tensor same(3); + same = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
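// (FULL pads with kernel_size - 1 = 2 zeros on each side, so the result has
//  input_size + kernel_size - 1 = 5 coefficients.)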
+ padding[0] = std::make_pair(2, 2); + Tensor full(5); + full = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); +} + + +static void test_strides() { + Tensor input(13); + Tensor kernel(3); + input.setRandom(); + kernel.setRandom(); + + const Eigen::array dims{{0}}; + const Eigen::array stride_of_3{{3}}; + const Eigen::array stride_of_2{{2}}; + + Tensor result; + result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + + + + void test_cxx11_tensor_convolution() { CALL_SUBTEST(test_evals()); CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_modes()); + CALL_SUBTEST(test_strides()); } diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index f331cb481..26465ee11 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -123,6 +123,14 @@ static void test_forced_contextual_eval(Context* context) context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } +template +static void test_compound_assignment(Context* context) +{ + context->out().device(context->device()) = context->in1().constant(2.718f); + context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; +} + + template static void test_contraction(Context* context) { @@ -197,6 +205,15 @@ static void test_cpu() { } } + test_compound_assignment(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + } + } + } + test_contraction(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { @@ -299,6 +316,16 @@ static void test_gpu() { } } + test_compound_assignment(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + } + } + } + test_contraction(&context); assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); for (int i = 0; i < 40; ++i) { diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 2a6a97856..fd1b1fa32 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -12,6 +12,7 @@ #include using Eigen::Tensor; +using Eigen::IndexPair; static void test_simple_reshape() { @@ -52,7 +53,7 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; + Eigen::array, 1> contract_along{{IndexPair(1, 0)}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -125,7 +126,7 @@ static void 
test_slice_in_expr() { TensorMap> tensor1(m1.data(), 7, 7); TensorMap> tensor2(m2.data(), 3, 3); Tensor tensor3(3,1); - array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; + array, 1> contract_along{{IndexPair(1, 0)}}; Eigen::DSizes indices1(1,2); Eigen::DSizes sizes1(3,3); diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp new file mode 100644 index 000000000..b5044b962 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -0,0 +1,64 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::TensorMap; + + + +static void test_additions() +{ + Tensor, 1> data1(3); + Tensor, 1> data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = std::complex(i, -i); + data2(i) = std::complex(i, 7 * i); + } + + Tensor, 1> sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + VERIFY_IS_EQUAL(sum(i), std::complex(2*i, 6*i)); + } +} + + +static void test_contractions() +{ + Tensor, 4> t_left(30, 50, 8, 31); + Tensor, 5> t_right(8, 31, 7, 20, 10); + Tensor, 5> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map, Dynamic, Dynamic>> MapXcf; + MapXcf m_left(t_left.data(), 1500, 248); + MapXcf m_right(t_right.data(), 248, 1400); + Matrix, Dynamic, Dynamic> m_result(1500, 1400); + + // This contraction should be equivalent to a regular matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + +void test_cxx11_tensor_of_complex() +{ + CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_contractions()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e02d8e4be..f0de61f8b 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -9,22 +9,23 @@ #define EIGEN_USE_THREADS - +#include #include "main.h" #include + using Eigen::Tensor; -void test_cxx11_tensor_thread_pool() +static void test_multithread_elementwise() { - Eigen::Tensor in1(2,3,7); - Eigen::Tensor in2(2,3,7); - Eigen::Tensor out(2,3,7); + Tensor in1(2,3,7); + Tensor in2(2,3,7); + Tensor out(2,3,7); in1.setRandom(); in2.setRandom(); - Eigen::ThreadPoolDevice thread_pool_device(3); + Eigen::ThreadPoolDevice thread_pool_device(internal::random(3, 11)); out.device(thread_pool_device) = in1 + in2 * 3.14f; for (int i = 0; i < 2; ++i) { @@ -35,3 +36,222 @@ void test_cxx11_tensor_thread_pool() } } } + + +static void test_multithread_compound_assignment() +{ + Tensor in1(2,3,7); + Tensor in2(2,3,7); + Tensor out(2,3,7); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random(3, 11)); + out.device(thread_pool_device) = in1; + out.device(thread_pool_device) += in2 * 3.14f; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + } + } + } +} + + +static void 
test_multithread_contraction() +{ + Tensor t_left(30, 50, 37, 31); + Tensor t_right(37, 31, 70, 2, 10); + Tensor t_result(30, 50, 70, 2, 10); + + t_left.setRandom(); + t_right.setRandom(); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + + + typedef Map MapXf; + MapXf m_left(t_left.data(), 1500, 1147); + MapXf m_right(t_right.data(), 1147, 1400); + MatrixXf m_result(1500, 1400); + + Eigen::ThreadPoolDevice thread_pool_device(4); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + + +static void test_contraction_corner_cases() +{ + Tensor t_left(32, 500); + Tensor t_right(32, 28*28); + Tensor t_result(500, 28*28); + + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result = t_result.constant(NAN); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims{{DimPair(0, 0)}}; + + typedef Map MapXf; + MapXf m_left(t_left.data(), 32, 500); + MapXf m_right(t_right.data(), 32, 28*28); + MatrixXf m_result(500, 28*28); + + Eigen::ThreadPoolDevice thread_pool_device(12); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left.transpose() * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_result.resize (1, 28*28); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 500); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (500, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 500); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + 
t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (1, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + + +static void test_multithread_contraction_agrees_with_singlethread() { + int contract_size = internal::random(1, 5000); + + Tensor left(internal::random(1, 80), + contract_size, + internal::random(1, 100)); + + Tensor right(internal::random(1, 25), + internal::random(1, 37), + contract_size, + internal::random(1, 51)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(1, 2)}}); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random(2, 11)); + + Tensor st_result; + st_result = left.contract(right, dims); + + Tensor tp_result(st_result.dimensions()); + tp_result.device(thread_pool_device) = left.contract(right, dims); + + VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // if both of the values are very small, then do nothing (because the test will fail + // due to numerical precision issues when values are small) + if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + + +static void test_memcpy() { + + for (int i = 0; i < 5; ++i) { + const int num_threads = internal::random(3, 11); + Eigen::ThreadPoolDevice thread_pool_device(num_threads); + + const int size = internal::random(13, 7632); + Tensor t1(size); + t1.setRandom(); + std::vector result(size); + thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float)); + for (int i = 0; i < size; i++) { + VERIFY_IS_EQUAL(t1(i), result[i]); + } + } +} + + +void test_cxx11_tensor_thread_pool() +{ + CALL_SUBTEST(test_multithread_elementwise()); + CALL_SUBTEST(test_multithread_compound_assignment()); + + CALL_SUBTEST(test_multithread_contraction()); + + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + + // Exercise various cases that have been problematic in the past. 
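Outside the test harness, the ThreadPoolDevice interface exercised above boils down to the sketch below (illustrative only; the thread count and tensor sizes are arbitrary, and it assumes EIGEN_USE_THREADS and the includes used at the top of this file).

static void thread_pool_device_sketch()
{
  Eigen::ThreadPoolDevice device(4);               // 4 worker threads

  Eigen::Tensor<float, 1> in(128), out(128);
  in.setRandom();

  // Evaluate an element-wise expression on the thread pool.
  out.device(device) = in * 2.0f + in.constant(1.0f);

  // The device also exposes a raw memcpy entry point, as used by test_memcpy() above.
  std::vector<float> copy(128);
  device.memcpy(copy.data(), in.data(), 128 * sizeof(float));
}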
+ CALL_SUBTEST(test_contraction_corner_cases()); + + CALL_SUBTEST(test_memcpy()); +} -- cgit v1.2.3 From bfdd9f3ac95d9a2b41e6f2ec1f7434331125b9e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Oct 2014 15:32:59 -0700 Subject: Made the blocking computation aware of the l3 cache Also optimized the blocking parameters to take into account the number of threads used for a computation --- Eigen/src/Core/SolveTriangular.h | 2 +- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 124 ++++++++++++++------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 +-- .../Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/Parallelizer.h | 4 +- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 6 +- Eigen/src/Core/products/TriangularMatrixMatrix.h | 2 +- Eigen/src/Core/products/TriangularSolverMatrix.h | 4 +- blas/level3_impl.h | 12 +- test/product_large.cpp | 7 +- unsupported/Eigen/CXX11/Tensor | 2 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 13 +-- 13 files changed, 117 insertions(+), 79 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index ef17f288e..e158e3162 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 090c8f4e6..b91786037 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff } /** \internal */ -inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { - static std::ptrdiff_t m_l1CacheSize = 0; - static std::ptrdiff_t m_l2CacheSize = 0; - if(m_l2CacheSize==0) + static bool m_cache_sizes_initialized = false; + static std::ptrdiff_t m_l1CacheSize = 32*1024; + static std::ptrdiff_t m_l2CacheSize = 256*1024; + static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + + if(!m_cache_sizes_initialized) { - m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); - m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_cache_sizes_initialized = true; } - + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); m_l1CacheSize = *l1; m_l2CacheSize = *l2; + m_l3CacheSize = *l3; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); *l1 = m_l1CacheSize; *l2 = m_l2CacheSize; + *l3 = m_l3CacheSize; } else { @@ -70,10 +79,11 @@ inline void 
manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ +#define CEIL(a, b) ((a)+(b)-1)/(b) + template -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - EIGEN_UNUSED_VARIABLE(n); // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed @@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) // at the register level. For vectorization purpose, these small vertical panels are unpacked, // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. - std::ptrdiff_t l1, l2; - - typedef gebp_traits Traits; - enum { - kdiv = KcFactor * 2 * Traits::nr - * Traits::RhsProgress * sizeof(RhsScalar), - mr = gebp_traits::mr, - mr_mask = (0xffffffff/mr)*mr - }; + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + + if (num_threads > 1) { + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + k_mask = (0xffffffff/8)*8, + + mr = Traits::mr, + mr_mask = (0xffffffff/mr)*mr, + + nr = Traits::nr, + nr_mask = (0xffffffff/nr)*nr + }; + SizeType k_cache = (l1-ksub)/kdiv; + if (k_cache < k) { + k = k_cache & k_mask; + eigen_assert(k > 0); + } - manage_caching_sizes(GetAction, &l1, &l2); + SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + SizeType n_per_thread = CEIL(n, num_threads); + if (n_cache <= n_per_thread) { + // Don't exceed the capacity of the l2 cache. + eigen_assert(n_cache >= static_cast(nr)); + n = n_cache & nr_mask; + eigen_assert(n > 0); + } else { + n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + } -// k = std::min(k, l1/kdiv); -// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; -// if(_m l2) { + // l3 is shared between all cores, so we'll give each thread its own chunk of l3. + SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + SizeType m_per_thread = CEIL(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { + m = m_cache & mr_mask; + eigen_assert(m > 0); + } else { + m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + } + } + } + else { + // In unit tests we do not want to use extra large matrices, + // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS -// k = std::min(k,240); -// n = std::min(n,3840/sizeof(RhsScalar)); -// m = std::min(m,3840/sizeof(RhsScalar)); - - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + k = std::min(k,sizeof(LhsScalar)<=4 ? 
360 : 240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); #else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); #endif + } } template -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - computeProductBlockingSizes(k, m, n); + computeProductBlockingSizes(k, m, n, num_threads); } #ifdef EIGEN_HAS_FUSE_CJMADD @@ -1846,8 +1884,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhsm_mc = ActualRows; this->m_nc = ActualCols; @@ -331,21 +331,21 @@ class gemm_blocking_spacem_mc = Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) { - DenseIndex m = this->m_mc; - computeProductBlockingSizes(this->m_kc, m, this->m_nc); + computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc, num_threads); } - else // full columns + else // no l3 blocking { + DenseIndex m = this->m_mc; DenseIndex n = this->m_nc; - computeProductBlockingSizes(this->m_kc, this->m_mc, n); + computeProductBlockingSizes(this->m_kc, m, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; @@ -451,7 +451,7 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index daa8a1d8a..8de39f76f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -72,7 +72,7 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 4079063eb..837e69415 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index d9e6084c3..21f8175d2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -343,7 +343,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // kc must smaller than mc kc = (std::min)(kc,mc); @@ -432,10 +432,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 77aa3e5ee..4cbb79da0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -412,7 +412,7 @@ struct TriangularProduct Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix0 ? 
l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max((subcols/Traits::nr)*Traits::nr, Traits::nr); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..37a803ced 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal else matrix(c, *m, *n, *ldc) *= beta; } - internal::gemm_blocking_space blocking(*m,*n,*k,true); + internal::gemm_blocking_space blocking(*m,*n,*k,1,true); int code = OP(*opa) | (OP(*opb) << 2); func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0); @@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, a, *lda, b, *ldb, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*n, *m, a, *lda, b, *ldb, blocking); } @@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); } return 1; @@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda(10000,20000); - std::ptrdiff_t l2 = internal::random(1000000,2000000); - setCpuCacheSizes(l1,l2); + std::ptrdiff_t l2 = internal::random(100000,200000); + std::ptrdiff_t l3 = internal::random(1000000,2000000); + setCpuCacheSizes(l1,l2,l3); VERIFY(l1==l1CacheSize()); VERIFY(l2==l2CacheSize()); std::ptrdiff_t k1 = internal::random(10,100)*16; std::ptrdiff_t m1 = internal::random(10,100)*16; std::ptrdiff_t n1 = internal::random(10,100)*16; // only makes sure it compiles fine - internal::computeProductBlockingSizes(k1,m1,n1); + internal::computeProductBlockingSizes(k1,m1,n1,1); } { diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7ec60044e..47447f446 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -55,7 +55,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" -//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1e6f276e0..cd992daab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -766,7 +766,7 @@ struct TensorEvaluator BlockingType; // Sizes of the blocks to load in cache. See the Goto paper for details. 
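Blocking parameters are now derived from all three cache levels and from the number of threads that will share them. A minimal sketch of the updated entry points, mirroring the product_large test above (the concrete sizes are arbitrary):

#include <Eigen/Core>

void cache_aware_blocking_sketch()
{
  // Caches are now described by an L1/L2/L3 triple.
  std::ptrdiff_t l1 = 32 * 1024, l2 = 256 * 1024, l3 = 4 * 1024 * 1024;
  Eigen::setCpuCacheSizes(l1, l2, l3);

  // Ask for blocking sizes for a float GEMM of the given dimensions; the last
  // argument is the number of threads the product will be split across, so
  // each thread gets its own slice of the shared L3 cache.
  std::ptrdiff_t k = 2048, m = 2048, n = 2048;
  Eigen::internal::computeProductBlockingSizes<float, float>(k, m, n, 1);
  // k, m and n now hold the panel depth and block dimensions to use.
}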
- BlockingType blocking(m, n, k, true); + BlockingType blocking(m, n, k, 1, true); const Index kc = blocking.kc(); const Index mc = (std::min)(m, blocking.mc()); const Index nc = (std::min)(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index dc0513305..8e4c7c11d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -152,7 +152,7 @@ struct TensorEvaluator(kc, mc, nc/*, num_threads*/); + internal::computeProductBlockingSizes(kc, mc, nc, num_threads); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); @@ -197,9 +197,10 @@ struct TensorEvaluator kernel_promises(num_kernel_promises, p); + std::vector kernel_promises(num_kernel_promises); + for (int i = 0; i < kernel_promises.size(); ++i) { + kernel_promises[i].set_value(); + } for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { const Index k_start = k_block_idx * kc; @@ -275,8 +276,7 @@ struct TensorEvaluator) Func; - this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); } } } @@ -338,7 +338,6 @@ struct TensorEvaluator Date: Thu, 30 Oct 2014 17:52:32 -0700 Subject: Added missing packet primitives for CUDA. --- Eigen/src/Core/arch/CUDA/PacketMath.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 5b0abe2e6..7b481d512 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, c to[stride*1] = from.y; } +template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { + return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { + return make_double2(abs(a.x), abs(a.y)); +} + + template<> EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { double tmp = kernel.packet[0].y; -- cgit v1.2.3 From bc99c5f7db8d4d7e41e5e4358170e99a1bf9d364 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 18:09:53 -0700 Subject: fixed some potential alignment issues. --- Eigen/src/Core/util/Macros.h | 4 +++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 8fdd7d898..001907a0b 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -297,7 +297,9 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. 
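The alignment fix in this commit comes down to one rule: internal::pstore performs an aligned store, so a stack scratch buffer handed to it must carry an explicit alignment attribute. A hedged sketch of the pattern (the packet type is a template parameter here for illustration):

template <typename Packet>
void store_to_scratch(const Packet& p)
{
  typedef typename Eigen::internal::unpacket_traits<Packet>::type Scalar;
  // EIGEN_ALIGN_DEFAULT requests the default packet alignment (and, with the
  // Macros.h hunk below, maps to __align__ when compiling with nvcc).
  EIGEN_ALIGN_DEFAULT Scalar values[Eigen::internal::unpacket_traits<Packet>::size];
  Eigen::internal::pstore(values, p);   // aligned store; pstoreu would be the unaligned variant
  // ... consume values[0 .. size-1] ...
}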
*/ -#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION) +#if (defined __CUDACC__) +#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION) #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif (defined _MSC_VER) #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3447592eb..33849ed3e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -459,7 +459,7 @@ struct TensorEvaluator, Device> this->m_impl.template writePacket(inputIndices[0], x); } else { - CoeffReturnType values[packetSize]; + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; -- cgit v1.2.3 From 2dde63499c4ef836a0d9dfd443494d863ad62b16 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Oct 2014 16:33:51 -0700 Subject: Generalized the matrix vector product code. --- Eigen/src/Core/GeneralProduct.h | 32 +-- Eigen/src/Core/products/GeneralMatrixVector.h | 246 ++++++++++++----------- Eigen/src/Core/products/TriangularMatrixVector.h | 46 +++-- Eigen/src/Core/products/TriangularSolverVector.h | 24 ++- Eigen/src/Core/util/BlasUtil.h | 47 ++++- 5 files changed, 228 insertions(+), 167 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 7179eb124..9d3d5562c 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -11,7 +11,7 @@ #ifndef EIGEN_GENERAL_PRODUCT_H #define EIGEN_GENERAL_PRODUCT_H -namespace Eigen { +namespace Eigen { /** \class GeneralProduct * \ingroup Core_Module @@ -257,7 +257,7 @@ class GeneralProduct : public ProductBase, Lhs, Rhs> { template struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; - + public: EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) @@ -266,7 +266,7 @@ class GeneralProduct EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) } - + struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; @@ -277,12 +277,12 @@ class GeneralProduct dst.const_cast_derived() += m_scale * src; } }; - + template inline void evalTo(Dest& dest) const { internal::outer_product_selector_run(*this, dest, set(), IsRowMajor()); } - + template inline void addTo(Dest& dest) const { internal::outer_product_selector_run(*this, dest, add(), IsRowMajor()); @@ -436,12 +436,12 @@ template<> struct gemv_selector bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), evalToDest ? 
dest.data() : static_dest.data()); - + if(!evalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN @@ -457,11 +457,13 @@ template<> struct gemv_selector MappedDest(actualDestPtr, dest.size()) = dest; } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; general_matrix_vector_product - ::run( + ::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhs.data(), actualRhs.innerStride(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), actualDestPtr, 1, compatibleAlpha); @@ -516,11 +518,13 @@ template<> struct gemv_selector Map(actualRhsPtr, actualRhs.size()) = actualRhs; } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; general_matrix_vector_product - ::run( + ::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhsPtr, 1, + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhsPtr, 1), dest.data(), dest.innerStride(), actualAlpha); } @@ -594,7 +598,7 @@ MatrixBase::operator*(const MatrixBase &other) const #ifdef EIGEN_DEBUG_PRODUCT internal::product_type::debug(); #endif - + return Product(derived(), other.derived()); } #else diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 340c51394..7dfa48bfb 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,17 +48,17 @@ namespace internal { * // we currently fall back to the NoneAligned case * * The same reasoning apply for the transposed case. - * + * * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow * compared to unaligned loads on a 4 byte boundary. 
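The kernels below no longer take raw pointer/stride pairs; operands are read through lightweight mapper objects (const_blas_data_mapper, plus the BlasVectorMapper/BlasLinearMapper helpers added to BlasUtil.h further down). A rough sketch of the mapper interface as these kernels use it (internal API; the element and index types are chosen for illustration):

float read_through_mapper(const float* data, Eigen::DenseIndex outer_stride,
                          Eigen::DenseIndex i, Eigen::DenseIndex j)
{
  typedef Eigen::internal::const_blas_data_mapper<float, Eigen::DenseIndex, Eigen::ColMajor> Mapper;
  Mapper lhs(data, outer_stride);   // pointer + outer stride, as built in gemv_selector above

  // Storage-order aware scalar access: column-major, so this is data[i + j*outer_stride].
  float a = lhs(i, j);

  // The vectorized paths view one column through a VectorMapper and load packets
  // from it with an explicit alignment hint, e.g.
  //   col.template load<SomePacket, Eigen::Aligned>(i);
  Mapper::VectorMapper col = lhs.getVectorMapper(0, j);
  return a + col(i);                // col(i) addresses the same element as lhs(i, j)
}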
* */ -template -struct general_matrix_vector_product +template +struct general_matrix_vector_product { -typedef typename scalar_product_traits::ReturnType ResScalar; + typedef typename scalar_product_traits::ReturnType ResScalar; enum { Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable @@ -78,17 +78,17 @@ typedef typename conditional::type ResPacket; EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { @@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&res[j]), \ padd( \ - padd(pcj.pmul(EIGEN_CAT(ploa , A0)(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs1[j]), ptmp1)), \ - padd(pcj.pmul(EIGEN_CAT(ploa , A2)(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs3[j]), ptmp3)) ))) + padd(pcj.pmul(lhs0.template load(j), ptmp0), \ + pcj.pmul(lhs1.template load(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load(j), ptmp2), \ + pcj.pmul(lhs3.template load(j), ptmp3)) ))) + + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper cj; conj_helper pcj; @@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size= cols) || LhsPacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -178,20 +182,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(alpha*rhs[i*rhsIncr]), - ptmp1 = pset1(alpha*rhs[(i+offset1)*rhsIncr]), - ptmp2 = pset1(alpha*rhs[(i+2)*rhsIncr]), - ptmp3 = pset1(alpha*rhs[(i+offset3)*rhsIncr]); + RhsPacket ptmp0 = pset1(alpha*rhs(i, 0)), + ptmp1 = pset1(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1(alpha*rhs(i+2, 0)), + ptmp3 = pset1(alpha*rhs(i+offset3, 0)); // this helps a lot generating better binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); if (Vectorizable) { @@ -199,10 +203,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_productalignedStart) @@ -211,11 +215,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&lhs1[alignedStart-1]); - A02 = pload(&lhs2[alignedStart-2]); - A03 = pload(&lhs3[alignedStart-3]); + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); for (; j(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - A00 = pload(&lhs0[j]); - A10 = 
pload(&lhs0[j+LhsPacketSize]); + A00 = lhs0.template load(j); + A10 = lhs0.template load(j+LhsPacketSize); T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); T0 = pcj.pmadd(A03, ptmp3, T0); pstore(&res[j],T0); - A03 = pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); T1 = pcj.pmadd(A11, ptmp1, T1); T1 = pcj.pmadd(A12, ptmp2, T1); T1 = pcj.pmadd(A13, ptmp3, T1); @@ -254,12 +258,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(alpha*rhs[k*rhsIncr]); - const LhsScalar* lhs0 = lhs + k*lhsStride; + RhsPacket ptmp0 = pset1(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs for (Index j=0; j(alignedStart)) for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load(i), ptmp0, pload(&res[i]))); else for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load(i), ptmp0, pload(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) for (Index i=alignedSize; i -struct general_matrix_vector_product +template +struct general_matrix_vector_product { typedef typename scalar_product_traits::ReturnType ResScalar; @@ -346,67 +350,69 @@ typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - + EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - EIGEN_UNUSED_VARIABLE(rhsIncr); - eigen_internal_assert(rhsIncr==1); - + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - RhsPacket b = pload(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs3[j]), b, ptmp3); } + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } conj_helper cj; conj_helper pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, 
FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; const Index RhsPacketAlignedMask = RhsPacketSize-1; const Index LhsPacketAlignedMask = LhsPacketSize-1; -// const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; + const Index lhsStride = lhs.stride(); // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - Index alignedStart = internal::first_aligned(rhs, depth); + Index alignedStart = rhs.firstAligned(depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth); + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (lhsAlignmentOffset < 0) || (rhsAlignmentOffset < 0) ) { alignedSize = 0; alignedStart = 0; @@ -418,7 +424,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth= rows) || LhsPacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -447,8 +453,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_productalignedStart) @@ -481,11 +487,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&lhs1[alignedStart-1]); - A02 = pload(&lhs2[alignedStart-2]); - A03 = pload(&lhs3[alignedStart-3]); + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); for (; j(&rhs[j]); - A11 = pload(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - ptmp0 = pcj.pmadd(pload(&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = 
pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - b = pload(&rhs[j+RhsPacketSize]); - ptmp0 = pcj.pmadd(pload(&lhs0[j+LhsPacketSize]), b, ptmp0); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); + ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (; j(tmp0); - const LhsScalar* lhs0 = lhs + i*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; jalignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned(alignedStart)) for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); else for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); tmp0 += predux(ptmp0); } // process remaining scalars // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); - + typedef Map, 0, InnerStride<> > RhsMap; const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if::type cjRhs(rhs); @@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product > ResMap; ResMap res(_res,rows); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + for (Index pi=0; pi0) { Index s = IsLower ? pi+actualPanelWidth : 0; - general_matrix_vector_product::run( + general_matrix_vector_product::run( r, actualPanelWidth, - &lhs.coeffRef(s,pi), lhsStride, - &rhs.coeffRef(pi), rhsIncr, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, alpha); } } if((!IsLower) && cols>size) { - general_matrix_vector_product::run( + general_matrix_vector_product::run( rows, cols-size, - &lhs.coeffRef(0,size), lhsStride, - &rhs.coeffRef(size), rhsIncr, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), _res, resIncr, alpha); } } @@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product, 0, InnerStride<> > ResMap; ResMap res(_res,rows,InnerStride<>(resIncr)); - + + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + for (Index pi=0; pi0) { Index s = IsLower ? 
0 : pi + actualPanelWidth; - general_matrix_vector_product::run( + general_matrix_vector_product::run( actualPanelWidth, r, - &lhs.coeffRef(pi,s), lhsStride, - &rhs.coeffRef(s), rhsIncr, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, alpha); } } if(IsLower && rows>diagSize) { - general_matrix_vector_product::run( + general_matrix_vector_product::run( rows-diagSize, cols, - &lhs.coeffRef(diagSize,0), lhsStride, - &rhs.coeffRef(0), rhsIncr, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), &res.coeffRef(diagSize), resIncr, alpha); } } @@ -184,7 +190,7 @@ struct TriangularProduct template void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - + internal::trmv_selector<(int(internal::traits::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha); } }; @@ -211,7 +217,7 @@ struct TriangularProduct namespace internal { // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same. - + template<> struct trmv_selector { template @@ -247,7 +253,7 @@ template<> struct trmv_selector bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), @@ -267,7 +273,7 @@ template<> struct trmv_selector else MappedDest(actualDestPtr, dest.size()) = dest; } - + internal::triangular_matrix_vector_product struct trmv_selector #endif Map(actualRhsPtr, actualRhs.size()) = actualRhs; } - + internal::triangular_matrix_vector_product ::run(size, _lhs, lhsStride, rhs); } }; - + // forward and backward substitution, row-major, rhs is a vector template struct triangular_solve_vector @@ -37,6 +37,10 @@ struct triangular_solve_vector, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typename internal::conditional< Conjugate, const CwiseUnaryOp,LhsMap>, @@ -58,10 +62,10 @@ struct triangular_solve_vector::run( + general_matrix_vector_product::run( actualPanelWidth, r, - &lhs.coeffRef(startRow,startCol), lhsStride, - rhs + startCol, 1, + LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride), + RhsMapper(rhs + startCol, 1), rhs + startRow, 1, RhsScalar(-1)); } @@ -72,7 +76,7 @@ struct triangular_solve_vector0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - + if(!(Mode & UnitDiag)) rhs[i] /= cjLhs(i,i); } @@ -91,6 +95,8 @@ struct triangular_solve_vector, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; typename internal::conditional,LhsMap>, const LhsMap& @@ -122,10 +128,10 @@ struct triangular_solve_vector::run( + general_matrix_vector_product::run( r, actualPanelWidth, - &lhs.coeffRef(endBlock,startBlock), lhsStride, - rhs+startBlock, 1, + LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), + RhsMapper(rhs+startBlock, 1), rhs+endBlock, 1, RhsScalar(-1)); } } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 25a62d528..c4881b8da 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -34,7 +34,9 @@ 
template< int ResStorageOrder> struct general_matrix_matrix_product; -template +template struct general_matrix_vector_product; @@ -118,13 +120,35 @@ template struct get_factor::R }; +template +class BlasVectorMapper { + public: + EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_data[i]; + } + template + EIGEN_ALWAYS_INLINE Packet load(Index i) const { + return ploadt(m_data + i); + } + + template + bool aligned(Index i) const { + return (size_t(m_data+i)%sizeof(Packet))==0; + } + + protected: + Scalar* m_data; +}; + template -class MatrixLinearMapper { +class BlasLinearMapper { public: typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); @@ -157,7 +181,8 @@ class blas_data_mapper { typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - typedef MatrixLinearMapper LinearMapper; + typedef BlasLinearMapper LinearMapper; + typedef BlasVectorMapper VectorMapper; EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} @@ -170,6 +195,11 @@ class blas_data_mapper { return LinearMapper(&operator()(i, j)); } + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(&operator()(i, j)); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; @@ -193,6 +223,15 @@ class blas_data_mapper { return pgather(&operator()(i, j), m_stride); } + const Index stride() const { return m_stride; } + + Index firstAligned(Index size) const { + if (size_t(m_data)%sizeof(Scalar)) { + return -1; + } + return internal::first_aligned(m_data, size); + } + protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; -- cgit v1.2.3 From 509e4ddc02e0d70b8c1ee325f3b18624d4235c1e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 19 Nov 2014 10:34:11 -0800 Subject: Added reduction packet primitives for CUDA --- Eigen/src/Core/arch/CUDA/PacketMath.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 7b481d512..19749c832 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -223,6 +223,27 @@ template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { return a.x; } +template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { + return fmin(a.x, a.y); +} + template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); } -- cgit v1.2.3 From 
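The CUDA reductions added above mirror the horizontal operations already available on CPU packet types. A minimal device-side sketch (illustrative only; float4 is the CUDA packet type for float per the specializations in this file):

__device__ float horizontal_ops(const float4& v, float* mx, float* mn)
{
  using namespace Eigen::internal;
  *mx = predux_max(v);   // max of the four lanes
  *mn = predux_min(v);   // min of the four lanes
  return predux(v);      // sum of the four lanes
}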
80ed5bd90c245655ce0f892f6a679a0278ccbbab Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Dec 2014 12:49:30 +0100 Subject: Workaround various "returning reference to temporary" warnings. --- Eigen/SparseCore | 7 ------- Eigen/src/Core/CoreEvaluators.h | 4 +++- Eigen/src/Core/util/Constants.h | 3 +++ Eigen/src/SparseCore/CompressedStorage.h | 2 +- Eigen/src/SparseCore/SparseMatrix.h | 4 ++-- 5 files changed, 9 insertions(+), 11 deletions(-) (limited to 'Eigen') diff --git a/Eigen/SparseCore b/Eigen/SparseCore index b68c8fa8a..d5c0f6271 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -26,13 +26,6 @@ * This module depends on: Core. */ -namespace Eigen { - -/** The type used to identify a general sparse storage. */ -struct Sparse {}; - -} - #include "src/SparseCore/SparseUtil.h" #include "src/SparseCore/SparseMatrixBase.h" #include "src/SparseCore/SparseAssign.h" diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index a0dc72c4d..1c7123b85 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1221,7 +1221,9 @@ struct evaluator > typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; + // FIXME having to check whether ArgType is sparse here i not very nice. + typedef typename internal::conditional::value, + typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const { diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 5c7d70af6..9b40093f0 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -449,6 +449,9 @@ enum Action {GetAction, SetAction}; /** The type used to identify a dense storage. */ struct Dense {}; +/** The type used to identify a general sparse storage. */ +struct Sparse {}; + /** The type used to identify a permutation storage. 
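The signature changes in the hunks below address a concrete failure mode: accessors such as CompressedStorage::atInRange and SparseMatrix::coeff may have to return a default Scalar(0) that does not live anywhere in the matrix, so returning it by const reference hands the caller a reference to a temporary. A small sketch (assumes <Eigen/Sparse> is included):

double read_coeff(const Eigen::SparseMatrix<double>& m, int i, int j)
{
  // Fine after this change: coeff() returns by value, so an implicit zero is
  // simply a fresh 0.0.
  return m.coeff(i, j);
  // With the old "const Scalar&" return type, binding the result to a reference
  // could dangle whenever (i, j) addressed an implicit zero:
  //   const double& v = m.coeff(i, j);
}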
*/ struct PermutationStorage {}; diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index 2741f8292..99f741138 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -143,7 +143,7 @@ class CompressedStorage } /** Like at(), but the search is performed in the range [start,end) */ - inline const Scalar& atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const + inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar &defaultValue = Scalar(0)) const { if (start>=end) return defaultValue; diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 4c79c7dc3..93677c786 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -179,7 +179,7 @@ class SparseMatrix /** \returns the value of the matrix at position \a i, \a j * This function returns Scalar(0) if the element is an explicit \em zero */ - inline const Scalar& coeff(Index row, Index col) const + inline Scalar coeff(Index row, Index col) const { eigen_assert(row>=0 && row=0 && col > operator const SparseMatrixType&() const { return *m_matrix; } typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; - CoeffReturnType coeff(Index row, Index col) const + Scalar coeff(Index row, Index col) const { return m_matrix->coeff(row,col); } Scalar& coeffRef(Index row, Index col) -- cgit v1.2.3 From 30c849669d6d29f6e19484478639ec5176c1826a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 14:45:04 +0100 Subject: Fix dynamic allocation in JacobiSVD (regression) --- Eigen/src/SVD/JacobiSVD.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 0f7e5b8fe..444187ae7 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -628,6 +628,7 @@ template class JacobiSVD internal::qr_preconditioner_impl m_qr_precond_morecols; internal::qr_preconditioner_impl m_qr_precond_morerows; + MatrixType m_scaledMatrix; }; template @@ -674,8 +675,9 @@ void JacobiSVD::allocate(Index rows, Index cols, u : 0); m_workMatrix.resize(m_diagSize, m_diagSize); - if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); - if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); + if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols!=m_cols) m_scaledMatrix.resize(rows,cols); } template @@ -698,7 +700,13 @@ JacobiSVD::compute(const MatrixType& matrix, unsig /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ - if(!m_qr_precond_morecols.run(*this, matrix/scale) && !m_qr_precond_morerows.run(*this, matrix/scale)) + if(m_rows!=m_cols) + { + m_scaledMatrix = matrix / scale; + m_qr_precond_morecols.run(*this, m_scaledMatrix); + m_qr_precond_morerows.run(*this, m_scaledMatrix); + } + else { m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale; if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows); -- cgit v1.2.3 From 7f7a71206267014f33a175322789977047ced24f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 15:02:25 +0100 Subject: Optimize Simplicial Cholesky when NaturalOrdering is used. 
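The optimization below targets solves that request no fill-reducing permutation: with NaturalOrdering the input can be consumed (almost) as-is instead of being twisted and copied. A usage sketch, assuming <Eigen/SparseCholesky> is included, A is symmetric positive definite, and the third template argument of SimplicialLLT selects the ordering:

typedef Eigen::SparseMatrix<double> SpMat;

Eigen::VectorXd solve_natural_ordering(const SpMat& A, const Eigen::VectorXd& b)
{
  // Only the lower triangular part of A is referenced; NaturalOrdering skips the
  // AMD permutation, which is exactly the fast path introduced by this change.
  Eigen::SimplicialLLT<SpMat, Eigen::Lower, Eigen::NaturalOrdering<int> > llt(A);
  return llt.solve(b);
}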
--- Eigen/src/SparseCholesky/SimplicialCholesky.h | 35 ++++++++++++++++++--------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 918a34e13..1928670a3 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -607,22 +607,35 @@ void SimplicialCholeskyBase::ordering(const MatrixType& a, CholMatrixTy { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); - // Note that amd compute the inverse permutation + // Note that ordering methods compute the inverse permutation + if(!internal::is_same >::value) { - CholMatrixType C; - C = a.template selfadjointView(); + { + CholMatrixType C; + C = a.template selfadjointView(); + + OrderingType ordering; + ordering(C,m_Pinv); + } + + if(m_Pinv.size()>0) m_P = m_Pinv.inverse(); + else m_P.resize(0); - OrderingType ordering; - ordering(C,m_Pinv); + ap.resize(size,size); + ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); } - - if(m_Pinv.size()>0) - m_P = m_Pinv.inverse(); else + { + m_Pinv.resize(0); m_P.resize(0); - - ap.resize(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + if(UpLo==Lower) + { + ap.resize(size,size); + ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + } + else + ap = a.template triangularView(); + } } } // end namespace Eigen -- cgit v1.2.3 From bea36925dbdd753c861d650623ae2692bb9de812 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 16:26:53 +0100 Subject: bug #876: implement a portable log1p function --- Eigen/src/Core/MathFunctions.h | 85 ++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 33 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4c5fc1cae..72d6acfc1 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -14,7 +14,7 @@ namespace Eigen { // On WINCE, std::abs is defined for int only, so let's defined our own overloads: // This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too. 
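The point of a dedicated log1p is numerical: for |x| far below 1, evaluating log(1 + x) directly cancels almost all significant digits, which both the std::log1p branch and the x * log(x1p) / (x1p - 1) fallback introduced below avoid. A small sketch of the difference (the value of x is illustrative):

#include <cmath>
#include <Eigen/Core>

void log1p_motivation_sketch()
{
  double x = 1e-17;
  double naive  = std::log(1.0 + x);          // 1.0 + x rounds to 1.0, so this is 0.0
  double stable = Eigen::numext::log1p(x);    // ~1e-17, no cancellation
  (void)naive; (void)stable;
}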
-#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500 +#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500 long abs(long x) { return (labs(x)); } double abs(double x) { return (fabs(x)); } float abs(float x) { return (fabsf(x)); } @@ -360,50 +360,31 @@ inline NewType cast(const OldType& x) } /**************************************************************************** -* Implementation of atanh2 * +* Implementation of logp1 * ****************************************************************************/ template -struct atanh2_impl +struct log1p_impl { - static inline Scalar run(const Scalar& x, const Scalar& r) + static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if (__cplusplus >= 201103L) && !defined(__CYGWIN__) + // Let's be conservative and enable the default C++11 implementation only if we are sure it exists + #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC) using std::log1p; - return log1p(2 * x / (r - x)) / 2; + return log1p(x); #else - using std::abs; + typedef typename NumTraits::Real RealScalar; using std::log; - using std::sqrt; - Scalar z = x / r; - if (r == 0 || abs(z) > sqrt(NumTraits::epsilon())) - return log((r + x) / (r - x)) / 2; - else - return z + z*z*z / 3; + Scalar x1p = RealScalar(1) + x; + return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); #endif } }; -template -struct atanh2_impl > -{ - typedef std::complex Scalar; - static inline Scalar run(const Scalar& x, const Scalar& r) - { - using std::log; - using std::norm; - using std::sqrt; - Scalar z = x / r; - if (r == Scalar(0) || norm(z) > NumTraits::epsilon()) - return RealScalar(0.5) * log((r + x) / (r - x)); - else - return z + z*z*z / RealScalar(3); - } -}; - template -struct atanh2_retval +struct log1p_retval { typedef Scalar type; }; @@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& template EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) +inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); + return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } template @@ -727,6 +708,44 @@ inline int log2(int x) } // end namespace numext + +namespace internal { + +/**************************************************************************** +* Implementation of atanh2 * +****************************************************************************/ + +template +struct atanh2_impl +{ + static inline Scalar run(const Scalar& x, const Scalar& r) + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + typedef typename NumTraits::Real RealScalar; + return numext::log1p(RealScalar(2) * x / (r - x)) / RealScalar(2); + } +}; + +template +struct atanh2_retval +{ + typedef Scalar type; +}; + + +} // end namespace internal + +namespace numext { + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) +{ + return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); +} + +} // end namespace numext + namespace internal { /**************************************************************************** -- cgit v1.2.3 From 437191186165005e0b619c069239ac2913fd2c41 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 16:44:34 +0100 Subject: Remove useless and non 
standard numext::atanh2 function. --- Eigen/src/Core/MathFunctions.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 72d6acfc1..16ad2dc7e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -708,44 +708,6 @@ inline int log2(int x) } // end namespace numext - -namespace internal { - -/**************************************************************************** -* Implementation of atanh2 * -****************************************************************************/ - -template -struct atanh2_impl -{ - static inline Scalar run(const Scalar& x, const Scalar& r) - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - typedef typename NumTraits::Real RealScalar; - return numext::log1p(RealScalar(2) * x / (r - x)) / RealScalar(2); - } -}; - -template -struct atanh2_retval -{ - typedef Scalar type; -}; - - -} // end namespace internal - -namespace numext { - -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) -{ - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); -} - -} // end namespace numext - namespace internal { /**************************************************************************** -- cgit v1.2.3 From a910a7466e143502439606572367849cb09ff5bf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 17:55:31 +0100 Subject: Fix inner iterator type --- Eigen/src/SparseCholesky/SimplicialCholesky_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 7aaf702be..b7fd62faa 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -126,7 +126,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& Index top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L - for(typename MatrixType::InnerIterator it(ap,k); it; ++it) + for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it) { Index i = it.index(); if(i <= k) -- cgit v1.2.3 From 41a20994cce4d7e2c49bbb958a43c9ed69473f7f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 17:56:33 +0100 Subject: In simplicial cholesky: avoid deep copy of the input matrix is this later can be used readily --- Eigen/src/SparseCholesky/SimplicialCholesky.h | 68 +++++++++++++++++++++------ 1 file changed, 53 insertions(+), 15 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 1928670a3..22325d7f4 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -17,6 +17,27 @@ enum SimplicialCholeskyMode { SimplicialCholeskyLDLT }; +namespace internal { + template + struct simplicial_cholesky_grab_input { + typedef CholMatrixType const * ConstCholMatrixPtr; + static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp) + { + tmp = input; + pmat = &tmp; + } + }; + + template + struct simplicial_cholesky_grab_input { + typedef MatrixType const * ConstMatrixPtr; + static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/) + { + pmat = &input; + } + }; +} // end namespace internal + /** \ingroup 
SparseCholesky_Module * \brief A direct sparse Cholesky factorizations * @@ -46,6 +67,7 @@ class SimplicialCholeskyBase : public SparseSolverBase typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::Index Index; typedef SparseMatrix CholMatrixType; + typedef CholMatrixType const * ConstCholMatrixPtr; typedef Matrix VectorType; public: @@ -169,10 +191,11 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(matrix.rows()==matrix.cols()); Index size = matrix.cols(); - CholMatrixType ap(size,size); - ordering(matrix, ap); - analyzePattern_preordered(ap, DoLDLT); - factorize_preordered(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(matrix, pmat, tmp); + analyzePattern_preordered(*pmat, DoLDLT); + factorize_preordered(*pmat); } template @@ -180,9 +203,21 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); - factorize_preordered(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + + if(m_P.size()==0 && (UpLo&Upper)==Upper) + { + // If there is no ordering, try to directly use the input matrix without any copy + internal::simplicial_cholesky_grab_input::run(a, pmat, tmp); + } + else + { + tmp.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + pmat = &tmp; + } + + factorize_preordered(*pmat); } template @@ -192,13 +227,14 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ordering(a, ap); - analyzePattern_preordered(ap,doLDLT); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(a, pmat, tmp); + analyzePattern_preordered(*pmat,doLDLT); } void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT); - void ordering(const MatrixType& a, CholMatrixType& ap); + void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap); /** keeps off-diagonal entries; drops diagonal entries */ struct keep_diag { @@ -603,10 +639,11 @@ public: }; template -void SimplicialCholeskyBase::ordering(const MatrixType& a, CholMatrixType& ap) +void SimplicialCholeskyBase::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap) { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); + pmat = &ap; // Note that ordering methods compute the inverse permutation if(!internal::is_same >::value) { @@ -628,13 +665,14 @@ void SimplicialCholeskyBase::ordering(const MatrixType& a, CholMatrixTy { m_Pinv.resize(0); m_P.resize(0); - if(UpLo==Lower) + if(UpLo==Lower || MatrixType::IsRowMajor) { + // we have to transpose the lower part to to the upper one ap.resize(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + ap.template selfadjointView() = a.template selfadjointView(); } else - ap = a.template triangularView(); + internal::simplicial_cholesky_grab_input::run(a, pmat, ap); } } -- cgit v1.2.3 From 0efaff9b3b261daaa91baf8935ec7c4f5156a647 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 11 Dec 2014 16:15:20 +0100 Subject: Fix out-of-bounds write --- Eigen/src/SparseCore/AmbiVector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 5c9c3101e..76ef25f7d 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ 
b/Eigen/src/SparseCore/AmbiVector.h @@ -69,7 +69,7 @@ class AmbiVector delete[] m_buffer; if (size<1000) { - Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar); + Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar); m_allocatedElements = (allocSize*sizeof(Scalar))/sizeof(ListEl); m_buffer = new Scalar[allocSize]; } @@ -88,7 +88,7 @@ class AmbiVector Index copyElements = m_allocatedElements; m_allocatedElements = (std::min)(Index(m_allocatedElements*1.5),m_size); Index allocSize = m_allocatedElements * sizeof(ListEl); - allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0); + allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; -- cgit v1.2.3 From 7dad5f797e8c270be5f32aee154f6660df2242f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 16 Dec 2014 13:33:43 +0100 Subject: bug #821: workaround MSVC 2013 issue with using Base::Base::operator= --- Eigen/src/Core/MapBase.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 3c67edae5..5d51548cd 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -243,7 +243,11 @@ template class MapBase return derived(); } - using Base::Base::operator=; + // In theory MapBase should not make a using Base::operator=, + // and thus we should directly do: using Base::Base::operator=; + // However, this would confuse recent MSVC 2013 (bug 821), and since MapBase + // has operator= to make ICC 11 happy, we can also make MSVC 2013 happy as follow: + using Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS -- cgit v1.2.3 From b8d9eaa19bd8501e8007c4c3633904643b44617e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 13 Dec 2014 22:16:39 +0100 Subject: Use true compile time "if" for Transform::makeAffine --- Eigen/src/Geometry/Transform.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 7ebde6803..d33fc24db 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -78,6 +78,8 @@ struct traits > }; }; +template struct transform_make_affine; + } // end namespace internal /** \geometry_module \ingroup Geometry_Module @@ -246,8 +248,7 @@ public: inline Transform() { check_template_params(); - if (int(Mode)==Affine) - makeAffine(); + internal::transform_make_affine<(int(Mode)==Affine) ? 
Affine : AffineCompact>::run(m_matrix); } inline Transform(const Transform& other) @@ -610,11 +611,7 @@ public: */ void makeAffine() { - if(int(Mode)!=int(AffineCompact)) - { - matrix().template block<1,Dim>(Dim,0).setZero(); - matrix().coeffRef(Dim,Dim) = Scalar(1); - } + internal::transform_make_affine::run(m_matrix); } /** \internal @@ -1102,6 +1099,24 @@ Transform::fromPositionOrientationScale(const MatrixBas namespace internal { +template +struct transform_make_affine +{ + template + static void run(MatrixType &mat) + { + static const int Dim = MatrixType::ColsAtCompileTime-1; + mat.template block<1,Dim>(Dim,0).setZero(); + mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1); + } +}; + +template<> +struct transform_make_affine +{ + template static void run(MatrixType &) { } +}; + // selector needed to avoid taking the inverse of a 3x4 matrix template struct projective_transform_inverse -- cgit v1.2.3 From 25c7d9164f45119fa20dc6af2fa451d278c5f285 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 18 Dec 2014 22:58:15 +0100 Subject: bug #920: fix MSVC 2015 compilation issues --- Eigen/src/Core/MapBase.h | 11 +++++------ Eigen/src/Core/util/Macros.h | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 5d51548cd..1589cbaae 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -172,6 +172,7 @@ template class MapBase template class MapBase : public MapBase { + typedef MapBase ReadOnlyMapBase; public: typedef MapBase Base; @@ -239,15 +240,13 @@ template class MapBase EIGEN_DEVICE_FUNC Derived& operator=(const MapBase& other) { - Base::Base::operator=(other); + ReadOnlyMapBase::Base::operator=(other); return derived(); } - // In theory MapBase should not make a using Base::operator=, - // and thus we should directly do: using Base::Base::operator=; - // However, this would confuse recent MSVC 2013 (bug 821), and since MapBase - // has operator= to make ICC 11 happy, we can also make MSVC 2013 happy as follow: - using Base::operator=; + // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, + // see bugs 821 and 920. + using ReadOnlyMapBase::Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index bc26043d7..687ba41dd 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -73,7 +73,7 @@ /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) - #define EIGEN_COMP_MSVC_STRICT 1 + #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 #endif @@ -592,7 +592,7 @@ namespace Eigen { // just an empty macro ! 
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT +#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) -- cgit v1.2.3 From f5f6e2c6f46a8999ee36ce0c7adc62098d8d93d2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 19 Dec 2014 14:41:59 +0100 Subject: bug #921: fix utilization of bitwise operation on enums in first_aligned --- Eigen/src/Core/util/Memory.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a54ccaedc..bacf236fb 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -523,9 +523,8 @@ template inline void conditional_aligned_delete_auto(T * template inline Index first_aligned(const Scalar* array, Index size) { - enum { PacketSize = packet_traits::size, - PacketAlignedMask = PacketSize-1 - }; + static const Index PacketSize = packet_traits::size; + static const Index PacketAlignedMask = PacketSize-1; if(PacketSize==1) { -- cgit v1.2.3 From 9f98650d0a82d4757afb4503ce6f2b6f61763463 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 6 Jan 2015 09:29:13 -0800 Subject: Ensured that contractions that can be reduced to a matrix vector product work correctly even when the input coefficients aren't aligned. --- Eigen/src/Core/products/GeneralMatrixVector.h | 8 +++-- unsupported/test/cxx11_tensor_contraction.cpp | 48 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 7dfa48bfb..7df6a6b1a 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -140,10 +140,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4) { @@ -412,10 +413,13 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4) { diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 2b599d30d..17bd335f7 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -352,6 +352,52 @@ static void test_large_contraction() } +static void test_matrix_vector() +{ + Tensor t_left(30, 50); + Tensor t_right(50); + Tensor t_result(30); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 30, 50); + MapXf m_right(t_right.data(), 50, 1); + Eigen::Matrix m_result(30, 1); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims{{DimPair(1, 0)}}; + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + } +} + + +static void test_tensor_vector() +{ + Tensor t_left(7, 13, 17); + Tensor t_right(1, 7); + typedef typename Tensor::DimensionPair DimensionPair; + Eigen::array dim_pair01{{{0, 1}}}; + Tensor t_result = t_left.contract(t_right, dim_pair01); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 7, 13*17); + MapXf m_right(t_right.data(), 1, 7); + Eigen::Matrix m_result = m_left.transpose() * m_right.transpose(); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result(i), 
m_result(i, 0)); + } +} + + void test_cxx11_tensor_contraction() { CALL_SUBTEST(test_evals()); @@ -364,4 +410,6 @@ void test_cxx11_tensor_contraction() CALL_SUBTEST(test_out_of_order_contraction()); CALL_SUBTEST(test_consistency()); CALL_SUBTEST(test_large_contraction()); + CALL_SUBTEST(test_matrix_vector()); + CALL_SUBTEST(test_tensor_vector()); } -- cgit v1.2.3 From 79f4a59ed9ac3fc1a3b6e4516c2b3e04cec5f522 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 7 Jan 2015 09:41:56 +0100 Subject: bug #907: fix compilation with ARM64 --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6c5c669a1..586fa95e5 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -597,7 +597,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { @@ -613,7 +613,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) } // Other reduction functions: // mul -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); } +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } // min template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } -- cgit v1.2.3 From 63974bcb88f23bd4768eb3232906f2b9f3c92fca Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 7 Jan 2015 09:44:25 +0100 Subject: Big 907: workaround some missing intrinsics in current NDK's gcc version (ARM64) --- Eigen/src/Core/arch/NEON/PacketMath.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 586fa95e5..d962e8adc 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -492,6 +492,21 @@ ptranspose(PacketBlock& kernel) { //---------- double ---------- #if EIGEN_ARCH_ARM64 +#if EIGEN_COMP_GNUC_STRICT && __ANDROID__ +// Bug 907: workaround missing declarations of the following two functions in the ADK +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_f64 (float64x2_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_f64_u64 (uint64x2_t __a) +{ + return (float64x2_t) __a; +} +#endif + typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; -- cgit v1.2.3 From 36f7c1337f2f73421325a9bdbe98dd53fd99951f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 13 Jan 2015 09:57:37 +0100 Subject: bug #907, ARM64: workaround vreinterpretq_u64_* not defined in xcode/clang --- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index d962e8adc..29512e264 100644 --- 
a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -492,7 +492,7 @@ ptranspose(PacketBlock& kernel) { //---------- double ---------- #if EIGEN_ARCH_ARM64 -#if EIGEN_COMP_GNUC_STRICT && __ANDROID__ +#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) // Bug 907: workaround missing declarations of the following two functions in the ADK __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) vreinterpretq_u64_f64 (float64x2_t __a) -- cgit v1.2.3 From ae4644cc6827bb1b6d654ceed8b3a0c256b1d173 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 13 Jan 2015 10:03:00 +0100 Subject: bug #907, ARM64: workaround ICE in xcode/clang --- Eigen/src/Core/arch/NEON/PacketMath.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 29512e264..f83f8db0e 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -612,7 +612,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { @@ -628,7 +633,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) } // Other reduction functions: // mul +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif // min template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } -- cgit v1.2.3 From 279786e9875cc71c7bed1f78f8df1803be215904 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 13 Jan 2015 10:25:50 +0100 Subject: Fix missing evaluator in outer-product --- Eigen/src/Core/ProductEvaluators.h | 10 ++++++---- test/product_small.cpp | 10 ++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 3cebbbd12..488eee00c 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -211,24 +211,26 @@ template EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { typedef typename Dst::Index Index; + typename evaluator::type rhsEval(rhs); // FIXME make sure lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and rhs + // FIXME we should probably build an evaluator for dst const Index cols = dst.cols(); for (Index j=0; j EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { typedef typename Dst::Index Index; + typename evaluator::type lhsEval(lhs); // FIXME 
make sure rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and lhs + // FIXME we should probably build an evaluator for dst const Index rows = dst.rows(); for (Index i=0; i diff --git a/test/product_small.cpp b/test/product_small.cpp index 8b132abb6..091955a0f 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -9,6 +9,7 @@ #define EIGEN_NO_STATIC_ASSERT #include "product.h" +#include // regression test for bug 447 void product1x1() @@ -46,5 +47,14 @@ void test_product_small() Vector3f v = Vector3f::Random(); VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v); } + + { + // regression test for pull-request #93 + Eigen::Matrix A; A.setRandom(); + Eigen::Matrix B; B.setRandom(); + Eigen::Matrix C; C.setRandom(); + VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]); + VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C); + } #endif } -- cgit v1.2.3 From b9d314ae19b1c857adfe42b7eaecf2695428f0ed Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 17 Jan 2015 21:55:33 +0100 Subject: bug #329: fix typo --- Eigen/src/Eigenvalues/RealQZ.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index ae10ff91e..128ef9028 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -313,7 +313,7 @@ namespace Eigen { using std::abs; using std::sqrt; const Index dim=m_S.cols(); - if (abs(m_S.coeff(i+1,i)==Scalar(0))) + if (abs(m_S.coeff(i+1,i))==Scalar(0)) return; Index z = findSmallDiagEntry(i,i+1); if (z==i-1) -- cgit v1.2.3 From e1f1091fde660581d64b54ff1019bc494dbbca89 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 24 Jan 2015 10:32:49 +0100 Subject: Add support for dense ?= diagonal --- Eigen/src/Core/DiagonalMatrix.h | 6 ++++++ test/diagonalmatrices.cpp | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'Eigen') diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index e3dc71336..49b9b7925 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -326,6 +326,12 @@ struct Assignment dst.setZero(); dst.diagonal() = src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.diagonal() += src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.diagonal() -= src.diagonal(); } }; } // namespace internal diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index 149f1db2f..0227ba577 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ -84,6 +84,13 @@ template void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX(m1 * (rdm1 * s1), (m1 * rdm1) * s1); VERIFY_IS_APPROX(m1 * (s1 * rdm1), (m1 * rdm1) * s1); + + // Diagonal to dense + sq_m1.setRandom(); + sq_m2 = sq_m1; + VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() ); } void test_diagonalmatrices() -- cgit v1.2.3 From c6eb84aabcf102aaa3ba1c288e890984f4b49277 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Jan 2015 17:09:01 +0100 Subject: Enable vectorization of transposeInPlace for 
PacketSize x PacketSize matrices --- Eigen/src/Core/Transpose.h | 27 ++++++++++++++++++++++++--- test/adjoint.cpp | 22 ++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index a3b95256f..3bab6092c 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -217,18 +217,39 @@ MatrixBase::adjoint() const namespace internal { template + bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic, + bool MatchPacketSize = + (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits::size)) + && (internal::evaluator::Flags&PacketAccessBit) > struct inplace_transpose_selector; template -struct inplace_transpose_selector { // square matrix +struct inplace_transpose_selector { // square matrix static void run(MatrixType& m) { m.matrix().template triangularView().swap(m.matrix().transpose()); } }; +// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. template -struct inplace_transpose_selector { // non square matrix +struct inplace_transpose_selector { // PacketSize x PacketSize + static void run(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename MatrixType::Index Index; + const Index PacketSize = internal::packet_traits::size; + const Index Alignment = internal::evaluator::Flags&AlignedBit ? Aligned : Unaligned; + PacketBlock A; + for (Index i=0; i(i,0); + internal::ptranspose(A); + for (Index i=0; i(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]); + } +}; + +template +struct inplace_transpose_selector { // non square matrix static void run(MatrixType& m) { if (m.rows()==m.cols()) m.matrix().template triangularView().swap(m.matrix().transpose()); diff --git a/test/adjoint.cpp b/test/adjoint.cpp index ea36f7841..3b2a53c91 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -64,6 +64,7 @@ template void adjoint(const MatrixType& m) typedef typename NumTraits::Real RealScalar; typedef Matrix VectorType; typedef Matrix SquareMatrixType; + const Index PacketSize = internal::packet_traits::size; Index rows = m.rows(); Index cols = m.cols(); @@ -108,6 +109,17 @@ template void adjoint(const MatrixType& m) VERIFY_IS_APPROX(m3,m1.transpose()); m3.transposeInPlace(); VERIFY_IS_APPROX(m3,m1); + + if(PacketSize(0,m3.rows()-PacketSize); + Index j = internal::random(0,m3.cols()-PacketSize); + m3.template block(i,j).transposeInPlace(); + VERIFY_IS_APPROX( (m3.template block(i,j)), (m1.template block(i,j).transpose()) ); + m3.template block(i,j).transposeInPlace(); + VERIFY_IS_APPROX(m3,m1); + } // check inplace adjoint m3 = m1; @@ -129,9 +141,19 @@ void test_adjoint() CALL_SUBTEST_1( adjoint(Matrix()) ); CALL_SUBTEST_2( adjoint(Matrix3d()) ); CALL_SUBTEST_3( adjoint(Matrix4f()) ); + CALL_SUBTEST_4( adjoint(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( adjoint(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( adjoint(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + + // Complement for 128 bits vectorization: + CALL_SUBTEST_8( adjoint(Matrix2d()) ); + CALL_SUBTEST_9( adjoint(Matrix()) ); + + // 256 bits vectorization: + CALL_SUBTEST_10( adjoint(Matrix()) ); + CALL_SUBTEST_11( 
adjoint(Matrix()) ); + CALL_SUBTEST_12( adjoint(Matrix()) ); } // test a large static matrix only once CALL_SUBTEST_7( adjoint(Matrix()) ); -- cgit v1.2.3 From a727a2c4edfa85303ad1ad406e65415187fbb770 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 28 Jan 2015 16:07:51 +0100 Subject: bug #933: RealSchur, do not consider the input matrix norm to check negligible sub-diag entries. This also makes this test consistent with the complex and self-adjoint cases. --- Eigen/src/Eigenvalues/RealSchur.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 10f5fb174..51e61ba38 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -234,7 +234,7 @@ template class RealSchur typedef Matrix Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu, const Scalar& norm); + Index findSmallSubdiagEntry(Index iu); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -286,7 +286,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu, norm); + Index il = findSmallSubdiagEntry(iu); // Check for convergence if (il == iu) // One root found @@ -343,16 +343,14 @@ inline typename MatrixType::Scalar RealSchur::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template -inline typename MatrixType::Index RealSchur::findSmallSubdiagEntry(Index iu, const Scalar& norm) +inline typename MatrixType::Index RealSchur::findSmallSubdiagEntry(Index iu) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (s == 0.0) - s = norm; - if (abs(m_matT.coeff(res,res-1)) < NumTraits::epsilon() * s) + if (abs(m_matT.coeff(res,res-1)) <= NumTraits::epsilon() * s) break; res--; } @@ -457,9 +455,7 @@ inline void RealSchur::initFrancisQRStep(Index il, Index iu, const V const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2))); const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1))); if (abs(lhs) < NumTraits::epsilon() * rhs) - { break; - } } } -- cgit v1.2.3 From 9d82f7e30d53086d090ae13d69dffef771eb6263 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Jan 2015 17:24:40 +0100 Subject: Supernodes was disabled. 
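The default performance values set in initperfvalues() left panel_size at 1, so
the factorization was effectively working on single-column panels, which is what
disabled the supernodal updates; this restores the intended default of 16. The
constant is internal and there is nothing to change on the user side. For
reference, a minimal, illustrative SparseLU solve (the function name and setup
are only for the example):

    #include <Eigen/SparseLU>
    #include <Eigen/OrderingMethods>

    typedef Eigen::SparseMatrix<double> SpMat;

    // Illustrative sketch only: the panel and supernode sizes used internally
    // are tuning constants and are not set through this public interface.
    Eigen::VectorXd solve_with_sparselu(const SpMat& A, const Eigen::VectorXd& b)
    {
      Eigen::SparseLU<SpMat, Eigen::COLAMDOrdering<int> > lu;
      lu.analyzePattern(A); // symbolic step: fill-reducing ordering and structure
      lu.factorize(A);      // numerical factorization
      return lu.solve(b);
    }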
--- Eigen/src/SparseLU/SparseLU.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index d72d7f150..79b78da99 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -309,7 +309,7 @@ class SparseLU : public SparseSolverBase >, // Functions void initperfvalues() { - m_perfv.panel_size = 1; + m_perfv.panel_size = 16; m_perfv.relax = 1; m_perfv.maxsuper = 128; m_perfv.rowblk = 16; -- cgit v1.2.3 From f1092d2f736bbe541b3d8b5cca893f668f9c6b5f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Jan 2015 19:04:04 +0100 Subject: bug #941: fix accuracy issue in ColPivHouseholderQR, do not stop decomposition on a small pivot --- Eigen/src/QR/ColPivHouseholderQR.h | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index de77e8411..370cb69e3 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -476,20 +476,10 @@ ColPivHouseholderQR& ColPivHouseholderQR::compute(const // we store that back into our table: it can't hurt to correct our table. m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm; - // if the current biggest column is smaller than epsilon times the initial biggest column, - // terminate to avoid generating nan/inf values. - // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so) - // repetitions of the unit test, with the result of solve() filled with large values of the order - // of 1/(size*epsilon). - if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) - { + // Track the number of meaningful pivots but do not stop the decomposition to make + // sure that the initial matrix is properly reproduced. See bug 941. + if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) m_nonzero_pivots = k; - m_hCoeffs.tail(size-k).setZero(); - m_qr.bottomRightCorner(rows-k,cols-k) - .template triangularView() - .setZero(); - break; - } // apply the transposition to the columns m_colsTranspositions.coeffRef(k) = biggest_col_index; @@ -518,7 +508,7 @@ ColPivHouseholderQR& ColPivHouseholderQR::compute(const } m_colsPermutation.setIdentity(PermIndexType(cols)); - for(PermIndexType k = 0; k < m_nonzero_pivots; ++k) + for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k) m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k))); m_det_pq = (number_of_transpositions%2) ? -1 : 1; @@ -574,13 +564,15 @@ struct Assignment >, interna } // end namespace internal -/** \returns the matrix Q as a sequence of householder transformations */ +/** \returns the matrix Q as a sequence of householder transformations. 
+ * You can extract the meaningful part only by using: + * \code qr.householderQ().setLength(qr.nonzeroPivots()) */ template typename ColPivHouseholderQR::HouseholderSequenceType ColPivHouseholderQR ::householderQ() const { eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots); + return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()); } #ifndef __CUDACC__ -- cgit v1.2.3 From 759bd92a85393617a56405ec0372e87416cfaebb Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:27:56 -0500 Subject: bug #935: Add asm comments in GEBP kernels to work around a bug in both GCC and Clang on ARM/NEON, whereby they spill registers, severely harming performance. The reason why the asm comments make a difference is that they prevent the compiler from reordering code across these boundaries, which has the effect of extending the lifetime of local variables and increasing register pressure on this register-tight code. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 160 ++++++++++++++-------- Eigen/src/Core/util/Macros.h | 8 +- 2 files changed, 108 insertions(+), 60 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 7b2ed6728..1b39642fb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -760,31 +760,36 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1"); } // process remaining peeled loop @@ -963,21 +977,26 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*(2*Traits::LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*2*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } // process remaining peeled loop @@ -1137,16 +1165,21 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*1*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*1*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } // process remaining peeled loop diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 687ba41dd..13f8fdd4e 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -160,6 +160,12 @@ #define EIGEN_ARCH_ARM64 0 #endif +#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 + #define EIGEN_ARCH_ARM_OR_ARM64 1 +#else + #define EIGEN_ARCH_ARM_OR_ARM64 0 +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -526,7 +532,7 @@ namespace Eigen { #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) - #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64 + #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) -- cgit v1.2.3 From 
9f99f61e69a70e0a209d5f93e78a9257685cd70d Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:43:56 -0500 Subject: bug #936, patch 1/3: some cleanup and renaming for consistency. --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 ++-- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 18 +++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index fa02f57a1..27df5a025 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,8 +22,8 @@ namespace internal { #define EIGEN_HAS_FUSED_MADD 1 #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_FUSED_CJMADD +#define EIGEN_HAS_FUSED_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index f83f8db0e..5a6eb8c1d 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,8 +24,8 @@ namespace internal { #define EIGEN_HAS_FUSED_MADD 1 #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_FUSED_CJMADD +#define EIGEN_HAS_FUSED_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1b39642fb..ae2fd9006 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -120,8 +120,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) computeProductBlockingSizes(k, m, n); } -#ifdef EIGEN_HAS_FUSE_CJMADD - #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); +#ifdef EIGEN_HAS_FUSED_CJMADD + #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else // FIXME (a bit overkill maybe ?) @@ -146,8 +146,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) gebp_madd_selector::run(cj,a,b,c,t); } - #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); + #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); +// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif /* Vectorization logic @@ -1402,13 +1402,13 @@ void gebp_kernel B_0 = blB[0]; B_1 = blB[1]; - MADD(cj,A0,B_0,C0, B_0); - MADD(cj,A0,B_1,C1, B_1); + CJMADD(cj,A0,B_0,C0, B_0); + CJMADD(cj,A0,B_1,C1, B_1); B_0 = blB[2]; B_1 = blB[3]; - MADD(cj,A0,B_0,C2, B_0); - MADD(cj,A0,B_1,C3, B_1); + CJMADD(cj,A0,B_0,C2, B_0); + CJMADD(cj,A0,B_1,C3, B_1); blB += 4; } @@ -1434,7 +1434,7 @@ void gebp_kernel { LhsScalar A0 = blA[k]; RhsScalar B_0 = blB[k]; - MADD(cj, A0, B_0, C0, B_0); + CJMADD(cj, A0, B_0, C0, B_0); } res[(j2+0)*resStride + i] += alpha*C0; } -- cgit v1.2.3 From 340b8afb14bb06788570ba22ba4ccba674402f09 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 31 Jan 2015 14:15:57 -0500 Subject: bug #936, patch 1.5/3: rename _FUSED_ macros to _SINGLE_INSTRUCTION_, because this is what they are about. "Fused" means "no intermediate rounding between the mul and the add, only one rounding at the end". Instead, what we are concerned about here is whether a temporary register is needed, i.e. whether the MUL and ADD are separate instructions. 
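As a scalar illustration of that rounding distinction (illustrative only, not
part of this patch, using plain ISO C++):

    #include <cmath>

    // The fused form rounds once at the end; the separate multiply-then-add
    // rounds the product and then the sum, so the two results can differ in
    // the last bits (unless the compiler contracts a*b + c into an fma itself).
    double fused_vs_separate(double a, double b, double c)
    {
      double fused    = std::fma(a, b, c);
      double separate = a * b + c;
      return fused - separate; // may be nonzero
    }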
Concretely, on ARM NEON, a single-instruction mul-add is always available: VMLA. But a true fused mul-add is only available on VFPv4: VFMA. --- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++---- Eigen/src/Core/arch/NEON/PacketMath.h | 8 ++++---- Eigen/src/Core/arch/SSE/PacketMath.h | 4 ++-- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 ++++++------ 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e2376bd1f..1d8c674a6 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -23,8 +23,8 @@ namespace internal { #endif #ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 27df5a025..578b303a0 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,12 +18,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif -#ifndef EIGEN_HAS_FUSED_CJMADD -#define EIGEN_HAS_FUSED_CJMADD +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 5a6eb8c1d..9cfb9c358 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -20,12 +20,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif -#ifndef EIGEN_HAS_FUSED_CJMADD -#define EIGEN_HAS_FUSED_CJMADD +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 28427c308..202aaa72f 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -23,8 +23,8 @@ namespace internal { #endif #ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ae2fd9006..b5f06d831 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -120,7 +120,7 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) computeProductBlockingSizes(k, m, n); } -#ifdef EIGEN_HAS_FUSED_CJMADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else @@ -182,7 +182,7 @@ public: nr = 4, // register block size along the M direction (currently, this one cannot be modified) -#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if 
defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -248,7 +248,7 @@ public: // let gcc allocate the register in which to store the result of the pmul // (in the case where there is no FMA) gcc fails to figure out how to avoid // spilling register. -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c = pmadd(a,b,c); #else @@ -290,7 +290,7 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, -#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -353,7 +353,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a.v,b,c.v); #else @@ -637,7 +637,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a,b.v,c.v); #else -- cgit v1.2.3 From 0f216136980503c3792a90e382b4d6bbdbb870c0 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:44:26 -0500 Subject: bug #936, patch 2/3: Remove EIGEN_VECTORIZE_FMA, was redundant with EIGEN_HAS_SINGLE_INSTRUCTION_MADD --- Eigen/Core | 4 +--- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 2 +- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 5 files changed, 6 insertions(+), 8 deletions(-) (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index dcb20bfd0..b5af63623 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -125,9 +125,7 @@ #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif + // include files // This extern "C" works around a MINGW-w64 compilation issue diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 1d8c674a6..485bac10b 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 578b303a0..6b68fc7a5 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -19,7 +19,7 @@ namespace internal { #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9cfb9c358..71255ac85 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -21,7 +21,7 @@ namespace internal { #endif #ifndef 
EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 202aaa72f..3f6fb0254 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,7 +22,7 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif -- cgit v1.2.3 From 5ef95fabee5e9a9357c082cd32ae3b4affb2eff6 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:45:03 -0500 Subject: bug #936, patch 3/3: Properly detect FMA support on ARM (requires VFPv4) and use it instead of MLA when available, because it's both more accurate, and faster. --- Eigen/src/Core/arch/NEON/PacketMath.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 71255ac85..9afd86bec 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co return pset1(0); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. +// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. +// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. +// MLA is not fused i.e. does 2 roundings. +// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): +// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +#endif + +// No FMA instruction for int, so use MLA unconditionally. template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } @@ -551,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +#endif template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } -- cgit v1.2.3 From ebdf6a2dbbc66d0f6fb045a3c5b0023bc2e89851 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Feb 2015 22:32:34 +0100 Subject: SPQR: fix default threshold value --- Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 51 ++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 11 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 44f6a1acb..54a1b21b8 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -68,13 +68,13 @@ class SPQR : public SparseSolverBase > typedef Map > PermutationType; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -99,10 +99,25 @@ class SPQR : public SparseSolverBase > if(m_isInitialized) SPQR_free(); MatrixType mat(matrix); + + /* Compute the default threshold as in MatLab, see: + * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 + */ + RealScalar pivotThreshold = m_tolerance; + if(m_useDefaultThreshold) + { + RealScalar max2Norm = 0.0; + for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm()); + if(max2Norm==RealScalar(0)) + max2Norm = RealScalar(1); + pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits::epsilon(); + } + cholmod_sparse A; A = viewAsCholmod(mat); Index col = matrix.cols(); - m_rank = SuiteSparseQR(m_ordering, m_tolerance, col, &A, + m_rank = SuiteSparseQR(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc); if (!m_cR) @@ -118,7 +133,7 @@ class SPQR : public SparseSolverBase > /** * Get the number of rows of the input matrix and the Q matrix */ - inline Index rows() const {return m_H->nrow; } + inline Index rows() const {return m_cR->nrow; } /** * Get the number of columns of the input matrix. 
@@ -130,16 +145,25 @@ class SPQR : public SparseSolverBase > { eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()"); eigen_assert(b.cols()==1 && "This method is for vectors only"); - + //Compute Q^T * b - typename Dest::PlainObject y; + typename Dest::PlainObject y, y2; y = matrixQ().transpose() * b; - // Solves with the triangular matrix R + + // Solves with the triangular matrix R Index rk = this->rank(); - y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView().solve(y.topRows(rk)); - y.bottomRows(cols()-rk).setZero(); + y2 = y; + y.resize((std::max)(cols(),Index(y.rows())),y.cols()); + y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView().solve(y2.topRows(rk)); + // Apply the column permutation - dest.topRows(cols()) = colsPermutation() * y.topRows(cols()); + // colsPermutation() performs a copy of the permutation, + // so let's apply it manually: + for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i); + for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero(); + +// y.bottomRows(y.rows()-rk).setZero(); +// dest = colsPermutation() * y.topRows(cols()); m_info = Success; } @@ -178,7 +202,11 @@ class SPQR : public SparseSolverBase > /// Set the fill-reducing ordering method to be used void setSPQROrdering(int ord) { m_ordering = ord;} /// Set the tolerance tol to treat columns with 2-norm < =tol as zero - void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; } + void setPivotThreshold(const RealScalar& tol) + { + m_useDefaultThreshold = false; + m_tolerance = tol; + } /** \returns a pointer to the SPQR workspace */ cholmod_common *cholmodCommon() const { return &m_cc; } @@ -210,6 +238,7 @@ class SPQR : public SparseSolverBase > mutable cholmod_dense *m_HTau; // The Householder coefficients mutable Index m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters + bool m_useDefaultThreshold; // Use default threshold template friend struct SPQR_QProduct; }; -- cgit v1.2.3 From b1eca55328436f9778254c6bbc8852910a0d3d2a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Feb 2015 23:46:05 +0100 Subject: Use Ref<> to ensure that both x and b in Ax=b are compatible with Umfpack/SuperLU expectations --- Eigen/src/SuperLUSupport/SuperLUSupport.h | 23 +++++++++++++++++++---- Eigen/src/UmfPackSupport/UmfPackSupport.h | 13 ++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index ef73587a7..6de5b3dc5 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -627,8 +627,12 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase m_sluFerr.resize(rhsCols); m_sluBerr.resize(rhsCols); - m_sluB = SluMatrix::Map(b.const_cast_derived()); - m_sluX = SluMatrix::Map(x.derived()); + + Ref > b_ref(b); + Ref > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -651,6 +655,10 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase &m_sluFerr[0], &m_sluBerr[0], &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); + + if(&x.coeffRef(0) != x_ref.data()) + x = x_ref; + m_info = info==0 ? 
Success : NumericalIssue; } @@ -938,8 +946,12 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase > b_ref(b); + Ref > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -962,6 +974,9 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); int errorCode; + Scalar* x_ptr = 0; + Matrix x_tmp; + if(x.innerStride()!=1) + { + x_tmp.resize(x.rows()); + x_ptr = x_tmp.data(); + } for (int j=0; j Date: Fri, 6 Feb 2015 02:51:59 -0800 Subject: Added the EIGEN_HAS_CONSTEXPR define Gate the tensor index list code based on the value of EIGEN_HAS_CONSTEXPR --- Eigen/src/Core/util/Macros.h | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- unsupported/test/cxx11_tensor_index_list.cpp | 4 ++++ unsupported/test/cxx11_tensor_reduction.cpp | 8 ++++---- 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 001907a0b..40a28d4d6 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -133,6 +133,12 @@ #define EIGEN_HAS_VARIADIC_TEMPLATES 1 #endif +// Does the compiler support const expressions? +#if (defined(__plusplus) && __cplusplus >= 201402L) || \ + EIGEN_GNUC_AT_LEAST(4,9) +#define EIGEN_HAS_CONSTEXPR 1 +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index c94ed977e..eed0a9f05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H #define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#if __cplusplus > 199711L +#ifdef EIGEN_HAS_CONSTEXPR namespace Eigen { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 83ba1df71..21416afe0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -53,7 +53,7 @@ struct preserve_inner_most_dims { static const bool value = false; }; -#if __cplusplus > 199711L +#ifdef EIGEN_HAS_CONSTEXPR template struct are_inner_most_dims{ static const bool value = indices_statically_known_to_increase()() && diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index d79a3ed45..c4d4f244f 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -11,6 +11,7 @@ #include +#ifdef EIGEN_HAS_CONSTEXPR static void test_static_index_list() { @@ -254,11 +255,14 @@ static void test_mixed_index_list() VERIFY_IS_APPROX(result3(0), expected); } +#endif void test_cxx11_tensor_index_list() { +#ifdef EIGEN_HAS_CONSTEXPR CALL_SUBTEST(test_static_index_list()); CALL_SUBTEST(test_type2index_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); +#endif } diff --git a/unsupported/test/cxx11_tensor_reduction.cpp 
b/unsupported/test/cxx11_tensor_reduction.cpp index 5c3184833..0269853a9 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -284,7 +284,7 @@ static void test_static_dims() { Tensor out(72, 97); in.setRandom(); -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; @@ -314,7 +314,7 @@ static void test_innermost_last_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 0; reduction_axis[1] = 1; @@ -345,7 +345,7 @@ static void test_innermost_first_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 2; reduction_axis[1] = 3; @@ -376,7 +376,7 @@ static void test_reduce_middle_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 2; -- cgit v1.2.3 From 74e460b9950503ef5a306337a136e1d37795deae Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Feb 2015 14:26:24 +0100 Subject: Fix symmetric product --- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 21f8175d2..860e233b9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -374,7 +374,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Date: Sat, 7 Feb 2015 22:00:46 +0100 Subject: Add a SparseCompressedBase class providing (un)compressed accessors (like data()/*Stride() for dense matrices), and a CompressedAccessBit flag (similar to DirectAccessBit for dense matrices). 
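For context, a minimal usage sketch of the new raw-buffer interface (illustration only, not part of this patch; it relies only on the accessors documented for CompressedAccessBit in the diff below):

    #include <Eigen/SparseCore>
    #include <iostream>
    #include <vector>

    int main()
    {
      typedef Eigen::SparseMatrix<double> SpMat;   // column-major, index type defaults to int
      typedef Eigen::Triplet<double> T;

      std::vector<T> triplets;
      triplets.push_back(T(0,0,1.0));
      triplets.push_back(T(1,2,2.0));
      triplets.push_back(T(2,1,3.0));

      SpMat A(3,3);
      A.setFromTriplets(triplets.begin(), triplets.end());
      A.makeCompressed();                // after this, innerNonZeroPtr() is the null pointer

      // Raw compressed-column buffers, as exposed through SparseCompressedBase:
      const double* values = A.valuePtr();
      const int*    inner  = A.innerIndexPtr();
      const int*    outer  = A.outerIndexPtr();

      for (int j = 0; j < A.outerSize(); ++j)
        for (int p = outer[j]; p < outer[j+1]; ++p)
          std::cout << "(" << inner[p] << "," << j << ") = " << values[p] << "\n";

      std::cout << "compressed: " << A.isCompressed()
                << "  nonZeros: " << A.nonZeros() << "\n";
      return 0;
    }

In compressed mode innerNonZeroPtr() returns the null pointer, which is exactly what the new isCompressed() helper in SparseCompressedBase tests.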
--- Eigen/SparseCore | 1 + Eigen/src/Core/util/Constants.h | 13 ++ Eigen/src/SparseCore/SparseCompressedBase.h | 199 ++++++++++++++++++++++++++++ Eigen/src/SparseCore/SparseMatrix.h | 131 +++--------------- Eigen/src/SparseCore/SparseUtil.h | 6 +- 5 files changed, 231 insertions(+), 119 deletions(-) create mode 100644 Eigen/src/SparseCore/SparseCompressedBase.h (limited to 'Eigen') diff --git a/Eigen/SparseCore b/Eigen/SparseCore index d5c0f6271..e2071519b 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -31,6 +31,7 @@ #include "src/SparseCore/SparseAssign.h" #include "src/SparseCore/CompressedStorage.h" #include "src/SparseCore/AmbiVector.h" +#include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 9b40093f0..d1855b50b 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100; * \sa \ref RowMajorBit, \ref TopicStorageOrders */ const unsigned int NoPreferredStorageOrderBit = 0x200; +/** \ingroup flags + * + * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format, + * that is, the expression provides: + * \code + inline const Scalar* valuePtr() const; + inline const Index* innerIndexPtr() const; + inline const Index* outerIndexPtr() const; + inline const Index* innerNonZeroPtr() const; + \endcode + */ +const unsigned int CompressedAccessBit = 0x400; + // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h new file mode 100644 index 000000000..00658181e --- /dev/null +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -0,0 +1,199 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H +#define EIGEN_SPARSE_COMPRESSED_BASE_H + +namespace Eigen { + +template class SparseCompressedBase; + +namespace internal { + +template +struct traits > : traits +{}; + +} // end namespace internal + +template +class SparseCompressedBase + : public SparseMatrixBase +{ + public: + typedef SparseMatrixBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase) + using Base::operator=; + using Base::IsRowMajor; + + class InnerIterator; + class ReverseInnerIterator; + + /** \returns a const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline const Scalar* valuePtr() const { return derived().valuePtr(); } + /** \returns a non-const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline Scalar* valuePtr() { return derived().valuePtr(); } + + /** \returns a const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. 
+ * \sa valuePtr(), outerIndexPtr() */ + inline const Index* innerIndexPtr() const { return derived().innerIndexPtr(); } + /** \returns a non-const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline Index* innerIndexPtr() { return derived().innerIndexPtr(); } + + /** \returns a const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline const Index* outerIndexPtr() const { return derived().outerIndexPtr(); } + /** \returns a non-const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline Index* outerIndexPtr() { return derived().outerIndexPtr(); } + + /** \returns a const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline const Index* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); } + /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline Index* innerNonZeroPtr() { return derived().innerNonZeroPtr(); } + + /** \returns whether \c *this is in compressed form. */ + inline bool isCompressed() const { return innerNonZeroPtr()==0; } + +}; + +template +class SparseCompressedBase::InnerIterator +{ + public: + InnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_end = mat.outerIndexPtr()[outer+1]; + else + m_end = m_id + mat.innerNonZeroPtr()[outer]; + } + + inline InnerIterator& operator++() { m_id++; return *this; } + + inline const Scalar& value() const { return m_values[m_id]; } + inline Scalar& valueRef() { return const_cast(m_values[m_id]); } + + inline Index index() const { return m_indices[m_id]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? 
index() : m_outer; } + + inline operator bool() const { return (m_id < m_end); } + + protected: + const Scalar* m_values; + const Index* m_indices; + const Index m_outer; + Index m_id; + Index m_end; + private: + // If you get here, then you're not using the right InnerIterator type, e.g.: + // SparseMatrix A; + // SparseMatrix::InnerIterator it(A,0); + template InnerIterator(const SparseMatrixBase&,Index outer); +}; + +template +class SparseCompressedBase::ReverseInnerIterator +{ + public: + ReverseInnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_id = mat.outerIndexPtr()[outer+1]; + else + m_id = m_start + mat.innerNonZeroPtr()[outer]; + } + + inline ReverseInnerIterator& operator--() { --m_id; return *this; } + + inline const Scalar& value() const { return m_values[m_id-1]; } + inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } + + inline Index index() const { return m_indices[m_id-1]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id > m_start); } + + protected: + const Scalar* m_values; + const Index* m_indices; + const Index m_outer; + Index m_id; + const Index m_start; +}; + +namespace internal { + +template +struct evaluator > + : evaluator_base +{ + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Index Index; + typedef typename Derived::InnerIterator InnerIterator; + typedef typename Derived::ReverseInnerIterator ReverseInnerIterator; + + enum { + CoeffReadCost = NumTraits::ReadCost, + Flags = Derived::Flags + }; + + evaluator() : m_matrix(0) {} + explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + + operator Derived&() { return m_matrix->const_cast_derived(); } + operator const Derived&() const { return *m_matrix; } + + typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; + Scalar coeff(Index row, Index col) const + { return m_matrix->coeff(row,col); } + + Scalar& coeffRef(Index row, Index col) + { + eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? col : row; + + Index start = m_matrix->outerIndexPtr()[outer]; + Index end = m_matrix->isCompressed() ? 
m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; + eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) + - m_matrix->innerIndexPtr(); + eigen_assert((pinnerIndexPtr()[p]==inner) && "written coefficient does not exist"); + return m_matrix->const_cast_derived().valuePtr()[p]; + } + + const Derived *m_matrix; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_COMPRESSED_BASE_H diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 93677c786..74b4c6a9d 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -51,7 +51,7 @@ struct traits > ColsAtCompileTime = Dynamic, MaxRowsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic, - Flags = _Options | NestByRefBit | LvalueBit, + Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; @@ -90,16 +90,20 @@ struct traits, DiagIndex> template class SparseMatrix - : public SparseMatrixBase > + : public SparseCompressedBase > { public: - EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) + typedef SparseCompressedBase Base; + using Base::isCompressed; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) typedef MappedSparseMatrix Map; typedef Diagonal DiagonalReturnType; typedef Diagonal ConstDiagonalReturnType; + typedef typename Base::InnerIterator InnerIterator; + typedef typename Base::ReverseInnerIterator ReverseInnerIterator; using Base::IsRowMajor; @@ -123,9 +127,6 @@ class SparseMatrix public: - /** \returns whether \c *this is in compressed form. */ - inline bool isCompressed() const { return m_innerNonZeros==0; } - /** \returns the number of rows of the matrix */ inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } /** \returns the number of columns of the matrix */ @@ -241,9 +242,6 @@ class SparseMatrix public: - class InnerIterator; - class ReverseInnerIterator; - /** Removes all non zeros but keep allocated memory */ inline void setZero() { @@ -875,76 +873,6 @@ private: }; }; -template -class SparseMatrix::InnerIterator -{ - public: - InnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_end = mat.m_outerIndex[outer+1]; - else - m_end = m_id + mat.m_innerNonZeros[outer]; - } - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline const Scalar& value() const { return m_values[m_id]; } - inline Scalar& valueRef() { return const_cast(m_values[m_id]); } - - inline Index index() const { return m_indices[m_id]; } - inline Index outer() const { return m_outer; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end); } - - protected: - const Scalar* m_values; - const Index* m_indices; - const Index m_outer; - Index m_id; - Index m_end; - private: - // If you get here, then you're not using the right InnerIterator type, e.g.: - // SparseMatrix A; - // SparseMatrix::InnerIterator it(A,0); - template InnerIterator(const SparseMatrixBase&,Index outer); -}; - -template -class SparseMatrix::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_id = mat.m_outerIndex[outer+1]; - else - m_id = m_start + mat.m_innerNonZeros[outer]; - } - - inline ReverseInnerIterator& operator--() { --m_id; return *this; } - - inline const Scalar& value() const { return m_values[m_id-1]; } - inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } - - inline Index index() const { return m_indices[m_id-1]; } - inline Index outer() const { return m_outer; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id > m_start); } - - protected: - const Scalar* m_values; - const Index* m_indices; - const Index m_outer; - Index m_id; - const Index m_start; -}; namespace internal { @@ -1075,6 +1003,10 @@ EIGEN_DONT_INLINE SparseMatrix& SparseMatrix::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator::Flags & RowMajorBit); if (needToTranspose) { @@ -1277,45 +1209,12 @@ namespace internal { template struct evaluator > - : evaluator_base > + : evaluator > > { - typedef _Scalar Scalar; - typedef _Index Index; + typedef evaluator > > Base; typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType; - typedef typename SparseMatrixType::InnerIterator InnerIterator; - typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = SparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {} - - operator SparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const SparseMatrixType&() const { return *m_matrix; } - - typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; - Scalar coeff(Index row, Index col) const - { return m_matrix->coeff(row,col); } - - Scalar& coeffRef(Index row, Index col) - { - eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); - - const Index outer = SparseMatrixType::IsRowMajor ? row : col; - const Index inner = SparseMatrixType::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? 
m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = m_matrix->data().searchLowerIndex(start,end-1,inner); - eigen_assert((pdata().index(p)==inner) && "written coefficient does not exist"); - return m_matrix->const_cast_derived().data().value(p); - } - - const SparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const SparseMatrixType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 8de227b88..ba4803646 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -43,8 +43,7 @@ EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) -#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \ - typedef BaseClass Base; \ +#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ typedef typename Eigen::internal::traits::Scalar Scalar; \ typedef typename Eigen::NumTraits::Real RealScalar; \ typedef typename Eigen::internal::nested::type Nested; \ @@ -59,7 +58,8 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) using Base::const_cast_derived; #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ - _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase) + typedef Eigen::SparseMatrixBase Base; \ + _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) const int CoherentAccessPattern = 0x1; const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern; -- cgit v1.2.3 From 08081f829397de11651d8d779b9eed916ccc88d7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:02:14 +0100 Subject: Make SparseTranspose inherit SparseCompressBase when possible --- Eigen/src/SparseCore/SparseTranspose.h | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index c3d2d1a16..37ce7b0d5 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -12,13 +12,41 @@ namespace Eigen { +namespace internal { + template + class SparseTransposeImpl + : public SparseMatrixBase > + {}; + + template + class SparseTransposeImpl + : public SparseCompressedBase > + { + typedef SparseCompressedBase > Base; + public: + using Base::derived; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + + inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); } + inline const Index* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); } + inline const Index* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); } + inline const Index* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); } + + inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); } + inline Index* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); } + inline Index* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); } + inline Index* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); } + }; +} + // Implement nonZeros() for transpose. I'm not sure that's the best approach for that. // Perhaps it should be implemented in Transpose<> itself. template class TransposeImpl - : public SparseMatrixBase > + : public internal::SparseTransposeImpl { protected: - typedef SparseMatrixBase > Base; + typedef internal::SparseTransposeImpl Base; public: inline typename MatrixType::Index nonZeros() const { return Base::derived().nestedExpression().nonZeros(); } }; -- cgit v1.2.3 From f3be317614b6b6709737d6ef7bea0ffe96c285d4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:03:25 +0100 Subject: Add a Map specialization. --- Eigen/SparseCore | 1 + Eigen/src/SparseCore/SparseMap.h | 211 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 Eigen/src/SparseCore/SparseMap.h (limited to 'Eigen') diff --git a/Eigen/SparseCore b/Eigen/SparseCore index e2071519b..91cdfd3f0 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -33,6 +33,7 @@ #include "src/SparseCore/AmbiVector.h" #include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" +#include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h new file mode 100644 index 000000000..91e8f7480 --- /dev/null +++ b/Eigen/src/SparseCore/SparseMap.h @@ -0,0 +1,211 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_MAP_H +#define EIGEN_SPARSE_MAP_H + +namespace Eigen { + +template::has_write_access ? 
WriteAccessors : ReadOnlyAccessors +> class SparseMapBase; + +template +class SparseMapBase + : public SparseCompressedBase +{ + public: + typedef SparseCompressedBase Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + enum { IsRowMajor = Base::IsRowMajor }; + + protected: + + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, const Scalar *>::type ScalarPointer; + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Index *, const Index *>::type IndexPointer; + + Index m_outerSize; + Index m_innerSize; + Index m_nnz; + IndexPointer m_outerIndex; + IndexPointer m_innerIndices; + ScalarPointer m_values; + IndexPointer m_innerNonZeros; + + public: + + inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } + inline Index innerSize() const { return m_innerSize; } + inline Index outerSize() const { return m_outerSize; } + + bool isCompressed() const { return m_innerNonZeros==0; } + + //---------------------------------------- + // direct access interface + inline const Scalar* valuePtr() const { return m_values; } + inline const Index* innerIndexPtr() const { return m_innerIndices; } + inline const Index* outerIndexPtr() const { return m_outerIndex; } + inline const Index* innerNonZeroPtr() const { return m_innerNonZeros; } + //---------------------------------------- + + inline Scalar coeff(Index row, Index col) const + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = m_outerIndex[outer]; + Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer]; + if (start==end) + return Scalar(0); + else if (end>0 && inner==m_innerIndices[end-1]) + return m_values[end-1]; + // ^^ optimization: let's first check if it is the last coefficient + // (very common in high level algorithms) + + const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); + const Index id = r-&m_innerIndices[0]; + return ((*r==inner) && (id +class SparseMapBase + : public SparseMapBase +{ + typedef MapBase ReadOnlyMapBase; + + public: + typedef SparseMapBase Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + //---------------------------------------- + // direct access interface + using Base::valuePtr; + using Base::innerIndexPtr; + using Base::outerIndexPtr; + using Base::innerNonZeroPtr; + inline Scalar* valuePtr() { return Base::m_values; } + inline Index* innerIndexPtr() { return Base::m_innerIndices; } + inline Index* outerIndexPtr() { return Base::m_outerIndex; } + inline Index* innerNonZeroPtr() { return Base::m_innerNonZeros; } + //---------------------------------------- + + inline Scalar& coeffRef(Index row, Index col) + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = Base::m_outerIndex[outer]; + Index end = Base::isCompressed() ? 
Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer]; + eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); + eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); + Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner); + const Index id = r - &Base::m_innerIndices[0]; + eigen_assert((*r==inner) && (id(Base::m_values)[id]; + } + + inline SparseMapBase(Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, + Scalar* valuePtr, Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template +class Map, Options, StrideType> + : public SparseMapBase, Options, StrideType> > +{ + public: + typedef SparseMapBase > Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, Index* outerIndexPtr, + Index* innerIndexPtr, Scalar* valuePtr, Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +template +class Map, Options, StrideType> + : public SparseMapBase, Options, StrideType> > +{ + public: + typedef SparseMapBase > Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, const Index* outerIndexPtr, + const Index* innerIndexPtr, const Scalar* valuePtr, const Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +namespace internal { + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Map, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Map, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_MAP_H -- cgit v1.2.3 From f2ff8c091e02b4aab8c7568807c09e43f51d1156 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:04:18 +0100 Subject: Add a Ref specialization. 
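For context, a minimal sketch of the intended usage (illustration only, not part of this patch): with the new specialization, a function taking Ref<const SparseMatrix<double> > binds without copy to a SparseMatrix, a Map, or any other compatible compressed expression, and falls back to an internal temporary otherwise. The helper name sumOfValues is made up for the example:

    #include <Eigen/SparseCore>
    #include <iostream>

    typedef Eigen::SparseMatrix<double> SpMat;

    // Binds without copy to a SparseMatrix, a Map<SparseMatrix>, or any other
    // compressed expression with matching storage order; anything else is first
    // evaluated into a temporary held inside the Ref.
    double sumOfValues(const Eigen::Ref<const SpMat>& mat)
    {
      double s = 0;
      for (int k = 0; k < mat.outerSize(); ++k)
        for (Eigen::Ref<const SpMat>::InnerIterator it(mat, k); it; ++it)
          s += it.value();
      return s;
    }

    int main()
    {
      SpMat A(4,4);
      A.insert(0,0) = 1.0;
      A.insert(2,3) = 2.5;
      A.makeCompressed();

      std::cout << sumOfValues(A)       << "\n";   // no temporary
      std::cout << sumOfValues(2.0 * A) << "\n";   // one temporary
      return 0;
    }

This mirrors what the new sparse_ref unit test checks with VERIFY_EVALUATION_COUNT: zero temporaries when the argument is already a compressed matrix with matching storage order, one otherwise.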
--- Eigen/SparseCore | 1 + Eigen/src/SparseCore/SparseRef.h | 200 +++++++++++++++++++++++++++++++++++++++ test/CMakeLists.txt | 1 + test/sparse_ref.cpp | 96 +++++++++++++++++++ 4 files changed, 298 insertions(+) create mode 100644 Eigen/src/SparseCore/SparseRef.h create mode 100644 test/sparse_ref.cpp (limited to 'Eigen') diff --git a/Eigen/SparseCore b/Eigen/SparseCore index 91cdfd3f0..48ed967b8 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -36,6 +36,7 @@ #include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" +#include "src/SparseCore/SparseRef.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" #include "src/SparseCore/SparseCwiseBinaryOp.h" #include "src/SparseCore/SparseTranspose.h" diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h new file mode 100644 index 000000000..cfea6ce8a --- /dev/null +++ b/Eigen/src/SparseCore/SparseRef.h @@ -0,0 +1,200 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_REF_H +#define EIGEN_SPARSE_REF_H + +namespace Eigen { + +namespace internal { + +template class SparseRefBase; + +template +struct traits, _Options, _StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + enum { + Options = _Options, + Flags = traits >::Flags | CompressedAccessBit | NestByRefBit + }; + + template struct match { + enum { + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch + }; + typedef typename internal::conditional::type type; + }; + +}; + +template +struct traits, _Options, _StrideType> > + : public traits, _Options, _StrideType> > +{ + enum { + Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + }; +}; + +template +struct traits > : public traits {}; + +template class SparseRefBase +// : public MappedSparseMatrix + : public SparseMapBase +{ +// typedef typename internal::traits::PlainObjectType PlainObjectType; + +public: + + typedef SparseMapBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase) + + SparseRefBase() + : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) + {} + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(SparseRefBase) + +protected: + + + template + void construct(Expression& expr) + { + ::new (static_cast(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr()); + } +}; + +} // namespace internal + +template +class Ref, Options, StrideType > + : public internal::SparseRefBase, Options, StrideType > > +{ + typedef SparseMatrix PlainObjectType; + typedef internal::traits Traits; + template + inline Ref(const SparseMatrix& expr); + template + inline Ref(const MappedSparseMatrix& expr); + public: + + typedef internal::SparseRefBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + + #ifndef EIGEN_PARSED_BY_DOXYGEN + template + inline Ref(SparseMatrix& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match >::MatchAtCompileTime), 
STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template + inline Ref(MappedSparseMatrix& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template + inline Ref(const SparseCompressedBase& expr) + #else + template + inline Ref(SparseCompressedBase& expr) + #endif + { + EIGEN_STATIC_ASSERT(bool(internal::is_lvalue::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); + EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.const_cast_derived()); + } + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref) + +}; + +// this is the const ref version +template +class Ref, Options, StrideType> + : public internal::SparseRefBase, Options, StrideType> > +{ + typedef SparseMatrix TPlainObjectType; + typedef internal::traits Traits; + public: + + typedef internal::SparseRefBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + template + inline Ref(const SparseMatrixBase& expr) + { + construct(expr.derived(), typename Traits::template match::type()); + } + + inline Ref(const Ref& other) : Base(other) { + // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy + } + + template + inline Ref(const RefBase& other) { + construct(other.derived(), typename Traits::template match::type()); + } + + protected: + + template + void construct(const Expression& expr,internal::true_type) + { + Base::construct(expr); + } + + template + void construct(const Expression& expr, internal::false_type) + { + m_object = expr; + Base::construct(m_object); + } + + protected: + TPlainObjectType m_object; +}; + + +namespace internal { + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Ref, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Ref, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_REF_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f57d8ce36..168749634 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -228,6 +228,7 @@ ei_add_test(stddeque) ei_add_test(sparse_basic) ei_add_test(sparse_vector) ei_add_test(sparse_product) +ei_add_test(sparse_ref) ei_add_test(sparse_solvers) ei_add_test(sparse_permutations) ei_add_test(simplicial_cholesky) diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp new file mode 100644 index 000000000..b261cecf3 --- /dev/null +++ b/test/sparse_ref.cpp @@ -0,0 +1,96 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 20015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +// This unit test cannot be easily written to work with EIGEN_DEFAULT_TO_ROW_MAJOR +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#undef EIGEN_DEFAULT_TO_ROW_MAJOR +#endif + +static long int nb_temporaries; + +inline void on_temporary_creation() { + // here's a great place to set a breakpoint when debugging failures in this test! + nb_temporaries++; +} + +#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); } + +#include "main.h" +#include + +#define VERIFY_EVALUATION_COUNT(XPR,N) {\ + nb_temporaries = 0; \ + XPR; \ + if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \ + VERIFY( (#XPR) && nb_temporaries==N ); \ + } + +template void check_const_correctness(const PlainObjectType&) +{ + // verify that ref-to-const don't have LvalueBit + typedef typename internal::add_const::type ConstPlainObjectType; + VERIFY( !(internal::traits >::Flags & LvalueBit) ); + VERIFY( !(internal::traits >::Flags & LvalueBit) ); + VERIFY( !(Ref::Flags & LvalueBit) ); + VERIFY( !(Ref::Flags & LvalueBit) ); +} + +template +EIGEN_DONT_INLINE void call_ref_1(Ref > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +template +EIGEN_DONT_INLINE void call_ref_2(const Ref >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +void call_ref() +{ +// SparseVector > ca = VectorXcf::Random(10).sparseView(); +// SparseVector a = VectorXf::Random(10).sparseView(); + SparseMatrix A = MatrixXf::Random(10,10).sparseView(); + SparseMatrix B = MatrixXf::Random(10,10).sparseView(); + const SparseMatrix& Ac(A); + Block > Ab(A,0,1, 3,3); + const Block > Abc(A,0,1,3,3); + + + VERIFY_EVALUATION_COUNT( call_ref_1(A, A), 0); +// VERIFY_EVALUATION_COUNT( call_ref_1(Ac, Ac), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(A, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A.transpose(), A.transpose()), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Ac,Ac), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A+A,2*Ac), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B, B), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1); + + Ref > Ar(A); + VERIFY_EVALUATION_COUNT( call_ref_1(Ar, Ar), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Ar, Ar), 0); + + Ref > Br(B); + VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Br, Br), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Br.transpose(), Br.transpose()), 0); + + Ref > Arc(A); +// VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 1); // should be 0 + + VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only) +} + +void test_sparse_ref() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( check_const_correctness(SparseMatrix()) ); + CALL_SUBTEST_1( check_const_correctness(SparseMatrix()) ); + CALL_SUBTEST_2( call_ref() ); + } +} -- cgit v1.2.3 From 3af29caae87b00ed8f002605573d227ad1b629a4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 10:23:45 +0100 Subject: Cleaning and add more unit tests for Ref and Map --- Eigen/src/SparseCore/MappedSparseMatrix.h | 172 +++--------------------------- Eigen/src/SparseCore/SparseMap.h | 34 +++++- Eigen/src/SparseCore/SparseRef.h | 12 +-- test/sparse_basic.cpp | 20 ++++ test/sparse_ref.cpp | 5 +- 5 files changed, 72 
insertions(+), 171 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h index 2852c669a..533479fd0 100644 --- a/Eigen/src/SparseCore/MappedSparseMatrix.h +++ b/Eigen/src/SparseCore/MappedSparseMatrix.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud +// Copyright (C) 2008-2014 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,9 +10,10 @@ #ifndef EIGEN_MAPPED_SPARSEMATRIX_H #define EIGEN_MAPPED_SPARSEMATRIX_H -namespace Eigen { +namespace Eigen { -/** \class MappedSparseMatrix +/** \deprecated Use Map > + * \class MappedSparseMatrix * * \brief Sparse matrix * @@ -25,179 +26,38 @@ namespace internal { template struct traits > : traits > {}; -} +} // end namespace internal template class MappedSparseMatrix - : public SparseMatrixBase > + : public Map > { - public: - EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix) - enum { IsRowMajor = Base::IsRowMajor }; - - protected: - - Index m_outerSize; - Index m_innerSize; - Index m_nnz; - Index* m_outerIndex; - Index* m_innerIndices; - Scalar* m_values; + typedef Map > Base; public: - - inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } - inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } - inline Index innerSize() const { return m_innerSize; } - inline Index outerSize() const { return m_outerSize; } - bool isCompressed() const { return true; } - - //---------------------------------------- - // direct access interface - inline const Scalar* valuePtr() const { return m_values; } - inline Scalar* valuePtr() { return m_values; } - - inline const Index* innerIndexPtr() const { return m_innerIndices; } - inline Index* innerIndexPtr() { return m_innerIndices; } - - inline const Index* outerIndexPtr() const { return m_outerIndex; } - inline Index* outerIndexPtr() { return m_outerIndex; } - //---------------------------------------- - - inline Scalar coeff(Index row, Index col) const - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? 
col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - if (start==end) - return Scalar(0); - else if (end>0 && inner==m_innerIndices[end-1]) - return m_values[end-1]; - // ^^ optimization: let's first check if it is the last coefficient - // (very common in high level algorithms) + typedef typename Base::Index Index; + typedef typename Base::Scalar Scalar; - const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); - const Index id = r-&m_innerIndices[0]; - return ((*r==inner) && (id=start && "you probably called coeffRef on a non finalized matrix"); - eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); - Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner); - const Index id = r-&m_innerIndices[0]; - eigen_assert((*r==inner) && (id -class MappedSparseMatrix::InnerIterator -{ - public: - InnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer]), - m_start(m_id), - m_end(mat.outerIndexPtr()[outer+1]) - {} - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id]; } - inline Scalar& valueRef() { return const_cast(m_matrix.valuePtr()[m_id]); } - - inline Index index() const { return m_matrix.innerIndexPtr()[m_id]; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const Index m_outer; - Index m_id; - const Index m_start; - const Index m_end; -}; - -template -class MappedSparseMatrix::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer+1]), - m_start(mat.outerIndexPtr()[outer]), - m_end(m_id) - {} - - inline ReverseInnerIterator& operator--() { m_id--; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; } - inline Scalar& valueRef() { return const_cast(m_matrix.valuePtr()[m_id-1]); } - - inline Index index() const { return m_matrix.innerIndexPtr()[m_id-1]; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const Index m_outer; - Index m_id; - const Index m_start; - const Index m_end; -}; - namespace internal { template struct evaluator > - : evaluator_base > + : evaluator > > { - typedef MappedSparseMatrix<_Scalar,_Options,_Index> MappedSparseMatrixType; - typedef typename MappedSparseMatrixType::InnerIterator InnerIterator; - typedef typename MappedSparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = MappedSparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const MappedSparseMatrixType &mat) : m_matrix(&mat) {} - - operator MappedSparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const MappedSparseMatrixType&() const { return *m_matrix; } + typedef MappedSparseMatrix<_Scalar,_Options,_Index> XprType; + typedef evaluator > Base; - const MappedSparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index 91e8f7480..72dedb1ec 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -12,6 +12,32 @@ namespace Eigen { +namespace internal { + +template +struct traits, Options, StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + typedef traits TraitsBase; + enum { + Flags = TraitsBase::Flags & (~NestByRefBit) + }; +}; + +template +struct traits, Options, StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + typedef traits TraitsBase; + enum { + Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit)) + }; +}; + +} // end namespace internal + template::has_write_access ? 
WriteAccessors : ReadOnlyAccessors > class SparseMapBase; @@ -25,7 +51,7 @@ class SparseMapBase typedef typename Base::Scalar Scalar; typedef typename Base::Index Index; enum { IsRowMajor = Base::IsRowMajor }; - + using Base::operator=; protected: typedef typename internal::conditional< @@ -103,6 +129,8 @@ class SparseMapBase typedef typename Base::Scalar Scalar; typedef typename Base::Index Index; enum { IsRowMajor = Base::IsRowMajor }; + + using Base::operator=; public: @@ -147,7 +175,7 @@ class Map, Options, StrideType> : public SparseMapBase, Options, StrideType> > { public: - typedef SparseMapBase > Base; + typedef SparseMapBase Base; _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) enum { IsRowMajor = Base::IsRowMajor }; @@ -167,7 +195,7 @@ class Map, Options, StrideType : public SparseMapBase, Options, StrideType> > { public: - typedef SparseMapBase > Base; + typedef SparseMapBase Base; _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) enum { IsRowMajor = Base::IsRowMajor }; diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h index cfea6ce8a..2ca039323 100644 --- a/Eigen/src/SparseCore/SparseRef.h +++ b/Eigen/src/SparseCore/SparseRef.h @@ -23,7 +23,7 @@ struct traits, _Options, _Stride typedef SparseMatrix PlainObjectType; enum { Options = _Options, - Flags = traits >::Flags | CompressedAccessBit | NestByRefBit + Flags = traits >::Flags | CompressedAccessBit | NestByRefBit }; template struct match { @@ -41,7 +41,7 @@ struct traits, _Options, _ : public traits, _Options, _StrideType> > { enum { - Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit }; }; @@ -49,11 +49,8 @@ template struct traits > : public traits {}; template class SparseRefBase -// : public MappedSparseMatrix : public SparseMapBase { -// typedef typename internal::traits::PlainObjectType PlainObjectType; - public: typedef SparseMapBase Base; @@ -63,8 +60,6 @@ public: : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) {} - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(SparseRefBase) - protected: @@ -119,9 +114,6 @@ class Ref, Options, StrideType > EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); Base::construct(expr.const_cast_derived()); } - - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref) - }; // this is the const ref version diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 097959c84..a19de296a 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -408,6 +408,26 @@ template void sparse_basic(const SparseMatrixType& re m.setFromTriplets(triplets.begin(), triplets.end()); VERIFY_IS_APPROX(m, refMat); } + + // test Map + { + DenseMatrix refMat2(rows, cols), refMat3(rows, cols); + SparseMatrixType m2(rows, cols), m3(rows, cols); + initSparse(density, refMat2, m2); + initSparse(density, refMat3, m3); + { + Map mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr()); + Map mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr()); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + } + { + MappedSparseMatrix mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr()); + MappedSparseMatrix mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), 
m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr()); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + } + } // test triangularView { diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp index b261cecf3..27700f827 100644 --- a/test/sparse_ref.cpp +++ b/test/sparse_ref.cpp @@ -69,8 +69,9 @@ void call_ref() VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1); Ref > Ar(A); - VERIFY_EVALUATION_COUNT( call_ref_1(Ar, Ar), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(Ar, Ar), 0); + VERIFY_IS_APPROX(Ar+Ar, A+A); + VERIFY_EVALUATION_COUNT( call_ref_1(Ar, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Ar, A), 0); Ref > Br(B); VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()), 0); -- cgit v1.2.3 From d4ec48575e94ec469ef9fbf005a0a6e67571f528 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 11:14:36 +0100 Subject: Make Block inherit SparseCompressedBase in the case of an inner-panels and fix valuePtr() innerIndexPtr() --- Eigen/src/Core/Block.h | 2 +- Eigen/src/SparseCore/SparseBlock.h | 17 ++++++++++++----- test/sparse_ref.cpp | 8 +++++--- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 9cf9d5432..5f6307206 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -87,7 +87,7 @@ struct traits > : traits::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, - Flags = (traits::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit + Flags = (traits::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit // FIXME DirectAccessBit should not be handled by expressions }; }; diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 9e4da2057..6b8ade5aa 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -74,7 +74,7 @@ namespace internal { template class sparse_matrix_block_impl - : public SparseMatrixBase > + : public SparseCompressedBase > { typedef typename internal::remove_all::type _MatrixTypeNested; typedef Block BlockType; @@ -172,19 +172,24 @@ public: } inline const Scalar* valuePtr() const - { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.valuePtr(); } inline Scalar* valuePtr() - { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.const_cast_derived().valuePtr(); } inline const Index* innerIndexPtr() const - { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.innerIndexPtr(); } inline Index* innerIndexPtr() - { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.const_cast_derived().innerIndexPtr(); } inline const Index* outerIndexPtr() const { return m_matrix.outerIndexPtr() + m_outerStart; } inline Index* outerIndexPtr() { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; } + + inline const Index* innerNonZeroPtr() const + { return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); } + inline Index* innerNonZeroPtr() + { return isCompressed() ? 
0 : m_matrix.const_cast_derived().innerNonZeroPtr(); } Index nonZeros() const { @@ -196,6 +201,8 @@ public: else return Map >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum(); } + + bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; } const Scalar& lastCoeff() const { diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp index 27700f827..e7380ba21 100644 --- a/test/sparse_ref.cpp +++ b/test/sparse_ref.cpp @@ -51,8 +51,8 @@ void call_ref() { // SparseVector > ca = VectorXcf::Random(10).sparseView(); // SparseVector a = VectorXf::Random(10).sparseView(); - SparseMatrix A = MatrixXf::Random(10,10).sparseView(); - SparseMatrix B = MatrixXf::Random(10,10).sparseView(); + SparseMatrix A = MatrixXf::Random(10,10).sparseView(0.5,1); + SparseMatrix B = MatrixXf::Random(10,10).sparseView(0.5,1); const SparseMatrix& Ac(A); Block > Ab(A,0,1, 3,3); const Block > Abc(A,0,1,3,3); @@ -82,7 +82,9 @@ void call_ref() // VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc), 0); // does not compile on purpose VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 1); // should be 0 + VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.col(2), A.col(2)), 0); VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only) } -- cgit v1.2.3 From 87629cd6395433966977e868ec091c3fc754956c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 11:41:25 +0100 Subject: bug #897: makes iterative sparse solvers use a Ref instead of a SparseMatrix pointer. This fixes usage of iterative solvers with a Map. --- Eigen/src/IterativeLinearSolvers/BiCGSTAB.h | 2 +- .../src/IterativeLinearSolvers/ConjugateGradient.h | 2 +- .../IterativeLinearSolvers/IterativeSolverBase.h | 53 +++++++++++++++------- test/sparse_solver.h | 5 +- 4 files changed, 41 insertions(+), 21 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 224fe913f..a50680133 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -193,7 +193,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) + if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index b5ef6d60f..3e024bda1 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -206,7 +206,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix->template selfadjointView(), b.col(j), xj, + internal::conjugate_gradient(mp_matrix.template selfadjointView(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index f33c868bb..6f48075b4 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -37,7 +37,7 @@ public: /** Default constructor. 
*/ IterativeSolverBase() - : mp_matrix(0) + : m_dummy(0,0), mp_matrix(m_dummy) { init(); } @@ -52,10 +52,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit IterativeSolverBase(const MatrixType& A) + template + explicit IterativeSolverBase(const SparseMatrixBase& A) { init(); - compute(A); + compute(A.derived()); } ~IterativeSolverBase() {} @@ -65,9 +66,11 @@ public: * Currently, this function mostly calls analyzePattern on the preconditioner. In the future * we might, for instance, implement column reordering for faster matrix vector products. */ - Derived& analyzePattern(const MatrixType& A) + template + Derived& analyzePattern(const SparseMatrixBase& A) { - m_preconditioner.analyzePattern(A); + grab(A); + m_preconditioner.analyzePattern(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_info = Success; @@ -83,11 +86,12 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - Derived& factorize(const MatrixType& A) + template + Derived& factorize(const SparseMatrixBase& A) { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); - mp_matrix = &A; - m_preconditioner.factorize(A); + grab(A); + m_preconditioner.factorize(mp_matrix); m_factorizationIsOk = true; m_info = Success; return derived(); @@ -103,10 +107,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - Derived& compute(const MatrixType& A) + template + Derived& compute(const SparseMatrixBase& A) { - mp_matrix = &A; - m_preconditioner.compute(A); + grab(A); + m_preconditioner.compute(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_factorizationIsOk = true; @@ -115,9 +120,9 @@ public: } /** \internal */ - Index rows() const { return mp_matrix ? mp_matrix->rows() : 0; } + Index rows() const { return mp_matrix.rows(); } /** \internal */ - Index cols() const { return mp_matrix ? mp_matrix->cols() : 0; } + Index cols() const { return mp_matrix.cols(); } /** \returns the tolerance threshold used by the stopping criteria */ RealScalar tolerance() const { return m_tolerance; } @@ -135,13 +140,18 @@ public: /** \returns a read-only reference to the preconditioner. */ const Preconditioner& preconditioner() const { return m_preconditioner; } - /** \returns the max number of iterations */ + /** \returns the max number of iterations. + * It is either the value setted by setMaxIterations or, by default, + * twice the number of columns of the matrix. + */ int maxIterations() const { - return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations; + return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations; } - /** Sets the max number of iterations */ + /** Sets the max number of iterations. + * Default is twice the number of columns of the matrix. 
+ */ Derived& setMaxIterations(int maxIters) { m_maxIterations = maxIters; @@ -210,7 +220,16 @@ protected: m_maxIterations = -1; m_tolerance = NumTraits::epsilon(); } - const MatrixType* mp_matrix; + + template + void grab(const SparseMatrixBase &A) + { + mp_matrix.~Ref(); + ::new (&mp_matrix) Ref(A); + } + + MatrixType m_dummy; + Ref mp_matrix; Preconditioner m_preconditioner; int m_maxIterations; diff --git a/test/sparse_solver.h b/test/sparse_solver.h index ee350d561..887ff02a9 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -32,7 +32,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, x = solver.solve(b); if (solver.info() != Success) { - std::cerr << "sparse solver testing: solving failed\n"; + std::cerr << "sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n"; return; } VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!"); @@ -75,7 +75,8 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, xm = solver.solve(bm); if (solver.info() != Success) { - std::cerr << "sparse solver testing: solving failed\n"; + std::cerr << "sparse solver testing: solving with a Map failed\n"; + exit(0); return; } VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!"); -- cgit v1.2.3 From c6e8caf0900ae303e9e7399bed00af705015ff17 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 10 Feb 2015 18:57:41 +0100 Subject: Allows Lower|Upper as a template argument of CG and MINRES: in this case the full matrix will be considered. --- Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 11 +++++++---- test/conjugate_gradient.cpp | 6 ++++-- test/sparse_solver.h | 5 ++++- unsupported/Eigen/src/IterativeSolvers/MINRES.h | 7 ++++++- unsupported/test/minres.cpp | 2 ++ 5 files changed, 23 insertions(+), 8 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 3e024bda1..4857dd9e9 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -113,8 +113,8 @@ struct traits > * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower. * \tparam _Preconditioner the type of the preconditioner. 
Default is DiagonalPreconditioner * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() @@ -197,6 +197,10 @@ public: template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional&, + SparseSelfAdjointView, UpLo> + >::type MatrixWrapperType; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -206,8 +210,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix.template selfadjointView(), b.col(j), xj, - Base::m_preconditioner, m_iterations, m_error); + internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } m_isInitialized = true; diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 869051b31..019cc4d64 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -12,13 +12,15 @@ template void test_conjugate_gradient_T() { - ConjugateGradient, Lower> cg_colmajor_lower_diag; - ConjugateGradient, Upper> cg_colmajor_upper_diag; + ConjugateGradient, Lower > cg_colmajor_lower_diag; + ConjugateGradient, Upper > cg_colmajor_upper_diag; + ConjugateGradient, Lower|Upper> cg_colmajor_loup_diag; ConjugateGradient, Lower, IdentityPreconditioner> cg_colmajor_lower_I; ConjugateGradient, Upper, IdentityPreconditioner> cg_colmajor_upper_I; CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_loup_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_I) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I) ); } diff --git a/test/sparse_solver.h b/test/sparse_solver.h index 887ff02a9..6cf99e0b0 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -195,7 +195,10 @@ int generate_sparse_spd_problem(Solver& , typename Solver::MatrixType& A, typena dA = dM * dM.adjoint(); halfA.resize(size,size); - halfA.template selfadjointView().rankUpdate(M); + if(Solver::UpLo==(Lower|Upper)) + halfA = A; + else + halfA.template selfadjointView().rankUpdate(M); return size; } diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index a34902001..65cffc255 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -250,6 +250,11 @@ namespace Eigen { template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional&, + SparseSelfAdjointView, UpLo> + >::type MatrixWrapperType; + m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -259,7 +264,7 @@ namespace Eigen { m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::minres(mp_matrix.template selfadjointView(), b.col(j), xj, + internal::minres(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index 81b762c37..f6e526bbd 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -21,6 +21,7 @@ template void test_minres_T() // Diagonal preconditioner MINRES, Lower, DiagonalPreconditioner > minres_colmajor_lower_diag; MINRES, Upper, DiagonalPreconditioner > minres_colmajor_upper_diag; + MINRES, Upper, DiagonalPreconditioner > minres_colmajor_uplo_diag; // call tests for SPD matrix CALL_SUBTEST( 
check_sparse_spd_solving(minres_colmajor_lower_I) ); @@ -28,6 +29,7 @@ template void test_minres_T() CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag) ); // TO DO: symmetric semi-definite matrix // TO DO: symmetric indefinite matrix -- cgit v1.2.3 From c3f3580b8f7e4af89f4c7cdbe036ac1cac750128 Mon Sep 17 00:00:00 2001 From: Jan Blechta Date: Tue, 10 Feb 2015 14:24:39 +0100 Subject: Fix bug #733: step by step solving is not a good example for solveWithGuess --- Eigen/src/IterativeLinearSolvers/BiCGSTAB.h | 6 +----- Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 15 +-------------- unsupported/Eigen/src/IterativeSolvers/GMRES.h | 17 ++--------------- unsupported/Eigen/src/IterativeSolvers/MINRES.h | 15 +-------------- 4 files changed, 5 insertions(+), 48 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index a50680133..31a43cb56 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -139,11 +139,7 @@ struct traits > * \include BiCGSTAB_simple.cpp * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * \include BiCGSTAB_step_by_step.cpp - * Note that such a step by step execution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 4857dd9e9..1e819fc9f 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -137,20 +137,7 @@ struct traits > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * cg.setMaxIterations(1); - * int i = 0; - * do { - * x = cg.solveWithGuess(b,x); - * std::cout << i << " : " << cg.error() << std::endl; - * ++i; - * } while (cg.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h index 39610c074..30f82b52e 100644 --- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h @@ -250,21 +250,8 @@ struct traits > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. 
Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * solver.setMaxIterations(1); - * int i = 0; - * do { - * x = solver.solveWithGuess(b,x); - * std::cout << i << " : " << solver.error() << std::endl; - * ++i; - * } while (solver.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. - * + * One can control the start using the solveWithGuess() method. + * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, typename _Preconditioner> diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index 93a83e5b7..c4d969a72 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -189,20 +189,7 @@ namespace Eigen { * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * mr.setMaxIterations(1); - * int i = 0; - * do { - * x = mr.solveWithGuess(b,x); - * std::cout << i << " : " << mr.error() << std::endl; - * ++i; - * } while (mr.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ -- cgit v1.2.3 From cc5d7ff5238da45ef7416ec94f18227486ed9643 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:02:38 -0800 Subject: Added vectorized implementation of the exponential function for ARM/NEON --- Eigen/Core | 1 + Eigen/src/Core/arch/NEON/MathFunctions.h | 91 ++++++++++++++++++++++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 Eigen/src/Core/arch/NEON/MathFunctions.h (limited to 'Eigen') diff --git a/Eigen/Core b/Eigen/Core index b5af63623..416e901ae 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -300,6 +300,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" #endif diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h new file mode 100644 index 000000000..6bb05bb92 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -0,0 +1,91 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_NEON_H +#define EIGEN_MATH_FUNCTIONS_NEON_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; + + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + + x = vminq_f32(x, p4f_exp_hi); + x = vmaxq_f32(x, p4f_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + Packet4ui mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, p4f_cephes_exp_C1); + Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, p4f_cephes_exp_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, p4f_1); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, p4i_0x7f); + mm = vshlq_n_s32(mm, 23); + Packet4f pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9afd86bec..559682cf7 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -88,7 +88,7 @@ template<> struct packet_traits : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, + HasExp = 1, HasSqrt = 0 }; }; -- cgit v1.2.3 From fe25f3b8e3ee86d808b5f8f75f789cee8dffd581 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 10 Feb 2015 23:11:35 +0100 Subject: FMA has been wrongly disabled --- Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 485bac10b..fc2e78b2d 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -133,7 +133,7 @@ template<> EIGEN_STRONG_INLINE 
Packet8i pdiv(const Packet8i& /*a*/, co return pset1(0); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3f6fb0254..63a859699 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -227,7 +227,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif -- cgit v1.2.3 From f669f5656ab550010c5dd92ce2da7d3fab07babd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:29:47 -0800 Subject: Marked a few functions as EIGEN_DEVICE_FUNC to enable the use of tensors in cuda kernels. --- Eigen/src/Core/util/Memory.h | 48 +++++++++++----------- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 4 +- 4 files changed, 29 insertions(+), 29 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index bacf236fb..16f8cc1b0 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = *** Implementation of generic aligned realloc (when no realloc can be used)*** *****************************************************************************/ -void* aligned_malloc(std::size_t size); -void aligned_free(void *ptr); +EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); +EIGEN_DEVICE_FUNC void aligned_free(void *ptr); /** \internal * \brief Reallocates aligned memory. 
@@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size) *****************************************************************************/ #ifdef EIGEN_NO_MALLOC -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)"); } #elif defined EIGEN_RUNTIME_NO_MALLOC -inline bool is_malloc_allowed_impl(bool update, bool new_value = false) +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) { static bool value = true; if (update == 1) value = new_value; return value; } -inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } -inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } +EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } #else -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ -inline void* aligned_malloc(size_t size) +EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size) } /** \internal Frees memory allocated with aligned_malloc. */ -inline void aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if !EIGEN_ALIGN std::free(ptr); @@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown. */ -template inline void* conditional_aligned_malloc(size_t size) +template EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { return aligned_malloc(size); } -template<> inline void* conditional_aligned_malloc(size_t size) +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc(size_t size) } /** \internal Frees memory allocated with conditional_aligned_malloc */ -template inline void conditional_aligned_free(void *ptr) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { aligned_free(ptr); } -template<> inline void conditional_aligned_free(void *ptr) +template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { std::free(ptr); } @@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc(void* ptr, size_t new /** \internal Destructs the elements of an array. * The \a size parameters tells on how many objects to call the destructor of T. 
*/ -template inline void destruct_elements_of_array(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size) { // always destruct an array starting from the end. if(ptr) @@ -351,7 +351,7 @@ template inline void destruct_elements_of_array(T *ptr, size_t size) /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template inline T* construct_elements_of_array(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size) { size_t i; EIGEN_TRY @@ -371,7 +371,7 @@ template inline T* construct_elements_of_array(T *ptr, size_t size) *****************************************************************************/ template -EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) { if(size > size_t(-1) / sizeof(T)) throw_std_bad_alloc(); @@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown. * The default constructor of T is called. */ -template inline T* aligned_new(size_t size) +template EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) { check_size_for_overflow(size); T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); @@ -396,7 +396,7 @@ template inline T* aligned_new(size_t size) } } -template inline T* conditional_aligned_new(size_t size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size) { check_size_for_overflow(size); T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); @@ -414,7 +414,7 @@ template inline T* conditional_aligned_new(size_t size) /** \internal Deletes objects constructed with aligned_new * The \a size parameters tells on how many objects to call the destructor of T. */ -template inline void aligned_delete(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size) { destruct_elements_of_array(ptr, size); aligned_free(ptr); @@ -423,13 +423,13 @@ template inline void aligned_delete(T *ptr, size_t size) /** \internal Deletes objects constructed with conditional_aligned_new * The \a size parameters tells on how many objects to call the destructor of T. */ -template inline void conditional_aligned_delete(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size) { destruct_elements_of_array(ptr, size); conditional_aligned_free(ptr); } -template inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) { check_size_for_overflow(new_size); check_size_for_overflow(old_size); @@ -452,7 +452,7 @@ template inline T* conditional_aligned_realloc_new(T* pt } -template inline T* conditional_aligned_new_auto(size_t size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size) { if(size==0) return 0; // short-cut. 
Also fixes Bug 884 @@ -495,7 +495,7 @@ template inline T* conditional_aligned_realloc_new_auto( return result; } -template inline void conditional_aligned_delete_auto(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size) { if(NumTraits::RequireInitialization) destruct_elements_of_array(ptr, size); diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 0e8a4b8d6..037219f23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -350,7 +350,7 @@ class Tensor : public TensorBase > } #endif - void resize(const array& dimensions) + EIGEN_DEVICE_FUNC void resize(const array& dimensions) { std::size_t i; Index size = Index(1); @@ -367,7 +367,7 @@ class Tensor : public TensorBase > #endif } - void resize(const DSizes& dimensions) { + EIGEN_DEVICE_FUNC void resize(const DSizes& dimensions) { array dims; for (std::size_t i = 0; i < NumIndices; ++i) { dims[i] = dimensions[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index d81197e6d..2ad52b2f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -275,7 +275,7 @@ struct DSizes : array { } #endif - DSizes& operator = (const array& other) { + EIGEN_DEVICE_FUNC DSizes& operator = (const array& other) { *static_cast(this) = other; return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index dfe85602a..1b227e8c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -112,9 +112,9 @@ class TensorStorage& dimensions() const {return m_dimensions;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const {return m_dimensions;} - void resize(DenseIndex size, const array& nbDimensions) + EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array& nbDimensions) { const DenseIndex currentSz = internal::array_prod(m_dimensions); if(size != currentSz) -- cgit v1.2.3 From 409547a0c83604b6dea70b8523674ac19e2af958 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 12 Feb 2015 21:04:31 +0100 Subject: update EIGEN_FAST_MATH documentation --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 13f8fdd4e..27167d3dc 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -385,7 +385,7 @@ /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: - * - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled. + * - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled. */ #ifndef EIGEN_FAST_MATH #define EIGEN_FAST_MATH 1 -- cgit v1.2.3
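
A minimal sketch of what the bug #897 change above enables, assuming this revision of Eigen: because IterativeSolverBase now stores a Ref<const SparseMatrix> and its analyzePattern()/factorize()/compute() members are templated on SparseMatrixBase, an iterative solver can be fed a Map of externally owned sparse buffers directly. The problem size, the identity matrix, and the main() wrapper below are placeholders chosen only to make the snippet self-contained; the Map constructor and the solver calls are the ones exercised by the diffs above.

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    using namespace Eigen;

    int main()
    {
      const int n = 100;
      SparseMatrix<double> A(n, n);
      A.setIdentity();                       // placeholder system, trivially solvable
      VectorXd b = VectorXd::Random(n);

      // View A's own buffers through a Map instead of handing over the SparseMatrix itself.
      Map<SparseMatrix<double> > Amap(A.rows(), A.cols(), A.nonZeros(),
                                      A.outerIndexPtr(), A.innerIndexPtr(),
                                      A.valuePtr(), A.innerNonZeroPtr());

      BiCGSTAB<SparseMatrix<double> > solver;
      solver.compute(Amap);                  // previously required a SparseMatrix; now any SparseMatrixBase
      VectorXd x = solver.solve(b);
      return 0;
    }

Since the solver now keeps only a reference to the matrix, the mapped storage (here, A) must stay alive and unchanged between compute() and any subsequent solve() calls.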
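
Likewise, a hedged sketch of the new Lower|Upper template argument for ConjugateGradient introduced above, together with the solveWithGuess() entry point that the revised documentation now points to instead of the step-by-step loop. The helper name, tolerance, and iteration count are illustration values, not part of the library.

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    using namespace Eigen;

    VectorXd solve_spd(const SparseMatrix<double>& A, const VectorXd& b, const VectorXd& guess)
    {
      // With Lower|Upper the solver uses the full matrix A as-is, instead of
      // reconstructing it from a single triangle through a selfadjoint view.
      ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
      cg.setMaxIterations(200);             // arbitrary illustration values
      cg.setTolerance(1e-10);
      cg.compute(A);                        // A must store both triangles of the SPD matrix
      return cg.solveWithGuess(b, guess);   // seeds the iterations instead of starting from x = 0
    }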
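
Finally, a small sketch of what the NEON pexp kernel added above accelerates, assuming a build with EIGEN_VECTORIZE_NEON: since HasExp is switched to 1 for Packet4f, the coefficient-wise exponential of float arrays is evaluated four entries at a time with no change to user code. The function name and the max-shift overflow guard are illustrative only.

    #include <Eigen/Core>
    using namespace Eigen;

    ArrayXf stable_exp(const ArrayXf& x)
    {
      // .exp() dispatches to internal::pexp<Packet4f> when NEON vectorization is
      // enabled; shifting by the maximum coefficient is a common overflow guard.
      return (x - x.maxCoeff()).exp();
    }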