227 files changed, 32433 insertions, 5999 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 05d92babe..00287c9bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ project(Eigen) -cmake_minimum_required(VERSION 2.8.2) +cmake_minimum_required(VERSION 2.8.4) # guard against in-source builds @@ -448,6 +448,7 @@ if(cmake_generator_tolower MATCHES "makefile") message(STATUS "make check | Build and run the unit-tests. Read this page:") message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)") + message(STATUS "make uninstall| Removes files installed by make install") message(STATUS "--------------+--------------------------------------------------------------") else() message(STATUS "To build/run the unit tests, read this page:") @@ -483,3 +484,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake DESTINATION ${EIGEN_CONFIG_CMAKE_PATH} ) + +# Add uninstall target +add_custom_target ( uninstall + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) diff --git a/Eigen/Core b/Eigen/Core index dcb20bfd0..216428e97 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -125,9 +125,7 @@ #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif + // include files // This extern "C" works around a MINGW-w64 compilation issue @@ -187,6 +185,11 @@ #endif #endif +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include <vector_types.h> +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -302,9 +305,15 @@ using std::ptrdiff_t; #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" #endif +#if defined EIGEN_VECTORIZE_CUDA + #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/MathFunctions.h" +#endif + #include "src/Core/arch/Default/Settings.h" #include "src/Core/functors/BinaryFunctors.h" diff --git a/Eigen/SparseCore b/Eigen/SparseCore index b68c8fa8a..48ed967b8 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -26,21 +26,17 @@ * This module depends on: Core. */ -namespace Eigen { - -/** The type used to identify a general sparse storage. */ -struct Sparse {}; - -} - #include "src/SparseCore/SparseUtil.h" #include "src/SparseCore/SparseMatrixBase.h" #include "src/SparseCore/SparseAssign.h" #include "src/SparseCore/CompressedStorage.h" #include "src/SparseCore/AmbiVector.h" +#include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" +#include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" +#include "src/SparseCore/SparseRef.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" #include "src/SparseCore/SparseCwiseBinaryOp.h" #include "src/SparseCore/SparseTranspose.h" diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 6ea383695..2ef37ca1c 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -87,7 +87,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? 
RowMajorBit : 0, - Flags = (traits<XprType>::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit + Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit // FIXME DirectAccessBit should not be handled by expressions }; }; diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index 98ebe3bf6..808f3977c 100644 --- a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h @@ -28,7 +28,6 @@ template<typename XprType> struct CommaInitializer { typedef typename XprType::Scalar Scalar; - typedef typename XprType::StorageIndex StorageIndex; EIGEN_DEVICE_FUNC inline CommaInitializer(XprType& xpr, const Scalar& s) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index eb35b44cb..9485080d3 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -111,6 +111,7 @@ struct evaluator_base typedef evaluator<ExpressionType> type; typedef evaluator<ExpressionType> nestedType; + // FIXME is it really usefull? typedef typename traits<ExpressionType>::StorageIndex StorageIndex; // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices. typedef traits<ExpressionType> ExpressionTraits; @@ -128,7 +129,6 @@ struct evaluator<PlainObjectBase<Derived> > : evaluator_base<Derived> { typedef PlainObjectBase<Derived> PlainObjectType; - typedef typename PlainObjectType::StorageIndex StorageIndex; typedef typename PlainObjectType::Scalar Scalar; typedef typename PlainObjectType::CoeffReturnType CoeffReturnType; typedef typename PlainObjectType::PacketScalar PacketScalar; @@ -264,7 +264,6 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased> EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -343,7 +342,6 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> > : m_functor(n.functor()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -394,7 +392,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased > m_argImpl(op.nestedExpression()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -469,7 +466,6 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase m_rhsImpl(xpr.rhs()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -522,7 +518,6 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased> m_argImpl(op.nestedExpression()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -563,7 +558,6 @@ struct mapbase_evaluator : evaluator_base<Derived> { typedef Derived XprType; typedef typename XprType::PointerType PointerType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef 
typename XprType::PacketScalar PacketScalar; @@ -760,7 +754,6 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa m_startCol(block.startCol()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -865,7 +858,6 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > m_elseImpl(select.elseMatrix()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const @@ -898,7 +890,6 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> > : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> > { typedef Replicate<ArgType, RowFactor, ColFactor> XprType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; enum { @@ -981,7 +972,6 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> > : m_expr(expr) {} - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const @@ -1016,7 +1006,6 @@ struct evaluator_wrapper_base EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} - typedef typename ArgType::StorageIndex StorageIndex; typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; typedef typename ArgType::PacketScalar PacketScalar; @@ -1103,7 +1092,6 @@ struct unary_evaluator<Reverse<ArgType, Direction> > : evaluator_base<Reverse<ArgType, Direction> > { typedef Reverse<ArgType, Direction> XprType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -1219,9 +1207,10 @@ struct evaluator<Diagonal<ArgType, DiagIndex> > m_index(diagonal.index()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; + // FIXME having to check whether ArgType is sparse here i not very nice. 
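The FIXME just above concerns the conditional typedef that follows: for a sparse argument, coeff() on a Diagonal view may have to produce a zero that exists nowhere in memory, so the return type must be a plain Scalar rather than a reference. A minimal sketch of the same compile-time selection, using std::conditional as a stand-in for Eigen's internal::conditional (the Dense/Sparse tags and diagonal_coeff_return below are illustrative, not Eigen's types):

    #include <type_traits>

    struct Dense {};   // illustrative stand-ins for Eigen's storage-kind tags
    struct Sparse {};

    template <typename StorageKind, typename Scalar>
    struct diagonal_coeff_return {
      // Dense storage can hand out a const reference into the stored data;
      // sparse storage may have to return a conceptual zero by value.
      typedef typename std::conditional<
          !std::is_same<StorageKind, Sparse>::value,
          const Scalar&,
          Scalar>::type type;
    };

    static_assert(std::is_same<diagonal_coeff_return<Dense,  double>::type, const double&>::value, "");
    static_assert(std::is_same<diagonal_coeff_return<Sparse, double>::type, double>::value, "");

Eigen carries its own internal::conditional because it still supports C++03 compilers, where <type_traits> is unavailable.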
+ typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value, + typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const { diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index f37091000..56beaf3bc 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -326,6 +326,12 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar> dst.setZero(); dst.diagonal() = src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/) + { dst.diagonal() += src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/) + { dst.diagonal() -= src.diagonal(); } }; } // namespace internal diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 4eb01b1b3..81750722c 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -11,7 +11,7 @@ #ifndef EIGEN_GENERAL_PRODUCT_H #define EIGEN_GENERAL_PRODUCT_H -namespace Eigen { +namespace Eigen { enum { Large = 2, @@ -252,12 +252,12 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true> bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), evalToDest ? dest.data() : static_dest.data()); - + if(!evalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN @@ -273,11 +273,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true> MappedDest(actualDestPtr, dest.size()) = dest; } + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; general_matrix_vector_product - <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhs.data(), actualRhs.innerStride(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), actualDestPtr, 1, compatibleAlpha); @@ -333,11 +335,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true> Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; } + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; general_matrix_vector_product - <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + <Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhsPtr, 1, + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhsPtr, 1), dest.data(), dest.innerStride(), actualAlpha); } @@ -410,7 +414,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const #ifdef 
EIGEN_DEBUG_PRODUCT internal::product_type<Derived,OtherDerived>::debug(); #endif - + return Product<Derived, OtherDerived>(derived(), other.derived()); } diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 81f4eef40..8759cd06c 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -54,6 +54,7 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, @@ -94,6 +95,8 @@ template<typename T> struct packet_traits : default_packet_traits }; }; +template<typename T> struct packet_traits<const T> : packet_traits<T> { }; + /** \internal \returns a + b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, @@ -356,7 +359,7 @@ pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */ template<typename Packet, int LoadMode> -inline Packet ploadt(const typename unpacket_traits<Packet>::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from) { if(LoadMode == Aligned) return pload<Packet>(from); @@ -367,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from) /** \internal copy the packet \a from to \a *to. * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */ template<typename Scalar, typename Packet, int LoadMode> -inline void pstoret(Scalar* to, const Packet& from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) { if(LoadMode == Aligned) pstore(to, from); @@ -375,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from) pstoreu(to, from); } +/** \internal \returns a packet version of \a *from. + * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the + * hardware if available to speedup the loading of data that won't be modified + * by the current computation. + */ +template<typename Packet, int LoadMode> +inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from) +{ + return ploadt<Packet, LoadMode>(from); +} + /** \internal default implementation of palign() allowing partial specialization */ template<int Offset,typename PacketType> struct palign_impl @@ -433,6 +447,19 @@ ptranspose(PacketBlock<Packet,1>& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } +/*************************************************************************** + * Selector, i.e. vector of N boolean values used to select (i.e. blend) + * words from 2 packets. +***************************************************************************/ +template <size_t N> struct Selector { + bool select[N]; +}; + +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + return ifPacket.select[0] ? 
thenPacket : elsePacket; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 3dafee9d7..8dca9796d 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -171,6 +171,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors> template<typename Derived> class MapBase<Derived, WriteAccessors> : public MapBase<Derived, ReadOnlyAccessors> { + typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase; public: typedef MapBase<Derived, ReadOnlyAccessors> Base; @@ -238,11 +239,13 @@ template<typename Derived> class MapBase<Derived, WriteAccessors> EIGEN_DEVICE_FUNC Derived& operator=(const MapBase& other) { - Base::Base::operator=(other); + ReadOnlyMapBase::Base::operator=(other); return derived(); } - using Base::Base::operator=; + // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, + // see bugs 821 and 920. + using ReadOnlyMapBase::Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4c5fc1cae..16ad2dc7e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -14,7 +14,7 @@ namespace Eigen { // On WINCE, std::abs is defined for int only, so let's defined our own overloads: // This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too. -#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500 +#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500 long abs(long x) { return (labs(x)); } double abs(double x) { return (fabs(x)); } float abs(float x) { return (fabsf(x)); } @@ -360,50 +360,31 @@ inline NewType cast(const OldType& x) } /**************************************************************************** -* Implementation of atanh2 * +* Implementation of logp1 * ****************************************************************************/ template<typename Scalar> -struct atanh2_impl +struct log1p_impl { - static inline Scalar run(const Scalar& x, const Scalar& r) + static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if (__cplusplus >= 201103L) && !defined(__CYGWIN__) + // Let's be conservative and enable the default C++11 implementation only if we are sure it exists + #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC) using std::log1p; - return log1p(2 * x / (r - x)) / 2; + return log1p(x); #else - using std::abs; + typedef typename NumTraits<Scalar>::Real RealScalar; using std::log; - using std::sqrt; - Scalar z = x / r; - if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon())) - return log((r + x) / (r - x)) / 2; - else - return z + z*z*z / 3; + Scalar x1p = RealScalar(1) + x; + return ( x1p == Scalar(1) ) ? 
x : x * ( log(x1p) / (x1p - RealScalar(1)) ); #endif } }; -template<typename RealScalar> -struct atanh2_impl<std::complex<RealScalar> > -{ - typedef std::complex<RealScalar> Scalar; - static inline Scalar run(const Scalar& x, const Scalar& r) - { - using std::log; - using std::norm; - using std::sqrt; - Scalar z = x / r; - if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon()) - return RealScalar(0.5) * log((r + x) / (r - x)); - else - return z + z*z*z / RealScalar(3); - } -}; - template<typename Scalar> -struct atanh2_retval +struct log1p_retval { typedef Scalar type; }; @@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& template<typename Scalar> EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) +inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); + return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } template<typename Scalar> diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index b2c9b56ed..7f9d135f7 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -210,23 +210,26 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct> template<typename Dst, typename Lhs, typename Rhs, typename Func> EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { + typename evaluator<Rhs>::type rhsEval(rhs); // FIXME make sure lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and rhs + // FIXME we should probably build an evaluator for dst const Index cols = dst.cols(); for (Index j=0; j<cols; ++j) - func(dst.col(j), rhs.coeff(0,j) * lhs); + func(dst.col(j), rhsEval.coeff(0,j) * lhs); } // Row major result template<typename Dst, typename Lhs, typename Rhs, typename Func> -EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { +EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +{ + typename evaluator<Lhs>::type lhsEval(lhs); // FIXME make sure rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and lhs + // FIXME we should probably build an evaluator for dst const Index rows = dst.rows(); for (Index i=0; i<rows; ++i) - func(dst.row(i), lhs.coeff(i,0) * rhs); + func(dst.row(i), lhsEval.coeff(i,0) * rhs); } template<typename Lhs, typename Rhs> diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 0f17e3a89..f97048bda 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic> typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? 
RowMajor : ColMajor, (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor> diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index e1316a73d..5d60ba149 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -213,18 +213,39 @@ MatrixBase<Derived>::adjoint() const namespace internal { template<typename MatrixType, - bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic> + bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic, + bool MatchPacketSize = + (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size)) + && (internal::evaluator<MatrixType>::Flags&PacketAccessBit) > struct inplace_transpose_selector; template<typename MatrixType> -struct inplace_transpose_selector<MatrixType,true> { // square matrix +struct inplace_transpose_selector<MatrixType,true,false> { // square matrix static void run(MatrixType& m) { m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose()); } }; +// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. template<typename MatrixType> -struct inplace_transpose_selector<MatrixType,false> { // non square matrix +struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize + static void run(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet; + typedef typename MatrixType::Index Index; + const Index PacketSize = internal::packet_traits<Scalar>::size; + const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned; + PacketBlock<Packet> A; + for (Index i=0; i<PacketSize; ++i) + A.packet[i] = m.template packetByOuterInner<Alignment>(i,0); + internal::ptranspose(A); + for (Index i=0; i<PacketSize; ++i) + m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]); + } +}; + +template<typename MatrixType,bool MatchPacketSize> +struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix static void run(MatrixType& m) { if (m.rows()==m.cols()) m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose()); diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e2376bd1f..be66a502a 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif @@ -58,7 +58,8 @@ template<> struct packet_traits<float> : default_packet_traits HasCos = 0, HasLog = 0, HasExp = 0, - HasSqrt = 0 + HasSqrt = 0, + HasBlend = 1 }; }; template<> struct packet_traits<double> : default_packet_traits @@ -72,7 +73,8 @@ template<> struct packet_traits<double> : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0 + HasExp = 0, + HasBlend = 1 }; }; @@ -133,7 +135,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co return pset1<Packet8i>(0); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const 
Packet8f& b, const Packet8f& c) { #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, @@ -557,6 +559,19 @@ ptranspose(PacketBlock<Packet4d,4>& kernel) { kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index fa02f57a1..6b68fc7a5 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,12 +18,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h new file mode 100644 index 000000000..e7305c01e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -0,0 +1,75 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
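The pblend specializations just above complete the Selector/pblend primitive added to GenericPacketMath.h earlier in this changeset: a Selector<N> holds one boolean per lane, and pblend picks each result lane from thenPacket or elsePacket accordingly (the SSE/AVX paths build a comparison mask and use blendv-style instructions to stay branch-free). A scalar model of the contract, runnable without SIMD headers — pblend_model is an illustration, not Eigen's implementation:

    #include <cstddef>

    template <std::size_t N>
    struct Selector {          // mirrors internal::Selector from the diff
      bool select[N];
    };

    // Scalar model of pblend: lane i comes from thenPacket when
    // ifPacket.select[i] is true, otherwise from elsePacket.
    template <typename T, std::size_t N>
    void pblend_model(const Selector<N>& ifPacket, const T (&thenPacket)[N],
                      const T (&elsePacket)[N], T (&result)[N]) {
      for (std::size_t i = 0; i < N; ++i)
        result[i] = ifPacket.select[i] ? thenPacket[i] : elsePacket[i];
    }

    int main() {
      Selector<4> s = {{true, false, true, false}};
      float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40}, r[4];
      pblend_model(s, a, b, r);   // r is now {1, 20, 3, 40}
    }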
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> EIGEN_STRONG_INLINE +float4 plog<float4>(const float4& a) +{ + return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 plog<double2>(const double2& a) +{ + return make_double2(log(a.x), log(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 pexp<float4>(const float4& a) +{ + return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 pexp<double2>(const double2& a) +{ + return make_double2(exp(a.x), exp(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 psqrt<float4>(const float4& a) +{ + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 psqrt<double2>(const double2& a) +{ + return make_double2(sqrt(a.x), sqrt(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 prsqrt<float4>(const float4& a) +{ + return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 prsqrt<double2>(const double2& a) +{ + return make_double2(rsqrt(a.x), rsqrt(a.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 000000000..19749c832 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,296 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic<float4> { enum { value = true }; }; +template<> struct is_arithmetic<double2> { enum { value = true }; }; + + +template<> struct packet_traits<float> : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits<float4> { typedef float type; enum {size=4}; typedef float4 half; }; +template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) { + return *reinterpret_cast<const float4*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) { + return *reinterpret_cast<const double2*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) { + *reinterpret_cast<float4*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) { + *reinterpret_cast<double2*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +#ifdef __CUDA_ARCH__ +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { + return __ldg((const float4*)from); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { + return __ldg((const double2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) { + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) { + return make_double2(__ldg(from+0), __ldg(from+1)); +} +#endif + +template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline 
double pfirst<double2>(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) { + return fmin(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) { + return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) { + return make_double2(abs(a.x), abs(a.y)); +} + + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<float4,4>& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + + tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<double2,2>& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h new file mode 100644 index 000000000..6bb05bb92 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -0,0 +1,91 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
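The NEON pexp that follows uses the classic Cephes scheme: split x as n*ln(2) + g with small g, evaluate a polynomial for exp(g), then reconstruct 2^n by writing n+127 straight into the exponent bits of a float (the vshlq_n_s32(mm, 23) step). A scalar sketch of that reduction and reconstruction — the polynomial here is a deliberately short stand-in for the cephes_exp_p0..p5 coefficients in the diff, and IEEE-754 binary32 is assumed:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Build 2^n by placing (n + 127) in the exponent field of an IEEE-754
    // binary32 float -- the scalar analogue of vaddq_s32(mm, p4i_0x7f)
    // followed by vshlq_n_s32(mm, 23) in the NEON kernel. Valid for
    // n in [-126, 127].
    static float pow2n(int n) {
      std::uint32_t bits = static_cast<std::uint32_t>(n + 127) << 23;
      float r;
      std::memcpy(&r, &bits, sizeof r);   // type-pun without aliasing UB
      return r;
    }

    static float exp_model(float x) {
      const float LOG2E = 1.44269504088896341f;   // 1/ln(2), as in the diff
      int n = static_cast<int>(std::floor(x * LOG2E + 0.5f));
      float g = x - static_cast<float>(n) * 0.693147180559945f;  // x - n*ln(2)
      // Deliberately short polynomial for exp(g); the diff uses the
      // degree-5 Cephes coefficients instead.
      float eg = 1.0f + g * (1.0f + g * (0.5f + g * (1.0f / 6.0f)));
      return eg * pow2n(n);
    }

    int main() {
      std::printf("%f vs std::exp: %f\n", exp_model(1.0f), std::exp(1.0f));
    }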
+ +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_NEON_H +#define EIGEN_MATH_FUNCTIONS_NEON_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp<Packet4f>(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; + + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + + x = vminq_f32(x, p4f_exp_hi); + x = vmaxq_f32(x, p4f_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + Packet4ui mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, p4f_cephes_exp_C1); + Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, p4f_cephes_exp_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, p4f_1); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, p4i_0x7f); + mm = vshlq_n_s32(mm, 23); + Packet4f pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6c5c669a1..559682cf7 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -20,12 +20,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator @@ -88,7 +88,7 @@ template<> struct packet_traits<float> : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, + HasExp = 1, HasSqrt = 0 }; }; @@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co 
return pset1<Packet4i>(0); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. +// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. +// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. +// MLA is not fused i.e. does 2 roundings. +// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): +// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +#endif + +// No FMA instruction for int, so use MLA unconditionally. template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } @@ -492,6 +503,21 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) { //---------- double ---------- #if EIGEN_ARCH_ARM64 +#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) +// Bug 907: workaround missing declarations of the following two functions in the ADK +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_f64 (float64x2_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_f64_u64 (uint64x2_t __a) +{ + return (float64x2_t) __a; +} +#endif + typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; @@ -536,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
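The pmadd overloads around this point switch from vmlaq_f32/vmlaq_f64 (separate multiply and add, two roundings) to vfmaq_f32/vfmaq_f64 (one rounding) when __ARM_FEATURE_FMA is available, which per the bug 936 notes above is both faster and more accurate. The rounding difference is easy to observe in portable C++ with std::fmaf; a small demonstration (compile with contraction disabled, e.g. -ffp-contract=off, so the compiler does not fuse the unfused line itself):

    #include <cmath>
    #include <cstdio>

    int main() {
      // a = 1 + 2^-12. Exactly, a*a = 1 + 2^-11 + 2^-24, whose last term
      // falls off the end of a float mantissa: the separate multiply rounds
      // it away, while fmaf keeps it (a single rounding at the end).
      float a = 1.0f + std::ldexp(1.0f, -12);
      float unfused = a * a - 1.0f;             // two roundings, like vmlaq
      float fused   = std::fmaf(a, a, -1.0f);   // one rounding, like vfmaq
      std::printf("unfused = %.9g, fused = %.9g\n", unfused, fused);
    }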
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +#endif template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } @@ -597,7 +627,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } -template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); } +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) { @@ -613,7 +648,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) } // Other reduction functions: // mul -template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); } +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif // min template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 414c4cb6a..565e448fe 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -44,7 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; #endif @@ -472,6 +473,11 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 28427c308..898cb9ab0 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif @@ -108,7 +108,8 @@ template<> struct packet_traits<float> : default_packet_traits HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; template<> struct packet_traits<double> : default_packet_traits @@ -123,7 +124,8 @@ template<> struct 
packet_traits<double> : default_packet_traits
     HasDiv  = 1,
     HasExp  = 1,
-    HasSqrt = 1
+    HasSqrt = 1,
+    HasBlend = 1
   };
 };
 #endif
@@ -135,7 +137,9 @@ template<> struct packet_traits<int> : default_packet_traits
     // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size=4,
+
+    HasBlend = 1
   };
 };
@@ -227,7 +231,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 // for some weird reasons, it has to be overloaded for packets of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
 #endif
@@ -809,6 +813,37 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  const __m128 zero = _mm_setzero_ps();
+  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128 false_mask = _mm_cmpeq_ps(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  const __m128d zero = _mm_setzero_pd();
+  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
+  __m128d false_mask = _mm_cmpeq_pd(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
+#endif
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 7b2ed6728..11e5f591d 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
 }
 
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
-  static std::ptrdiff_t m_l1CacheSize = 0;
-  static std::ptrdiff_t m_l2CacheSize = 0;
-  if(m_l2CacheSize==0)
+  static bool m_cache_sizes_initialized = false;
+  static std::ptrdiff_t m_l1CacheSize = 32*1024;
+  static std::ptrdiff_t m_l2CacheSize = 256*1024;
+  static std::ptrdiff_t m_l3CacheSize = 2*1024*1024;
+
+  if(!m_cache_sizes_initialized)
   {
-    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
-    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024);
+    m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024);
+    m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024);
+    m_cache_sizes_initialized = true;
   }
-
+
   if(action==SetAction)
   {
    // set the cpu cache size and cache all block sizes from a global cache size in bytes
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
+    m_l3CacheSize = *l3;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
+    *l3 = m_l3CacheSize;
  }
  else
  {
@@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
  *  - the number of scalars that fit into a packet (when vectorization is enabled).
  *
  * \sa setCpuCacheSizes
  */
+#define CEIL(a, b) (((a)+(b)-1)/(b))
+
 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
-  EIGEN_UNUSED_VARIABLE(n);
   // Explanations:
   // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
   // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
@@ -81,47 +91,75 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
   // at the register level. For vectorization purposes, these small vertical panels are unpacked,
   // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
   // stay in L1 cache.
-  std::ptrdiff_t l1, l2;
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kdiv = KcFactor * 2 * Traits::nr
-         * Traits::RhsProgress * sizeof(RhsScalar),
-    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
-    mr_mask = (0xffffffff/mr)*mr
-  };
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+  if (num_threads > 1) {
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      k_mask = (0xffffffff/8)*8,
+
+      mr = Traits::mr,
+      mr_mask = (0xffffffff/mr)*mr,
+
+      nr = Traits::nr,
+      nr_mask = (0xffffffff/nr)*nr
+    };
+    SizeType k_cache = (l1-ksub)/kdiv;
+    if (k_cache < k) {
+      k = k_cache & k_mask;
+      eigen_assert(k > 0);
+    }
-  manage_caching_sizes(GetAction, &l1, &l2);
+    SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    SizeType n_per_thread = CEIL(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_assert(n_cache >= static_cast<SizeType>(nr));
+      n = n_cache & nr_mask;
+      eigen_assert(n > 0);
+    } else {
+      n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
+    }
-// k = std::min<SizeType>(k, l1/kdiv);
-// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
-// if(_m<m) m = _m & mr_mask;
-
-  // In unit tests we do not want to use extra large matrices,
-  // so we reduce the block size to check the blocking strategy is not flawed
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      SizeType m_per_thread = CEIL(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
+        m = m_cache & mr_mask;
+        eigen_assert(m > 0);
+      } else {
+        m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the block size to check the blocking strategy is not flawed
 #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-//   k = std::min<SizeType>(k,240);
-//   n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-//   m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
-
-  k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-  n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-  m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
+    k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
+    n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
+    m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
 #else
-  k = std::min<SizeType>(k,24);
-  n = std::min<SizeType>(n,384/sizeof(RhsScalar));
-  m = std::min<SizeType>(m,384/sizeof(RhsScalar));
+    k = std::min<SizeType>(k,24);
+    n = std::min<SizeType>(n,384/sizeof(RhsScalar));
+    m = std::min<SizeType>(m,384/sizeof(RhsScalar));
 #endif
+  }
 }
 
 template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
 }
 
-#ifdef EIGEN_HAS_FUSE_CJMADD
-  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
 #else
 
   // FIXME (a bit overkill maybe ?)
@@ -146,8 +184,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
     gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
   }
 
-  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
-//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
+  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
+//   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
 #endif
 
 /* Vectorization logic
@@ -182,7 +220,7 @@ public:
     nr = 4,
 
     // register block size along the M direction (currently, this one cannot be modified)
-#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
     // we assume 16 registers
     mr = 3*LhsPacketSize,
 #else
@@ -248,7 +286,7 @@ public:
     // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA) gcc fails to figure out how to avoid
    // spilling registers.
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
     c = pmadd(a,b,c);
 #else
@@ -290,7 +328,7 @@ public:
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     nr = 4,
-#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
     // we assume 16 registers
     mr = 3*LhsPacketSize,
 #else
@@ -353,7 +391,7 @@ public:
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
     c.v = pmadd(a.v,b,c.v);
 #else
@@ -637,7 +675,7 @@ public:
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
-#ifdef EIGEN_HAS_FUSED_MADD
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
     c.v = pmadd(a,b.v,c.v);
 #else
@@ -667,7 +705,7 @@ protected:
  * |real |cplx | no vectorization yet, would require to pack A with duplication
  * |cplx |real | easy vectorization
  */
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
@@ -676,14 +714,15 @@ struct gebp_kernel
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
-
+
   typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
   typedef typename SwappedTraits::ResScalar SResScalar;
   typedef typename SwappedTraits::LhsPacket SLhsPacket;
   typedef typename SwappedTraits::RhsPacket SRhsPacket;
   typedef typename SwappedTraits::ResPacket SResPacket;
   typedef typename SwappedTraits::AccPacket SAccPacket;
-
+
+  typedef typename DataMapper::LinearMapper LinearMapper;
 
   enum {
     Vectorizable = Traits::Vectorizable,
@@ -693,14 +732,16 @@ struct gebp_kernel
   };
 
   EIGEN_DONT_INLINE
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+                  Index rows, Index depth, Index cols, ResScalar alpha,
                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_DONT_INLINE
-void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
-  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
+  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+               Index rows, Index depth, Index cols, ResScalar alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
   Traits traits;
@@ -743,15 +784,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6);  traits.initAcc(C7);
          traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
-          ResScalar* r1 = &res[(j2+1)*resStride + i];
-          ResScalar* r2 = &res[(j2+2)*resStride + i];
-          ResScalar* r3 = &res[(j2+3)*resStride + i];
-
-          internal::prefetch(r0);
-          internal::prefetch(r1);
-          internal::prefetch(r2);
-          internal::prefetch(r3);
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(0);
+          r1.prefetch(0);
+          r2.prefetch(0);
+          r3.prefetch(0);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -760,31 +801,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 4");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
            RhsPacket B_0, T0;
            LhsPacket A2;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            internal::prefetch(blA+(3*K+16)*LhsProgress); \
-            traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
-            traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
-            traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
-            traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C0, T0); \
-            traits.madd(A1, B_0, C4, T0); \
-            traits.madd(A2, B_0, C8, B_0); \
-            traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C1, T0); \
-            traits.madd(A1, B_0, C5, T0); \
-            traits.madd(A2, B_0, C9, B_0); \
-            traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C2, T0); \
-            traits.madd(A1, B_0, C6, T0); \
-            traits.madd(A2, B_0, C10, B_0); \
-            traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C3 , T0); \
-            traits.madd(A1, B_0, C7, T0); \
-            traits.madd(A2, B_0, C11, B_0)
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              internal::prefetch(blA+(3*K+16)*LhsProgress); \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+              traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C0, T0); \
+              traits.madd(A1, B_0, C4, T0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C1, T0); \
+              traits.madd(A1, B_0, C5, T0); \
+              traits.madd(A2, B_0, C9, B_0); \
+              traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C2, T0); \
+              traits.madd(A1, B_0, C6, T0); \
+              traits.madd(A2, B_0, C10, B_0); \
+              traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C3 , T0); \
+              traits.madd(A1, B_0, C7, T0); \
+              traits.madd(A2, B_0, C11, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+            } while(false)
 
            internal::prefetch(blB+(48+0));
            EIGEN_GEBGP_ONESTEP(0);
@@ -799,6 +845,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
@@ -813,48 +861,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
+
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r0+1*Traits::ResPacketSize, R1);
-          pstoreu(r0+2*Traits::ResPacketSize, R2);
-
-          R0 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r1+2*Traits::ResPacketSize);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
-          pstoreu(r1+0*Traits::ResPacketSize, R0);
-          pstoreu(r1+1*Traits::ResPacketSize, R1);
-          pstoreu(r1+2*Traits::ResPacketSize, R2);
-
-          R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r2+2*Traits::ResPacketSize);
+          r1.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C10, alphav, R2);
-          pstoreu(r2+0*Traits::ResPacketSize, R0);
-          pstoreu(r2+1*Traits::ResPacketSize, R1);
-          pstoreu(r2+2*Traits::ResPacketSize, R2);
-
-          R0 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r3+2*Traits::ResPacketSize);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C7, alphav, R1);
          traits.acc(C11, alphav, R2);
-          pstoreu(r3+0*Traits::ResPacketSize, R0);
-          pstoreu(r3+1*Traits::ResPacketSize, R1);
-          pstoreu(r3+2*Traits::ResPacketSize, R2);
+          r3.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
-
+
        // Deal with remaining columns of the rhs
        for(Index j2=packet_cols4; j2<cols; j2++)
        {
@@ -868,7 +916,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C4);
          traits.initAcc(C8);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(0);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
 
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
            RhsPacket B_0;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
-            traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
-            traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
-            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C0, B_0); \
-            traits.madd(A1, B_0, C4, B_0); \
-            traits.madd(A2, B_0, C8, B_0)
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C0, B_0); \
+              traits.madd(A1, B_0, C4, B_0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+            } while(false)
 
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
@@ -898,6 +952,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*RhsProgress;
            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
          }
 
          // process remaining peeled loop
@@ -912,19 +968,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
-          traits.acc(C8 , alphav, R2);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r0+1*Traits::ResPacketSize, R1);
-          pstoreu(r0+2*Traits::ResPacketSize, R2);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
    }
-
+
    //---------- Process 2 * LhsProgress rows at once ----------
    if(mr>=2*Traits::LhsProgress)
    {
@@ -946,15 +1002,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
-          ResScalar* r1 = &res[(j2+1)*resStride + i];
-          ResScalar* r2 = &res[(j2+2)*resStride + i];
-          ResScalar* r3 = &res[(j2+3)*resStride + i];
-
-          internal::prefetch(r0+prefetch_res_offset);
-          internal::prefetch(r1+prefetch_res_offset);
-          internal::prefetch(r2+prefetch_res_offset);
-          internal::prefetch(r3+prefetch_res_offset);
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -963,21 +1019,26 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 2pX4");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
            RhsPacket B_0, B1, B2, B3, T0;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
-            traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
-            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
-            traits.madd(A0, B_0, C0, T0); \
-            traits.madd(A1, B_0, C4, B_0); \
-            traits.madd(A0, B1, C1, T0); \
-            traits.madd(A1, B1, C5, B1); \
-            traits.madd(A0, B2, C2, T0); \
-            traits.madd(A1, B2, C6, B2); \
-            traits.madd(A0, B3, C3, T0); \
-            traits.madd(A1, B3, C7, B3)
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+              traits.madd(A0, B_0, C0, T0); \
+              traits.madd(A1, B_0, C4, B_0); \
+              traits.madd(A0, B1, C1, T0); \
+              traits.madd(A1, B1, C5, B1); \
+              traits.madd(A0, B2, C2, T0); \
+              traits.madd(A1, B2, C6, B2); \
+              traits.madd(A0, B3, C3, T0); \
+              traits.madd(A1, B3, C7, B3); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+            } while(false)
 
            internal::prefetch(blB+(48+0));
            EIGEN_GEBGP_ONESTEP(0);
@@ -992,6 +1053,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
@@ -1002,37 +1065,37 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blA += 2*Traits::LhsProgress;
          }
 #undef EIGEN_GEBGP_ONESTEP
-
+
          ResPacket R0, R1, R2, R3;
          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
-          R3 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
+
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r0+1*Traits::ResPacketSize, R1);
-          pstoreu(r1+0*Traits::ResPacketSize, R2);
-          pstoreu(r1+1*Traits::ResPacketSize, R3);
-
-          R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
-          R2 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
-          R3 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(0 * Traits::ResPacketSize, R2);
+          r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
-          pstoreu(r2+0*Traits::ResPacketSize, R0);
-          pstoreu(r2+1*Traits::ResPacketSize, R1);
-          pstoreu(r3+0*Traits::ResPacketSize, R2);
-          pstoreu(r3+1*Traits::ResPacketSize, R3);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(0 * Traits::ResPacketSize, R2);
+          r3.storePacket(1 * Traits::ResPacketSize, R3);
        }
-
+
        // Deal with remaining columns of the rhs
        for(Index j2=packet_cols4; j2<cols; j2++)
        {
@@ -1045,8 +1108,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C0);
          traits.initAcc(C4);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
-          internal::prefetch(r0+prefetch_res_offset);
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(prefetch_res_offset);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@@ -1054,15 +1117,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
            RhsPacket B_0, B1;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
-            traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
-            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C0, B1); \
-            traits.madd(A1, B_0, C4, B_0)
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C0, B1); \
+              traits.madd(A1, B_0, C4, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+            } while(false)
 
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
@@ -1075,6 +1143,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*RhsProgress;
            blA += pk*2*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
          }
 
          // process remaining peeled loop
@@ -1089,12 +1159,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r0+1*Traits::ResPacketSize, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
        }
      }
    }
@@ -1120,15 +1190,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          traits.initAcc(C2);
          traits.initAcc(C3);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
-          ResScalar* r1 = &res[(j2+1)*resStride + i];
-          ResScalar* r2 = &res[(j2+2)*resStride + i];
-          ResScalar* r3 = &res[(j2+3)*resStride + i];
-
-          internal::prefetch(r0+prefetch_res_offset);
-          internal::prefetch(r1+prefetch_res_offset);
-          internal::prefetch(r2+prefetch_res_offset);
-          internal::prefetch(r3+prefetch_res_offset);
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@@ -1137,16 +1207,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 1pX4");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
            RhsPacket B_0, B1, B2, B3;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
-            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
-            traits.madd(A0, B_0, C0, B_0); \
-            traits.madd(A0, B1, C1, B1); \
-            traits.madd(A0, B2, C2, B2); \
-            traits.madd(A0, B3, C3, B3);
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
+              traits.madd(A0, B_0, C0, B_0); \
+              traits.madd(A0, B1, C1, B1); \
+              traits.madd(A0, B2, C2, B2); \
+              traits.madd(A0, B3, C3, B3); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
+            } while(false)
 
            internal::prefetch(blB+(48+0));
            EIGEN_GEBGP_ONESTEP(0);
@@ -1161,6 +1236,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*4*RhsProgress;
            blA += pk*1*LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
@@ -1171,25 +1248,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blA += 1*LhsProgress;
          }
 #undef EIGEN_GEBGP_ONESTEP
-
+
          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);
-
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
+
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(0 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C1, alphav, R1);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
-          pstoreu(r1+0*Traits::ResPacketSize, R1);
-
-          R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
-          R1 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(0 * Traits::ResPacketSize, R1);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(0 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C3, alphav, R1);
-          pstoreu(r2+0*Traits::ResPacketSize, R0);
-          pstoreu(r3+0*Traits::ResPacketSize, R1);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(0 * Traits::ResPacketSize, R1);
        }
-
+
        // Deal with remaining columns of the rhs
       for(Index j2=packet_cols4; j2<cols; j2++)
       {
@@ -1201,7 +1278,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          AccPacket C0;
          traits.initAcc(C0);
 
-          ResScalar* r0 = &res[(j2+0)*resStride + i];
+          LinearMapper r0 = res.getLinearMapper(i, j2);
 
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@@ -1209,14 +1286,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          for(Index k=0; k<peeled_kc; k+=pk)
          {
-            EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1");
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
            RhsPacket B_0;
 #define EIGEN_GEBGP_ONESTEP(K) \
-            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
-            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
-            traits.madd(A0, B_0, C0, B_0); \
-
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+              traits.madd(A0, B_0, C0, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
+            } while(false);
+
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
@@ -1228,6 +1310,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            blB += pk*RhsProgress;
            blA += pk*1*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
          }
 
          // process remaining peeled loop
@@ -1241,9 +1325,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
 #undef EIGEN_GEBGP_ONESTEP
          ResPacket R0;
          ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
-          pstoreu(r0+0*Traits::ResPacketSize, R0);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
        }
      }
    }
@@ -1259,7 +1343,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-
+
        if( (SwappedTraits::LhsProgress % 4)==0 )
        {
          // NOTE The following piece of code won't work for 512 bit registers
@@ -1268,32 +1352,32 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          straits.initAcc(C1);
          straits.initAcc(C2);
          straits.initAcc(C3);
-
+
          const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
          const Index endk  = (depth/spk)*spk;
          const Index endk4 = (depth/(spk*4))*(spk*4);
-
+
          Index k=0;
          for(; k<endk4; k+=4*spk)
          {
            SLhsPacket A0,A1;
            SRhsPacket B_0,B_1;
-
+
            straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
-
+
            straits.loadRhsQuad(blA+0*spk, B_0);
            straits.loadRhsQuad(blA+1*spk, B_1);
            straits.madd(A0,B_0,C0,B_0);
            straits.madd(A1,B_1,C1,B_1);
-
+
            straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
            straits.loadRhsQuad(blA+2*spk, B_0);
            straits.loadRhsQuad(blA+3*spk, B_1);
            straits.madd(A0,B_0,C2,B_0);
            straits.madd(A1,B_1,C3,B_1);
-
+
            blB += 4*SwappedTraits::LhsProgress;
            blA += 4*spk;
          }
@@ -1302,11 +1386,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          {
            SLhsPacket A0;
            SRhsPacket B_0;
-
+
            straits.loadLhsUnaligned(blB, A0);
            straits.loadRhsQuad(blA, B_0);
            straits.madd(A0,B_0,C0,B_0);
-
+
            blB += SwappedTraits::LhsProgress;
            blA += spk;
          }
@@ -1317,10 +1401,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
            typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
            typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
-
-            SResPacketHalf R = pgather<SResScalar, SResPacketHalf>(&res[j2*resStride + i], resStride);
+
+            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
-
+
            if(depth-endk>0)
            {
              // We have to handle the last row of the rhs which corresponds to a half-packet
@@ -1336,14 +1420,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
            {
              straits.acc(predux4(C0), alphav, R);
            }
-            pscatter(&res[j2*resStride + i], R, resStride);
+            res.scatterPacket(i, j2, R);
          }
          else
          {
-            SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
+            SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
            SResPacket alphav = pset1<SResPacket>(alpha);
            straits.acc(C0, alphav, R);
-            pscatter(&res[j2*resStride + i], R, resStride);
+            res.scatterPacket(i, j2, R);
          }
        }
        else // scalar path
        {
@@ -1355,25 +1439,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
          {
            LhsScalar A0;
            RhsScalar B_0, B_1;
-
+
            A0 = blA[k];
-
+
            B_0 = blB[0];
            B_1 = blB[1];
-            MADD(cj,A0,B_0,C0, B_0);
-            MADD(cj,A0,B_1,C1, B_1);
+            CJMADD(cj,A0,B_0,C0, B_0);
+            CJMADD(cj,A0,B_1,C1, B_1);
 
            B_0 = blB[2];
            B_1 = blB[3];
-            MADD(cj,A0,B_0,C2, B_0);
-            MADD(cj,A0,B_1,C3, B_1);
+            CJMADD(cj,A0,B_0,C2, B_0);
+            CJMADD(cj,A0,B_1,C3, B_1);
 
            blB += 4;
          }
-          res[(j2+0)*resStride + i] += alpha*C0;
-          res[(j2+1)*resStride + i] += alpha*C1;
-          res[(j2+2)*resStride + i] += alpha*C2;
-          res[(j2+3)*resStride + i] += alpha*C3;
+          res(i, j2 + 0) += alpha * C0;
+          res(i, j2 + 1) += alpha * C1;
+          res(i, j2 + 2) += alpha * C2;
+          res(i, j2 + 3) += alpha * C3;
        }
      }
    }
@@ -1392,9 +1476,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
      {
        LhsScalar A0 = blA[k];
        RhsScalar B_0 = blB[k];
-        MADD(cj, A0, B_0, C0, B_0);
+        CJMADD(cj, A0, B_0, C0, B_0);
      }
-      res[(j2+0)*resStride + i] += alpha*C0;
+      res(i, j2) += alpha * C0;
    }
  }
 }
@@ -1417,15 +1501,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
 //
 //  32 33 34 35 ...
 //  36 37 38 39 ...
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
 {
-  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
@@ -1436,30 +1521,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  const_blas_data_mapper<Scalar, Index, ColMajor> lhs(_lhs,lhsStride);
  Index count = 0;
-
+
  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
-
+
  Index i=0;
-
+
  // Pack 3 packets
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;
-
+
      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C;
-        A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
-        B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
-        C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
+        C = lhs.loadPacket(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1473,12 +1557,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;
-
+
      for(Index k=0; k<depth; k++)
      {
        Packet A, B;
-        A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
-        B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      }
@@ -1491,11 +1575,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;
-
+
      for(Index k=0; k<depth; k++)
      {
        Packet A;
-        A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
+        A = lhs.loadPacket(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
        count+=PacketSize;
      }
@@ -1508,11 +1592,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
    for(; i<peeled_mc0; i+=Pack2)
    {
      if(PanelMode) count += Pack2 * offset;
-
+
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(i+w, k));
-
+
      if(PanelMode) count += Pack2 * (stride-offset-depth);
    }
  }
@@ -1525,15 +1609,16 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
  }
 }
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
 {
-  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
@@ -1543,13 +1628,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  const_blas_data_mapper<Scalar, Index, RowMajor> lhs(_lhs,lhsStride);
  Index count = 0;
-
+
 //   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
 //   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
 //   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-
+
  int pack = Pack1;
  Index i = 0;
  while(pack>0)
@@ -1569,7 +1653,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
        for (Index m = 0; m < pack; m += PacketSize)
        {
          PacketBlock<Packet> kernel;
-          for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
+          for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
          ptranspose(kernel);
          for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
        }
@@ -1594,15 +1678,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
          for(;w<pack;++w) blockA[count++] = cj(lhs(i+w, k));
      }
-
+
      if(PanelMode) count += pack * (stride-offset-depth);
    }
-
+
    pack -= PacketSize;
    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
      pack = Pack2;
  }
-
+
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
@@ -1619,17 +1703,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
 //  4 5 6 7 16 17 18 19 25 28
 //  8 9 10 11 20 21 22 23 26 29
 //  .  .  .  .  .  .  .  .  .  .
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
 {
  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1685,27 +1770,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
 //     if(PanelMode) count += 8 * (stride-offset-depth);
 //   }
 // }
-
+
  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
-      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
-      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
-      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
-      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
-
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
      Index k=0;
      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
      {
        for(; k<peeled_k; k+=PacketSize)
        {
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
-          kernel.packet[0] = ploadu<Packet>(&b0[k]);
-          kernel.packet[1] = ploadu<Packet>(&b1[k]);
-          kernel.packet[2] = ploadu<Packet>(&b2[k]);
-          kernel.packet[3] = ploadu<Packet>(&b3[k]);
+          kernel.packet[0] = dm0.loadPacket(k);
+          kernel.packet[1] = dm1.loadPacket(k);
+          kernel.packet[2] = dm2.loadPacket(k);
+          kernel.packet[3] = dm3.loadPacket(k);
          ptranspose(kernel);
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
@@ -1716,10 +1801,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
      }
      for(; k<depth; k++)
      {
-        blockB[count+0] = cj(b0[k]);
-        blockB[count+1] = cj(b1[k]);
-        blockB[count+2] = cj(b2[k]);
-        blockB[count+3] = cj(b3[k]);
+        blockB[count+0] = cj(dm0(k));
+        blockB[count+1] = cj(dm1(k));
+        blockB[count+2] = cj(dm2(k));
+        blockB[count+3] = cj(dm3(k));
        count += 4;
      }
      // skip what we have after
@@ -1731,10 +1816,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
-      blockB[count] = cj(b0[k]);
+      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
@@ -1742,17 +1827,18 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
 }
 
 // this version is optimized for row major matrices
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
@@ -1762,7 +1848,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
-
+
 //   if(nr>=8)
 //   {
 //     for(Index j2=0; j2<packet_cols8; j2+=8)
@@ -1805,15 +1891,15 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
      for(Index k=0; k<depth; k++)
      {
        if (PacketSize==4) {
-          Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+          Packet A = rhs.loadPacket(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += PacketSize;
        } else {
-          const Scalar* b0 = &rhs[k*rhsStride + j2];
-          blockB[count+0] = cj(b0[0]);
-          blockB[count+1] = cj(b0[1]);
-          blockB[count+2] = cj(b0[2]);
-          blockB[count+3] = cj(b0[3]);
+          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+          blockB[count+0] = cj(dm0(0));
+          blockB[count+1] = cj(dm0(1));
+          blockB[count+2] = cj(dm0(2));
+          blockB[count+3] = cj(dm0(3));
          count += 4;
        }
      }
@@ -1825,10 +1911,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[j2];
    for(Index k=0; k<depth; k++)
    {
-      blockB[count] = cj(b0[k*rhsStride]);
+      blockB[count] = cj(rhs(k, j2));
      count += 1;
    }
    if(PanelMode) count += stride-offset-depth;
@@ -1841,8 +1926,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
  * \sa setCpuCacheSizes */
 inline std::ptrdiff_t l1CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
 }
 
@@ -1850,8 +1935,8 @@ inline std::ptrdiff_t l1CacheSize()
  * \sa setCpuCacheSizes */
 inline std::ptrdiff_t l2CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
 }
 
@@ -1860,9 +1945,9 @@ inline std::ptrdiff_t l2CacheSize()
  * for the algorithms working per block.
 *
 * \sa computeProductBlockingSizes */
-inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
 {
-  internal::manage_caching_sizes(SetAction, &l1, &l2);
+  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index b7e1867f0..fd9443cd2 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal
 
 static void run(Index rows, Index cols, Index depth,
  const LhsScalar* _lhs, Index lhsStride,
  const RhsScalar* _rhs, Index rhsStride,
-  ResScalar* res, Index resStride,
+  ResScalar* _res, Index resStride,
  ResScalar alpha,
  level3_blocking<LhsScalar,RhsScalar>& blocking,
  GemmParallelInfo<Index>* info = 0)
 {
-  const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-  const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+  typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+  typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+  LhsMapper lhs(_lhs,lhsStride);
+  RhsMapper rhs(_rhs,rhsStride);
+  ResMapper res(_res, resStride);
 
  Index kc = blocking.kc();                   // cache block size along the K direction
  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
  Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
 
-  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+  gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
 
 #ifdef EIGEN_HAS_OPENMP
  if(info)
@@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth,
 
      // In order to reduce the chance that a thread has to wait for the other,
      // let's start by packing B'.
-      pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
+      pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
 
      // Pack A_k to A' in a parallel fashion:
      // each thread packs the sub block A_k,i to A'_i where i is the thread id.
@@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth,
      // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
      while(info[tid].users!=0) {}
      info[tid].users += threads;
-
-      pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
+
+      pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
 
      // Notify the other threads that the part A'_i is ready to go.
      info[tid].sync = k;
@@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth,
        // At this point we have to make sure that A'_i has been updated by the thread i,
        // we use testAndSetOrdered to mimic a volatile access.
        // However, no need to wait for the B' part which has been updated by the current thread!
-        if(shift>0)
-          while(info[i].sync!=k) {}
-        gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
+        if (shift>0) {
+          while(info[i].sync!=k) {
+          }
+        }
+
+        gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
      }
 
      // Then keep going as usual with the remaining B'
@@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth,
        const Index actual_nc = (std::min)(j+nc,cols)-j;
 
        // pack B_k,j to B'
-        pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
+        pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
 
        // C_j += A' * B'
-        gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
+        gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
      }
 
      // Release all the sub blocks A'_i of A' for the current thread,
@@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth,
    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
 
    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    for(Index k2=0; k2<depth; k2+=kc)
+    for(Index i2=0; i2<rows; i2+=mc)
    {
-      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+      const Index actual_mc = (std::min)(i2+mc,rows)-i2;
 
-      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
-      // Note that this panel will be read as many times as the number of blocks in the rhs's
-      // horizontal panel which is, in practice, a very low number.
-      pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
-
-      // For each kc x nc block of the rhs's horizontal panel...
-      for(Index j2=0; j2<cols; j2+=nc)
+      for(Index k2=0; k2<depth; k2+=kc)
      {
-        const Index actual_nc = (std::min)(j2+nc,cols)-j2;
-
-        // We pack the rhs's block into a sequential chunk of memory (L2 caching)
-        // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
-        pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
-
-        // Everything is packed, we can now call the panel * block kernel:
-        gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
+        const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+
+        // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+        // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+        // Note that this panel will be read as many times as the number of blocks in the rhs's
+        // horizontal panel which is, in practice, a very low number.
+        pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+
+        // For each kc x nc block of the rhs's horizontal panel...
+        for(Index j2=0; j2<cols; j2+=nc)
+        {
+          const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+
+          // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+ pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc); + + // Everything is packed, we can now call the panel * block kernel: + gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha); + } } } } @@ -287,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M public: - gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false) + gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/) { this->m_mc = ActualRows; this->m_nc = ActualCols; @@ -319,23 +331,23 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M public: - gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false) + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking) { this->m_mc = Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) { - DenseIndex m = this->m_mc; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc); + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads); } - else // full columns + else // no l3 blocking { + DenseIndex m = this->m_mc; DenseIndex n = this->m_nc; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n); + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads); } - + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } @@ -445,8 +457,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor>, ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); - + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 7db3e3d38..e55994900 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -58,27 +58,31 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, { typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) { - const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<LhsScalar,RhsScalar> Traits; + typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction - computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc); + computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1); // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); - - gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; - gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb; for(Index k2=0; k2<depth; k2+=kc) @@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, { const Index actual_kc = (std::min)(k2+kc,depth)-k2; // note that the actual rhs is the transpose/adjoint of mat - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size); + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size); for(Index i2=0; i2<size; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,size)-i2; - pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); // the selected actual_mc * size panel of res is split into three different parts: // 1 - before the diagonal => processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); + - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); } } } @@ -129,13 +134,16 @@ struct tribb_kernel { typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits; typedef typename Traits::ResScalar ResScalar; - + enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper; + ResMapper res(_res, resStride); + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; // let's process the block per panel of actual_mc x BlockSize, @@ -146,7 +154,7 @@ struct tribb_kernel const RhsScalar* actual_b =
blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, -1, -1, 0, 0); // selfadjoint micro block @@ -154,12 +162,12 @@ struct tribb_kernel Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1<actualBlockSize; ++j1) { - ResScalar* r = res + (j+j1)*resStride + i; + ResScalar* r = &res(i, j + j1); for(Index i1=UpLo==Lower ? j1 : 0; UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1) r[i1] += buffer(i1,j1); @@ -169,8 +177,8 @@ struct tribb_kernel if(UpLo==Lower) { Index i = j+actualBlockSize; - gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, - -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, + depth, actualBlockSize, alpha, -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 340c51394..7df6a6b1a 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,17 +48,17 @@ namespace internal { * // we currently fall back to the NoneAligned case * * The same reasoning applies for the transposed case. - * + * * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow * compared to unaligned loads on a 4 byte boundary.
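* To make the cases concrete (illustrative numbers, not taken from the source), using the alignment step computed below, (PacketSize - stride % PacketSize) & (PacketSize-1): with 4-float packets, a column stride of 8 floats gives a step of 0, hence AllAligned; a stride of 6 gives a step of 2 == PacketSize/2, hence EvenAligned; and a stride of 5 gives a step of 3, hence FirstAligned, where only a shifted starting row can be packet-aligned.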
* */ -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version> +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> { -typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable @@ -78,17 +78,17 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha); }; -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run( +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { @@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ pstore(&res[j], \ padd(pload<ResPacket>(&res[j]), \ padd( \ - padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ - padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \ + pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \ + pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) ))) + + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; @@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co const Index ResPacketAlignedMask = ResPacketSize-1; // const Index PeelAlignedMask = ResPacketSize*peels-1; const Index size = rows; - + + const Index lhsStride = lhs.stride(); + // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. 
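These signatures are the heart of the refactor: every kernel that used to receive a raw pointer plus a stride (const LhsScalar* lhs, Index lhsStride) now receives a mapper object exposing stride(), firstAligned(), getSubMapper(), getVectorMapper() and alignment-tagged packet loads. Below is a minimal standalone sketch of that interface for a column-major layout -- a deliberate simplification for illustration, not Eigen's actual blas_data_mapper (the memcpy stands in for the pload/ploadu dispatch the real class performs on the Align tag):

#include <cstring>

enum AlignmentTag { Unaligned, Aligned }; // mirrors the diff's load<Packet, Alignment> tags

template <typename Scalar, typename Index>
class VectorMapper {
  const Scalar* m_data;
public:
  explicit VectorMapper(const Scalar* data) : m_data(data) {}
  Scalar operator()(Index i) const { return m_data[i]; } // lhs0(j) in the kernels
  template <typename Packet, int Align>
  Packet load(Index i) const {
    // stand-in for the aligned/unaligned packet-load dispatch on Align
    Packet p;
    std::memcpy(&p, m_data + i, sizeof(Packet));
    return p;
  }
};

template <typename Scalar, typename Index>
class ColMajorMapper {
  const Scalar* m_data;
  Index m_stride;
public:
  ColMajorMapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
  Index stride() const { return m_stride; }
  Scalar operator()(Index i, Index j) const { return m_data[i + j * m_stride]; }
  // sub-block at (i,j): replaces the old "&m(i,j) plus separate stride" convention
  ColMajorMapper getSubMapper(Index i, Index j) const {
    return ColMajorMapper(m_data + i + j * m_stride, m_stride);
  }
  // one column viewed as a vector, as used for lhs0..lhs3 in the GEMV kernels
  VectorMapper<Scalar, Index> getVectorMapper(Index i, Index j) const {
    return VectorMapper<Scalar, Index>(m_data + i + j * m_stride);
  }
};

The payoff, visible in every hunk below, is that a sub-block travels as m.getSubMapper(i,j) instead of a raw pointer plus stride, and a packet load such as lhs0.template load<LhsPacket, Aligned>(j) carries its alignment assumption in a template argument rather than in a pasted function name.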
Index alignedStart = internal::first_aligned(res,size); @@ -131,15 +135,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,size); + const Index lhsAlignmentOffset = lhs.firstAligned(size); // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) + if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; + alignmentPattern = NoneAligned; } else if(LhsPacketSize > 4) { @@ -149,7 +154,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co } else if (LhsPacketSize>1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); while (skipColumns<LhsPacketSize && alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) @@ -166,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co // note that the skiped columns are processed later. } - eigen_internal_assert( (alignmentPattern==NoneAligned) + /* eigen_internal_assert( (alignmentPattern==NoneAligned) || (skipColumns + columnsAtOnce >= cols) || LhsPacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -178,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co alignmentPattern = AllAligned; } - Index offset1 = (FirstAligned && alignmentStep==1?3:1); - Index offset3 = (FirstAligned && alignmentStep==1?1:3); + const Index offset1 = (FirstAligned && alignmentStep==1?3:1); + const Index offset3 = (FirstAligned && alignmentStep==1?1:3); Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) { - RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]), - ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]), - ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]), - ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]); + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)), + ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)), + ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0)); // this helps a lot generating better binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); if (Vectorizable) { @@ -199,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co // process initial unaligned coeffs for (Index j=0; j<alignedStart; ++j) { - res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); - res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); - res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); - 
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } if (alignedSize>alignedStart) @@ -211,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co { case AllAligned: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,d,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned); break; case EvenAligned: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned); break; case FirstAligned: { @@ -225,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; ResPacket T0, T1; - A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); - A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); - A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1); + A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2); + A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3); for (; j<peeledSize; j+=peels*ResPacketSize) { - A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13); - A00 = pload<LhsPacket>(&lhs0[j]); - A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + A00 = lhs0.template load<LhsPacket, Aligned>(j); + A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize); T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j])); T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize])); T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01); T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02); T0 = pcj.pmadd(A03, ptmp3, T0); pstore(&res[j],T0); - A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03); T1 = pcj.pmadd(A11, ptmp1, T1); T1 = pcj.pmadd(A12, ptmp2, T1); T1 = pcj.pmadd(A13, ptmp3, T1); @@ -254,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co } } for (; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,du); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned); break; } default: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(du,du,du); + _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned); break; } } @@ -268,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co /* process remaining coeffs (or all if there is no explicit vectorization) */ for (Index j=alignedSize; j<size; ++j) { - res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); - res[j] 
= cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); - res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); - res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } } @@ -282,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co { for (Index k=start; k<end; ++k) { - RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]); - const LhsScalar* lhs0 = lhs + k*lhsStride; + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs for (Index j=0; j<alignedStart; ++j) - res[j] += cj.pmul(lhs0[j], pfirst(ptmp0)); + res[j] += cj.pmul(lhs0(j), pfirst(ptmp0)); // process aligned result's coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned<LhsPacket>(alignedStart)) for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) - pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i]))); else for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) - pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) for (Index i=alignedSize; i<size; ++i) - res[i] += cj.pmul(lhs0[i], pfirst(ptmp0)); + res[i] += cj.pmul(lhs0(i), pfirst(ptmp0)); } if (skipColumns) { @@ -326,8 +331,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co * - alpha is always a complex (or converted to a complex) * - no vectorization */ -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version> +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> { typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; @@ -346,70 +351,75 @@ typedef typename packet_traits<ResScalar>::type _ResPacket; typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; - + EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha); }; -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run( +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool 
ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - EIGEN_UNUSED_VARIABLE(rhsIncr); - eigen_internal_assert(rhsIncr==1); - + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - RhsPacket b = pload<RhsPacket>(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \ + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); } conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; const Index RhsPacketAlignedMask = RhsPacketSize-1; const Index LhsPacketAlignedMask = LhsPacketSize-1; -// const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; + const Index lhsStride = lhs.stride(); // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - Index alignedStart = internal::first_aligned(rhs, depth); + Index alignedStart = rhs.firstAligned(depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth); + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || + (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || + (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) { alignedSize = 0; alignedStart = 0; + alignmentPattern = NoneAligned; } else if(LhsPacketSize > 4) { @@ -418,7 +428,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co } else if (LhsPacketSize>1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); while (skipRows<LhsPacketSize && alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) @@ -434,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co skipRows = (std::min)(skipRows,Index(rows)); // note that the skiped columns are processed later. } - eigen_internal_assert( alignmentPattern==NoneAligned + /* eigen_internal_assert( alignmentPattern==NoneAligned || LhsPacketSize==1 || (skipRows + rowsAtOnce >= rows) || LhsPacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -447,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co alignmentPattern = AllAligned; } - Index offset1 = (FirstAligned && alignmentStep==1?3:1); - Index offset3 = (FirstAligned && alignmentStep==1?1:3); + const Index offset1 = (FirstAligned && alignmentStep==1?3:1); + const Index offset3 = (FirstAligned && alignmentStep==1?1:3); Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) @@ -457,8 +467,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generating good binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0), + lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0); if (Vectorizable) { @@ -470,9 +480,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co // FIXME this loop get vectorized by the compiler ! 
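Both the column-major and the row-major kernels classify the lhs into one of four alignment patterns from the relation between the stride and the packet size. The helper below restates the alignmentStep/alignmentPattern arithmetic of these hunks in standalone form; the enum values match the kernels' names, while the function name and signature are illustrative:

#include <cstddef>

enum Pattern { AllAligned, EvenAligned, FirstAligned, NoneAligned };

// Column j of a column-major lhs starts at offset j*stride. If the stride is a
// multiple of the packet size, every column shares the same alignment; if it is
// off by half a packet, alignment alternates between columns; otherwise only a
// shifted starting row can be packet-aligned (the FirstAligned case).
inline Pattern classify_alignment(std::ptrdiff_t stride, std::ptrdiff_t packetSize) {
  std::ptrdiff_t step = packetSize > 1
      ? (packetSize - stride % packetSize) & (packetSize - 1)
      : 0;
  if (step == 0)              return AllAligned;
  if (step == packetSize / 2) return EvenAligned;
  return FirstAligned; // may later be demoted to NoneAligned, see below
}

Note the behavioral fix these hunks also carry: when firstAligned() reports an unusable offset, the pattern is now explicitly demoted with alignmentPattern = NoneAligned, where the old code only zeroed alignedStart and alignedSize.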
for (Index j=0; j<alignedStart; ++j) { - RhsScalar b = rhs[j]; - tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); - tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + RhsScalar b = rhs(j, 0); + tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); + tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b); } if (alignedSize>alignedStart) @@ -481,11 +491,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { case AllAligned: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,d,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned); break; case EvenAligned: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned); break; case FirstAligned: { @@ -499,39 +509,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co * than basic unaligned loads. */ LhsPacket A01, A02, A03, A11, A12, A13; - A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); - A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); - A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1); + A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2); + A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3); for (; j<peeledSize; j+=peels*RhsPacketSize) { - RhsPacket b = pload<RhsPacket>(&rhs[j]); - A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); + A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13); - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03); - b = pload<RhsPacket>(&rhs[j+RhsPacketSize]); - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,du); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned); break; } default: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(du,du,du); + _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned); break; } tmp0 += predux(ptmp0); @@ -545,9 +555,9 @@ EIGEN_DONT_INLINE void 
general_matrix_vector_product<Index,LhsScalar,RowMajor,Co // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) { - RhsScalar b = rhs[j]; - tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); - tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + RhsScalar b = rhs(j, 0); + tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); + tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b); } res[i*resIncr] += alpha*tmp0; res[(i+offset1)*resIncr] += alpha*tmp1; @@ -564,28 +574,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0); ResPacket ptmp0 = pset1<ResPacket>(tmp0); - const LhsScalar* lhs0 = lhs + i*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) - tmp0 += cj.pmul(lhs0[j], rhs[j]); + tmp0 += cj.pmul(lhs0(j), rhs(j, 0)); if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned<LhsPacket>(alignedStart)) for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0); else for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) - ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0); tmp0 += predux(ptmp0); } // process remaining scalars // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) - tmp0 += cj.pmul(lhs0[j], rhs[j]); + tmp0 += cj.pmul(lhs0(j), rhs(j, 0)); res[i*resIncr] += alpha*tmp0; } if (skipRows) diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 126cfbbff..2b90abf8f 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 4e507b6cf..4b6316d63 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -324,20 +324,26 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha) { Index size = rows; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<Scalar,Scalar> Traits; + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? 
ColMajor : RowMajor> LhsTransposeMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); + computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1); // kc must smaller than mc kc = (std::min)(kc,mc); @@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); Scalar* blockB = allocatedBlockB; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; for(Index k2=0; k2<size; k2+=kc) { @@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t { const Index actual_mc = (std::min)(i2+mc,k2)-i2; // transposed packed copy - pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); + pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } // the block diagonal { @@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // symmetric packed copy pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } for(Index i2=k2+kc; i2<size; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,size)-i2; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() - (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, k2), 
actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -414,26 +420,29 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha) { Index size = cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - typedef gebp_traits<Scalar,Scalar> Traits; - Index kc = size; // cache block size along the K direction + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride); + + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); + computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); Scalar* blockB = allocatedBlockB; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=0; k2<size; k2+=kc) @@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f for(Index i2=0; i2<rows; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,rows)-i2; - pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index c2d0817ea..60c99dcd2 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros @@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index depth = IsLower ? 
diagSize : _depth; Index cols = _cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, else triangularBuffer.diagonal().setOnes(); - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? k2>0 : k2<depth; @@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, k2 = k2+actual_kc-kc; } - pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); + pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the part which is zero => skip it @@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i) triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k); } - pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); + pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth); - gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, + gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB, + actualPanelWidth, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); // GEBP with remaining micro panel @@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, { Index startTarget = IsLower ? 
actual_k2+k1+actualPanelWidth : actual_k2; - pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, + gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB, + lengthTarget, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for(Index i2=start; i2<end; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,end)-i2; - gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() - (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros @@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index depth = IsLower ? _depth : diagSize; Index cols = IsLower ? diagSize : _cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, else triangularBuffer.diagonal().setOnes(); - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? 
k2<depth : k2>0; @@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Scalar* geb = blockB+ts*ts; geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar)); - pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); + pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs); // pack the triangular part of the rhs padding the unrolled blocks with zeros if(ts>0) @@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; // general part pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), rhsStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); @@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, } pack_rhs_panel(blockB+j2*actual_kc, - triangularBuffer.data(), triangularBuffer.outerStride(), + RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth, actual_kc, j2); } @@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, for (Index i2=0; i2<rows; i2+=mc) { const Index actual_mc = (std::min)(mc,rows-i2); - pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); // triangular kernel if(ts>0) @@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; Index blockOffset = IsLower ? j2 : 0; - gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, + gebp_kernel(res.getSubMapper(i2, actual_k2 + j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, alpha, @@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, blockOffset, blockOffset);// offsets } } - gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, + gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2), blockA, geb, actual_mc, actual_kc, rs, alpha, -1, -1, 0, 0); @@ -402,7 +413,7 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false> Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? 
rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix<Scalar, Index, Mode, LhsIsTriangular, diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 92d64e384..4d88a710b 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_TRIANGULARMATRIXVECTOR_H #define EIGEN_TRIANGULARMATRIXVECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); - + typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap; const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); @@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap; ResMap res(_res,rows); + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + for (Index pi=0; pi<size; pi+=PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, size-pi); @@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con if (r>0) { Index s = IsLower ? pi+actualPanelWidth : 0; - general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( r, actualPanelWidth, - &lhs.coeffRef(s,pi), lhsStride, - &rhs.coeffRef(pi), rhsIncr, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, alpha); } } if((!IsLower) && cols>size) { - general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( rows, cols-size, - &lhs.coeffRef(0,size), lhsStride, - &rhs.coeffRef(size), rhsIncr, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), _res, resIncr, alpha); } } @@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap; ResMap res(_res,rows,InnerStride<>(resIncr)); - + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + for (Index pi=0; pi<diagSize; pi+=PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi); @@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con if (r>0) { Index s = IsLower ? 
0 : pi + actualPanelWidth; - general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( actualPanelWidth, r, - &lhs.coeffRef(pi,s), lhsStride, - &rhs.coeffRef(s), rhsIncr, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, alpha); } } if(IsLower && rows>diagSize) { - general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( rows-diagSize, cols, - &lhs.coeffRef(diagSize,0), lhsStride, - &rhs.coeffRef(0), rhsIncr, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), &res.coeffRef(diagSize), resIncr, alpha); } } @@ -231,7 +237,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor> bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), @@ -251,7 +257,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor> else MappedDest(actualDestPtr, dest.size()) = dest; } - + internal::triangular_matrix_vector_product <Index,Mode, LhsScalar, LhsBlasTraits::NeedToConjugate, @@ -311,7 +317,7 @@ template<int Mode> struct trmv_selector<Mode,RowMajor> #endif Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; } - + internal::triangular_matrix_vector_product <Index,Mode, LhsScalar, LhsBlasTraits::NeedToConjugate, diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 1f7afd187..f5de67c59 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju level3_blocking<Scalar,Scalar>& blocking) { Index cols = otherSize; - const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride); - blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride); + + typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper; + typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper; + TriMapper tri(_tri, triStride); + OtherMapper other(_other, otherStride); typedef gebp_traits<Scalar,Scalar> Traits; + enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower @@ -71,14 +75,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); conj_if<Conjugate> conj; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs; + gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; + gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs; // the goal here is to 
subdivide the Rhs panels such that we keep some cache // coherence when accessing the rhs elements - std::ptrdiff_t l1, l2; - manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr); @@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju Index blockBOffset = IsLower ? k1 : lengthTarget; // update the respective rows of B from other - pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset); + pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset); // GEBP if (lengthTarget>0) { Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; - pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju const Index actual_mc = (std::min)(mc,end-i2); if (actual_mc>0) { - pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); + gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); } } } @@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj level3_blocking<Scalar,Scalar>& blocking) { Index rows = otherSize; - const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride); - blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride); + + typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper; + LhsMapper lhs(_other, otherStride); + RhsMapper rhs(_tri, triStride); typedef gebp_traits<Scalar,Scalar> Traits; enum { @@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); conj_if<Conjugate> conj; - gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel; + gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
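manage_caching_sizes now reports three cache levels, and the solver sizes its rhs panels from L2 via the subcols expression above. Restated as a standalone function -- only the formula is taken from the hunk; the parameter names are illustrative and nr would be the kernel's register-block width (e.g. 4):

#include <algorithm>
#include <cstddef>

typedef std::ptrdiff_t Index;

// cols/otherStride describe the panel of B being solved; l2 is the L2 size in
// bytes. Targets roughly a quarter of L2 for the active slice of B, since each
// column touches otherStride*scalarSize bytes.
Index rhs_panel_cols(Index cols, Index otherStride, std::ptrdiff_t l2,
                     Index scalarSize, Index nr) {
  Index subcols = cols > 0 ? l2 / (4 * scalarSize * otherStride) : 0;
  // round down to a multiple of nr, but never below one register block
  return (std::max)((subcols / nr) * nr, nr);
}

The same three-level cache query is what the Parallelizer.h hunk above threads through initParallel().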
for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2<size; @@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc; Scalar* geb = blockB+actual_kc*actual_kc; - if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs); // triangular packing (we only pack the panels off the diagonal, // neglecting the blocks overlapping the diagonal) @@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj if (panelLength>0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj // GEBP if(panelLength>0) { - gebp_kernel(&lhs(i2,absolute_j2), otherStride, + gebp_kernel(lhs.getSubMapper(i2,absolute_j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, Scalar(-1), @@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj } // pack the just computed part of lhs to A - pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride, + pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride), actualPanelWidth, actual_mc, actual_kc, j2); } } if (rs>0) - gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0); } diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h index ce4d10088..b994759b2 100644 --- a/Eigen/src/Core/products/TriangularSolverVector.h +++ b/Eigen/src/Core/products/TriangularSolverVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H #define EIGEN_TRIANGULAR_SOLVER_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co >::run(size, _lhs, lhsStride, rhs); } }; - + // forward and backward substitution, row-major, rhs is a vector template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate> struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor> @@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con { typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; + typename internal::conditional< Conjugate, const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, @@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con Index startRow = IsLower ? pi : pi-actualPanelWidth; Index startCol = IsLower ? 
0 : pi; - general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run( actualPanelWidth, r, - &lhs.coeffRef(startRow,startCol), lhsStride, - rhs + startCol, 1, + LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride), + RhsMapper(rhs + startCol, 1), rhs + startRow, 1, RhsScalar(-1)); } @@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con Index s = IsLower ? pi : i+1; if (k>0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum(); - + if(!(Mode & UnitDiag)) rhs[i] /= cjLhs(i,i); } @@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con { typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; typename internal::conditional<Conjugate, const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, const LhsMap& @@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con // let's directly call the low level product function because: // 1 - it is faster to compile // 2 - it is slightly faster at runtime - general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run( r, actualPanelWidth, - &lhs.coeffRef(endBlock,startBlock), lhsStride, - rhs+startBlock, 1, + LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), + RhsMapper(rhs+startBlock, 1), rhs+endBlock, 1, RhsScalar(-1)); } } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 9f9115c2a..3ec55fad2 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -18,13 +18,13 @@ namespace Eigen { namespace internal { // forward declarations -template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> struct gebp_kernel; -template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> +template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> struct gemm_pack_rhs; -template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> struct gemm_pack_lhs; template< @@ -34,7 +34,9 @@ template< int ResStorageOrder> struct general_matrix_matrix_product; -template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized> +template<typename Index, + typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized> struct general_matrix_vector_product; @@ -117,32 
+119,133 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); } }; + +template<typename Scalar, typename Index> +class BlasVectorMapper { + public: + EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_data[i]; + } + template <typename Packet, int AlignmentType> + EIGEN_ALWAYS_INLINE Packet load(Index i) const { + return ploadt<Packet, AlignmentType>(m_data + i); + } + + template <typename Packet> + bool aligned(Index i) const { + return (size_t(m_data+i)%sizeof(Packet))==0; + } + + protected: + Scalar* m_data; +}; + +template<typename Scalar, typename Index, int AlignmentType> +class BlasLinearMapper { + public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt<Packet, AlignmentType>(m_data + i); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt<HalfPacket, AlignmentType>(m_data + i); + } + + EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + pstoret<Scalar, Packet, AlignmentType>(m_data + i, p); + } + + protected: + Scalar *m_data; +}; + // Lightweight helper class to access matrix coefficients. -// Yes, this is somehow redundant with Map<>, but this version is much much lighter, -// and so I hope better compilation performance (time and code quality). -template<typename Scalar, typename Index, int StorageOrder> -class blas_data_mapper -{ +template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned> +class blas_data_mapper { public: - blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j) - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper; + typedef BlasVectorMapper<Scalar, Index> VectorMapper; + + EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + + EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType> + getSubMapper(Index i, Index j) const { + return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride); + } + + EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(&operator()(i, j)); + } + + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt<Packet, AlignmentType>(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt<HalfPacket, AlignmentType>(&operator()(i, j)); + } + + template<typename SubPacket> + EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const { + pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride); + } + + template<typename SubPacket> + EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride); + } + + const Index stride() const { return m_stride; } + + Index firstAligned(Index size) const { + if (size_t(m_data)%sizeof(Scalar)) { + return -1; + } + return internal::first_aligned(m_data, size); + } + protected: - Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; }; // lightweight helper class to access matrix coefficients (const version) template<typename Scalar, typename Index, int StorageOrder> -class const_blas_data_mapper -{ +class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> { public: - const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } - protected: - const Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {} + + EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const { + return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride); + } }; diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 5c7d70af6..d1855b50b 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100; * \sa \ref RowMajorBit, \ref TopicStorageOrders */ const unsigned int NoPreferredStorageOrderBit = 0x200; +/** \ingroup flags + * + * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format, + * that is, the expression provides: + * \code + inline const Scalar* valuePtr() const; + inline const Index* innerIndexPtr() const; + inline const Index* outerIndexPtr() const; + inline const Index* innerNonZeroPtr() const; + \endcode + */ +const unsigned int CompressedAccessBit = 0x400; + // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit @@ -449,6 +462,9 @@ enum Action {GetAction, SetAction}; /** The type used to identify a dense storage. */ struct Dense {}; +/** The type used to identify a general sparse storage. */ +struct Sparse {}; + /** The type used to identify a permutation storage. 
*/ struct PermutationStorage {}; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 11b7e2887..07923848a 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -73,7 +73,7 @@ /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) - #define EIGEN_COMP_MSVC_STRICT 1 + #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 #endif @@ -160,6 +160,12 @@ #define EIGEN_ARCH_ARM64 0 #endif +#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 + #define EIGEN_ARCH_ARM_OR_ARM64 1 +#else + #define EIGEN_ARCH_ARM_OR_ARM64 0 +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -376,10 +382,21 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support variadic templates? +#if __cplusplus > 199711L +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#endif + +// Does the compiler support const expressions? +#if (defined(__cplusplus) && __cplusplus >= 201402L) || \ + EIGEN_GNUC_AT_LEAST(4,9) +#define EIGEN_HAS_CONSTEXPR 1 +#endif + /** Allows disabling some optimizations which might affect the accuracy of the result. * Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them. * They currently include: - * - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled. + * - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled. */ #ifndef EIGEN_FAST_MATH #define EIGEN_FAST_MATH 1 @@ -526,7 +543,7 @@ namespace Eigen { #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) - #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64 + #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) @@ -540,7 +557,9 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM +#if (defined __CUDACC__) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif EIGEN_COMP_MSVC #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) @@ -592,7 +611,7 @@ namespace Eigen { // just an empty macro! 
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT +#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a54ccaedc..16f8cc1b0 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = *** Implementation of generic aligned realloc (when no realloc can be used)*** *****************************************************************************/ -void* aligned_malloc(std::size_t size); -void aligned_free(void *ptr); +EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); +EIGEN_DEVICE_FUNC void aligned_free(void *ptr); /** \internal * \brief Reallocates aligned memory. @@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size) *****************************************************************************/ #ifdef EIGEN_NO_MALLOC -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)"); } #elif defined EIGEN_RUNTIME_NO_MALLOC -inline bool is_malloc_allowed_impl(bool update, bool new_value = false) +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) { static bool value = true; if (update == 1) value = new_value; return value; } -inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } -inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } +EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } #else -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ -inline void* aligned_malloc(size_t size) +EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size) } /** \internal Frees memory allocated with aligned_malloc. */ -inline void aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if !EIGEN_ALIGN std::free(ptr); @@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown. 
*/ -template<bool Align> inline void* conditional_aligned_malloc(size_t size) +template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { return aligned_malloc(size); } -template<> inline void* conditional_aligned_malloc<false>(size_t size) +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size) { check_that_malloc_is_allowed(); @@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size) } /** \internal Frees memory allocated with conditional_aligned_malloc */ -template<bool Align> inline void conditional_aligned_free(void *ptr) +template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { aligned_free(ptr); } -template<> inline void conditional_aligned_free<false>(void *ptr) +template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr) { std::free(ptr); } @@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new /** \internal Destructs the elements of an array. * The \a size parameter tells on how many objects to call the destructor of T. */ -template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size) { // always destruct an array starting from the end. if(ptr) @@ -351,7 +351,7 @@ template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size) /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size) { size_t i; EIGEN_TRY @@ -371,7 +371,7 @@ template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size) *****************************************************************************/ template<typename T> -EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) { if(size > size_t(-1) / sizeof(T)) throw_std_bad_alloc(); @@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown. * The default constructor of T is called. */ -template<typename T> inline T* aligned_new(size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) { check_size_for_overflow<T>(size); T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size)); @@ -396,7 +396,7 @@ template<typename T> inline T* aligned_new(size_t size) } } -template<typename T, bool Align> inline T* conditional_aligned_new(size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size) { check_size_for_overflow<T>(size); T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size)); @@ -414,7 +414,7 @@ template<typename T, bool Align> inline T* conditional_aligned_new(size_t size) /** \internal Deletes objects constructed with aligned_new * The \a size parameter tells on how many objects to call the destructor of T. 
*/ -template<typename T> inline void aligned_delete(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size) { destruct_elements_of_array<T>(ptr, size); aligned_free(ptr); @@ -423,13 +423,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size) /** \internal Deletes objects constructed with conditional_aligned_new * The \a size parameter tells on how many objects to call the destructor of T. */ -template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size) { destruct_elements_of_array<T>(ptr, size); conditional_aligned_free<Align>(ptr); } -template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) { check_size_for_overflow<T>(new_size); check_size_for_overflow<T>(old_size); @@ -452,7 +452,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt } -template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size) { if(size==0) return 0; // short-cut. Also fixes Bug 884 @@ -495,7 +495,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto( return result; } -template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size) { if(NumTraits<T>::RequireInitialization) destruct_elements_of_array<T>(ptr, size); @@ -523,9 +523,8 @@ template<typename T, bool Align> inline void conditional_aligned_delete_auto(T * template<typename Scalar, typename Index> inline Index first_aligned(const Scalar* array, Index size) { - enum { PacketSize = packet_traits<Scalar>::size, - PacketAlignedMask = PacketSize-1 - }; + static const Index PacketSize = packet_traits<Scalar>::size; + static const Index PacketAlignedMask = PacketSize-1; if(PacketSize==1) { diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 299e5cbc2..528ebe297 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -463,6 +463,21 @@ template<typename XprType, typename CastType> struct cast_return_type const XprType&,CastType>::type type; }; +template <typename A, typename B> struct promote_storage_type; + +template <typename A> struct promote_storage_type<A,A> +{ + typedef A ret; +}; +template <typename A> struct promote_storage_type<A, const A> +{ + typedef A ret; +}; +template <typename A> struct promote_storage_type<const A, A> +{ + typedef A ret; +}; + /** \internal Specify the "storage kind" of applying a coefficient-wise * binary operation between two expressions of kinds A and B respectively. 
* The template parameter Functor permits to specialize the resulting storage kind wrt to diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index ae10ff91e..128ef9028 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -313,7 +313,7 @@ namespace Eigen { using std::abs; using std::sqrt; const Index dim=m_S.cols(); - if (abs(m_S.coeff(i+1,i)==Scalar(0))) + if (abs(m_S.coeff(i+1,i))==Scalar(0)) return; Index z = findSmallDiagEntry(i,i+1); if (z==i-1) diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 10f5fb174..51e61ba38 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -234,7 +234,7 @@ template<typename _MatrixType> class RealSchur typedef Matrix<Scalar,3,1> Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu, const Scalar& norm); + Index findSmallSubdiagEntry(Index iu); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -286,7 +286,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu, norm); + Index il = findSmallSubdiagEntry(iu); // Check for convergence if (il == iu) // One root found @@ -343,16 +343,14 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template<typename MatrixType> -inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& norm) +inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (s == 0.0) - s = norm; - if (abs(m_matT.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s) + if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s) break; res--; } @@ -457,9 +455,7 @@ inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const V const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2))); const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1))); if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs) - { break; - } } } diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 276e94c58..54842b5ff 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -78,6 +78,8 @@ struct traits<Transform<_Scalar,_Dim,_Mode,_Options> > }; }; +template<int Mode> struct transform_make_affine; + } // end namespace internal /** \geometry_module \ingroup Geometry_Module @@ -247,8 +249,7 @@ public: inline Transform() { check_template_params(); - if (int(Mode)==Affine) - makeAffine(); + internal::transform_make_affine<(int(Mode)==Affine) ? 
Affine : AffineCompact>::run(m_matrix); } inline Transform(const Transform& other) @@ -611,11 +612,7 @@ public: */ void makeAffine() { - if(int(Mode)!=int(AffineCompact)) - { - matrix().template block<1,Dim>(Dim,0).setZero(); - matrix().coeffRef(Dim,Dim) = Scalar(1); - } + internal::transform_make_affine<int(Mode)>::run(m_matrix); } /** \internal @@ -1103,6 +1100,24 @@ Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBas namespace internal { +template<int Mode> +struct transform_make_affine +{ + template<typename MatrixType> + static void run(MatrixType &mat) + { + static const int Dim = MatrixType::ColsAtCompileTime-1; + mat.template block<1,Dim>(Dim,0).setZero(); + mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1); + } +}; + +template<> +struct transform_make_affine<AffineCompact> +{ + template<typename MatrixType> static void run(MatrixType &) { } +}; + // selector needed to avoid taking the inverse of a 3x4 matrix template<typename TransformType, int Mode=TransformType::Mode> struct projective_transform_inverse diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 5f55efbe9..a715c7285 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -139,11 +139,7 @@ struct traits<BiCGSTAB<_MatrixType,_Preconditioner> > * \include BiCGSTAB_simple.cpp * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * \include BiCGSTAB_step_by_step.cpp - * Note that such a step by step execution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ @@ -192,7 +188,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) + if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 2ccbf4fe6..d8bc13f5f 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -113,8 +113,8 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> > * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * Upper, or Lower|Upper, in which case the full matrix entries will be considered. Default is Lower. * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() @@ -137,20 +137,7 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. 
- * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * cg.setMaxIterations(1); - * int i = 0; - * do { - * x = cg.solveWithGuess(b,x); - * std::cout << i << " : " << cg.error() << std::endl; - * ++i; - * } while (cg.info()!=Success && i<100); - * \endcode - * Note that such a step by step execution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ @@ -196,6 +183,10 @@ public: template<typename Rhs,typename Dest> void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional<UpLo==(Lower|Upper), + Ref<const MatrixType>&, + SparseSelfAdjointView<const Ref<const MatrixType>, UpLo> + >::type MatrixWrapperType; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -205,8 +196,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj, - Base::m_preconditioner, m_iterations, m_error); + internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } m_isInitialized = true; diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index cc99e00f9..2afd80f4d 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -37,7 +37,7 @@ public: /** Default constructor. */ IterativeSolverBase() - : mp_matrix(0) + : m_dummy(0,0), mp_matrix(m_dummy) { init(); } @@ -52,10 +52,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit IterativeSolverBase(const MatrixType& A) + template<typename SparseMatrixDerived> + explicit IterativeSolverBase(const SparseMatrixBase<SparseMatrixDerived>& A) { init(); - compute(A); + compute(A.derived()); } ~IterativeSolverBase() {} @@ -65,9 +66,11 @@ public: * Currently, this function mostly calls analyzePattern on the preconditioner. In the future * we might, for instance, implement column reordering for faster matrix vector products. */ - Derived& analyzePattern(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& analyzePattern(const SparseMatrixBase<SparseMatrixDerived>& A) { - m_preconditioner.analyzePattern(A); + grab(A); + m_preconditioner.analyzePattern(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_info = Success; @@ -83,11 +86,12 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - Derived& factorize(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& factorize(const SparseMatrixBase<SparseMatrixDerived>& A) { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); - mp_matrix = &A; - m_preconditioner.factorize(A); + grab(A); + m_preconditioner.factorize(mp_matrix); m_factorizationIsOk = true; m_info = Success; return derived(); @@ -103,10 +107,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. 
*/ - Derived& compute(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& compute(const SparseMatrixBase<SparseMatrixDerived>& A) { - mp_matrix = &A; - m_preconditioner.compute(A); + grab(A); + m_preconditioner.compute(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_factorizationIsOk = true; @@ -115,9 +120,10 @@ public: } /** \internal */ - StorageIndex rows() const { return mp_matrix ? mp_matrix->rows() : 0; } + Index rows() const { return mp_matrix.rows(); } + /** \internal */ - StorageIndex cols() const { return mp_matrix ? mp_matrix->cols() : 0; } + Index cols() const { return mp_matrix.cols(); } /** \returns the tolerance threshold used by the stopping criteria */ RealScalar tolerance() const { return m_tolerance; } @@ -135,13 +141,18 @@ public: /** \returns a read-only reference to the preconditioner. */ const Preconditioner& preconditioner() const { return m_preconditioner; } - /** \returns the max number of iterations */ + /** \returns the max number of iterations. + * It is either the value set by setMaxIterations() or, by default, + * twice the number of columns of the matrix. + */ int maxIterations() const { - return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations; + return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations; } - /** Sets the max number of iterations */ + /** Sets the max number of iterations. + * Default is twice the number of columns of the matrix. + */ Derived& setMaxIterations(int maxIters) { m_maxIterations = maxIters; @@ -210,7 +221,16 @@ protected: m_maxIterations = -1; m_tolerance = NumTraits<Scalar>::epsilon(); } - const MatrixType* mp_matrix; + + template<typename SparseMatrixDerived> + void grab(const SparseMatrixBase<SparseMatrixDerived> &A) + { + mp_matrix.~Ref<const MatrixType>(); + ::new (&mp_matrix) Ref<const MatrixType>(A); + } + + MatrixType m_dummy; + Ref<const MatrixType> mp_matrix; Preconditioner m_preconditioner; int m_maxIterations; diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 27acf4128..e20c9ba2a 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -43,7 +43,7 @@ namespace internal { typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; template<typename _MatrixType, int Options> @@ -52,7 +52,7 @@ namespace internal { typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; template<typename _MatrixType, int Options> @@ -61,7 +61,7 @@ namespace internal { typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) @@ -138,7 +138,6 @@ class PastixBase : public SparseSolverBase<Derived> typedef _MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::Index Index; typedef typename 
MatrixType::StorageIndex StorageIndex; typedef Matrix<Scalar,Dynamic,1> Vector; typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix; @@ -163,7 +162,7 @@ class PastixBase : public SparseSolverBase<Derived> * The statistics related to the different phases of factorization and solve are saved here as well * \sa analyzePattern() factorize() */ - Array<Index,IPARM_SIZE,1>& iparm() + Array<StorageIndex,IPARM_SIZE,1>& iparm() { return m_iparm; } @@ -243,8 +242,8 @@ class PastixBase : public SparseSolverBase<Derived> mutable int m_comm; // The MPI communicator identifier mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters - mutable Matrix<Index,Dynamic,1> m_perm; // Permutation vector - mutable Matrix<Index,Dynamic,1> m_invp; // Inverse permutation vector + mutable Matrix<StorageIndex,Dynamic,1> m_perm; // Permutation vector + mutable Matrix<StorageIndex,Dynamic,1> m_invp; // Inverse permutation vector mutable int m_size; // Size of the matrix }; @@ -410,7 +409,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> > typedef _MatrixType MatrixType; typedef PastixBase<PastixLU<MatrixType> > Base; typedef typename Base::ColSpMatrix ColSpMatrix; - typedef typename MatrixType::Index Index; + typedef typename MatrixType::StorageIndex StorageIndex; public: PastixLU() : Base() diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 6fade3755..0dee139ce 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -477,20 +477,10 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const // we store that back into our table: it can't hurt to correct our table. m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm; - // if the current biggest column is smaller than epsilon times the initial biggest column, - // terminate to avoid generating nan/inf values. - // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so) - // repetitions of the unit test, with the result of solve() filled with large values of the order - // of 1/(size*epsilon). - if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) - { + // Track the number of meaningful pivots but do not stop the decomposition to make + // sure that the initial matrix is properly reproduced. See bug 941. + if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) m_nonzero_pivots = k; - m_hCoeffs.tail(size-k).setZero(); - m_qr.bottomRightCorner(rows-k,cols-k) - .template triangularView<StrictlyLower>() - .setZero(); - break; - } // apply the transposition to the columns m_colsTranspositions.coeffRef(k) = biggest_col_index; @@ -519,7 +509,7 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const } m_colsPermutation.setIdentity(PermIndexType(cols)); - for(PermIndexType k = 0; k < m_nonzero_pivots; ++k) + for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k) m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k))); m_det_pq = (number_of_transpositions%2) ? -1 : 1; @@ -575,13 +565,15 @@ struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, interna } // end namespace internal -/** \returns the matrix Q as a sequence of householder transformations */ +/** \returns the matrix Q as a sequence of householder transformations. 
+ * You can extract only the meaningful part by using: + * \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode */ template<typename MatrixType> typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType> ::householderQ() const { eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots); + return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()); } #ifndef __CUDACC__ diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 5fd18b787..58efa6024 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -68,13 +68,13 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -99,10 +99,25 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > if(m_isInitialized) SPQR_free(); MatrixType mat(matrix); + + /* Compute the default threshold as in MatLab, see: + * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 + */ + RealScalar pivotThreshold = m_tolerance; + if(m_useDefaultThreshold) + { + RealScalar max2Norm = 0.0; + for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm()); + if(max2Norm==RealScalar(0)) + max2Norm = RealScalar(1); + pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon(); + } + cholmod_sparse A; A = viewAsCholmod(mat); Index col = matrix.cols(); - m_rank = SuiteSparseQR<Scalar>(m_ordering, m_tolerance, col, &A, + m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc); if (!m_cR) @@ -118,7 +133,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > /** * Get the number of rows of the input matrix and the Q matrix */ - inline Index rows() const {return m_H->nrow; } + inline Index rows() const {return m_cR->nrow; } /** * Get the number of columns of the input matrix. 
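To make the default-threshold computation in compute() above concrete: it scales machine epsilon by the problem dimensions and by the largest column 2-norm of the input. A stand-alone illustration on a small dense matrix (the random input is invented for the example; SPQR evaluates the same formula on its sparse input):

#include <Eigen/Dense>
#include <iostream>
#include <limits>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 40);
  // Largest column 2-norm, guarded against an all-zero matrix as in compute().
  double max2Norm = A.colwise().norm().maxCoeff();
  if (max2Norm == 0.0) max2Norm = 1.0;
  // Davis' default rank-detection tolerance: 20 * (m + n) * max2Norm * eps.
  double tol = 20 * (A.rows() + A.cols()) * max2Norm
             * std::numeric_limits<double>::epsilon();
  std::cout << "default SPQR pivot threshold: " << tol << "\n";
  return 0;
}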
@@ -130,16 +145,25 @@ { eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()"); eigen_assert(b.cols()==1 && "This method is for vectors only"); - + // Compute Q^T * b - typename Dest::PlainObject y; + typename Dest::PlainObject y, y2; y = matrixQ().transpose() * b; - // Solves with the triangular matrix R + + // Solves with the triangular matrix R Index rk = this->rank(); - y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y.topRows(rk)); - y.bottomRows(cols()-rk).setZero(); + y2 = y; + y.resize((std::max)(cols(),Index(y.rows())),y.cols()); + y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk)); + // Apply the column permutation - dest.topRows(cols()) = colsPermutation() * y.topRows(cols()); + // colsPermutation() performs a copy of the permutation, + // so let's apply it manually: + for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i); + for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero(); + +// y.bottomRows(y.rows()-rk).setZero(); +// dest = colsPermutation() * y.topRows(cols()); m_info = Success; } @@ -178,7 +202,11 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > /// Set the fill-reducing ordering method to be used void setSPQROrdering(int ord) { m_ordering = ord;} /// Set the tolerance tol to treat columns with 2-norm <= tol as zero - void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; } + void setPivotThreshold(const RealScalar& tol) + { + m_useDefaultThreshold = false; + m_tolerance = tol; + } /** \returns a pointer to the SPQR workspace */ cholmod_common *cholmodCommon() const { return &m_cc; } @@ -210,6 +238,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > mutable cholmod_dense *m_HTau; // The Householder coefficients mutable StorageIndex m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters + bool m_useDefaultThreshold; // Use default threshold template<typename ,typename > friend struct SPQR_QProduct; }; diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 0f7e5b8fe..444187ae7 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -628,6 +628,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols; internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows; + MatrixType m_scaledMatrix; }; template<typename MatrixType, int QRPreconditioner> @@ -674,8 +675,9 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, u : 0); m_workMatrix.resize(m_diagSize, m_diagSize); - if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); - if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); + if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols!=m_rows) m_scaledMatrix.resize(rows,cols); } template<typename MatrixType, int QRPreconditioner> @@ -698,7 +700,13 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig /*** step 1. 
The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ - if(!m_qr_precond_morecols.run(*this, matrix/scale) && !m_qr_precond_morerows.run(*this, matrix/scale)) + if(m_rows!=m_cols) + { + m_scaledMatrix = matrix / scale; + m_qr_precond_morecols.run(*this, m_scaledMatrix); + m_qr_precond_morerows.run(*this, m_scaledMatrix); + } + else { m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale; if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows); diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index b148d6b1f..c47077e50 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -17,6 +17,27 @@ enum SimplicialCholeskyMode { SimplicialCholeskyLDLT }; +namespace internal { + template<typename CholMatrixType, typename InputMatrixType> + struct simplicial_cholesky_grab_input { + typedef CholMatrixType const * ConstCholMatrixPtr; + static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp) + { + tmp = input; + pmat = &tmp; + } + }; + + template<typename MatrixType> + struct simplicial_cholesky_grab_input<MatrixType,MatrixType> { + typedef MatrixType const * ConstMatrixPtr; + static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/) + { + pmat = &input; + } + }; +} // end namespace internal + /** \ingroup SparseCholesky_Module * \brief A direct sparse Cholesky factorization * @@ -46,6 +67,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType; + typedef CholMatrixType const * ConstCholMatrixPtr; typedef Matrix<Scalar,Dynamic,1> VectorType; public: @@ -169,10 +191,11 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(matrix.rows()==matrix.cols()); Index size = matrix.cols(); - CholMatrixType ap(size,size); - ordering(matrix, ap); - analyzePattern_preordered(ap, DoLDLT); - factorize_preordered<DoLDLT>(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(matrix, pmat, tmp); + analyzePattern_preordered(*pmat, DoLDLT); + factorize_preordered<DoLDLT>(*pmat); } template<bool DoLDLT> @@ -180,9 +203,21 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); - factorize_preordered<DoLDLT>(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + + if(m_P.size()==0 && (UpLo&Upper)==Upper) + { + // If there is no ordering, try to directly use the input matrix without any copy + internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp); + } + else + { + tmp.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); + pmat = &tmp; + } + + factorize_preordered<DoLDLT>(*pmat); } template<bool DoLDLT> @@ -192,13 +227,14 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ordering(a, ap); - analyzePattern_preordered(ap,doLDLT); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(a, pmat, tmp); + analyzePattern_preordered(*pmat,doLDLT); } void analyzePattern_preordered(const CholMatrixType& a, 
bool doLDLT); - void ordering(const MatrixType& a, CholMatrixType& ap); + void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap); /** keeps off-diagonal entries; drops diagonal entries */ struct keep_diag { @@ -603,26 +639,41 @@ public: }; template<typename Derived> -void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap) +void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap) { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); - // Note that amd compute the inverse permutation + pmat = &ap; + // Note that ordering methods compute the inverse permutation + if(!internal::is_same<OrderingType,NaturalOrdering<Index> >::value) { - CholMatrixType C; - C = a.template selfadjointView<UpLo>(); + { + CholMatrixType C; + C = a.template selfadjointView<UpLo>(); + + OrderingType ordering; + ordering(C,m_Pinv); + } + + if(m_Pinv.size()>0) m_P = m_Pinv.inverse(); + else m_P.resize(0); - OrderingType ordering; - ordering(C,m_Pinv); + ap.resize(size,size); + ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); } - - if(m_Pinv.size()>0) - m_P = m_Pinv.inverse(); else + { + m_Pinv.resize(0); m_P.resize(0); - - ap.resize(size,size); - ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); + if(UpLo==Lower || MatrixType::IsRowMajor) + { + // we have to transpose the lower part to the upper one + ap.resize(size,size); + ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>(); + } + else + internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, ap); + } } } // end namespace Eigen diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 302323ab4..a93189df2 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -126,7 +126,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType& StorageIndex top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L - for(typename MatrixType::InnerIterator it(ap,k); it; ++it) + for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it) { StorageIndex i = it.index(); if(i <= k) diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 759fc08ff..8bde5d58e 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -73,7 +73,7 @@ class AmbiVector delete[] m_buffer; if (size<1000) { - Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar); + Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar); m_allocatedElements = convert_index((allocSize*sizeof(Scalar))/sizeof(ListEl)); m_buffer = new Scalar[allocSize]; } @@ -92,7 +92,7 @@ class AmbiVector Index copyElements = m_allocatedElements; m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements*1.5),m_size); Index allocSize = m_allocatedElements * sizeof(ListEl); - allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0); + allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index f98b42760..2d4f2bcf8 100644 --- 
a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -143,7 +143,7 @@ class CompressedStorage } /** Like at(), but the search is performed in the range [start,end) */ - inline const Scalar& atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const + inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar &defaultValue = Scalar(0)) const { if (start>=end) return defaultValue; diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h index 5e4580329..67718c85b 100644 --- a/Eigen/src/SparseCore/MappedSparseMatrix.h +++ b/Eigen/src/SparseCore/MappedSparseMatrix.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,9 +10,10 @@ #ifndef EIGEN_MAPPED_SPARSEMATRIX_H #define EIGEN_MAPPED_SPARSEMATRIX_H -namespace Eigen { +namespace Eigen { -/** \class MappedSparseMatrix +/** \deprecated Use Map<SparseMatrix<> > + * \class MappedSparseMatrix * * \brief Sparse matrix * @@ -25,179 +26,38 @@ namespace internal { template<typename _Scalar, int _Flags, typename _StorageIndex> struct traits<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > : traits<SparseMatrix<_Scalar, _Flags, _StorageIndex> > {}; -} +} // end namespace internal template<typename _Scalar, int _Flags, typename _StorageIndex> class MappedSparseMatrix - : public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > + : public Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > { - public: - EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix) - enum { IsRowMajor = Base::IsRowMajor }; - - protected: - - StorageIndex m_outerSize; - StorageIndex m_innerSize; - StorageIndex m_nnz; - StorageIndex* m_outerIndex; - StorageIndex* m_innerIndices; - Scalar* m_values; + typedef Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > Base; public: - - inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } - inline StorageIndex cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } - inline StorageIndex innerSize() const { return m_innerSize; } - inline StorageIndex outerSize() const { return m_outerSize; } - bool isCompressed() const { return true; } - - //---------------------------------------- - // direct access interface - inline const Scalar* valuePtr() const { return m_values; } - inline Scalar* valuePtr() { return m_values; } - - inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } - inline StorageIndex* innerIndexPtr() { return m_innerIndices; } - - inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } - inline StorageIndex* outerIndexPtr() { return m_outerIndex; } - //---------------------------------------- - - inline Scalar coeff(Index row, Index col) const - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? 
col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - if (start==end) - return Scalar(0); - else if (end>0 && inner==m_innerIndices[end-1]) - return m_values[end-1]; - // ^^ optimization: let's first check if it is the last coefficient - // (very common in high level algorithms) + typedef typename Base::StorageIndex StorageIndex; + typedef typename Base::Scalar Scalar; - const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); - const Index id = r-&m_innerIndices[0]; - return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); - } - - inline Scalar& coeffRef(Index row, Index col) - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); - eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); - StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner); - const Index id = r-&m_innerIndices[0]; - eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient"); - return m_values[id]; - } - - class InnerIterator; - class ReverseInnerIterator; - - /** \returns the number of non zero coefficients */ - inline StorageIndex nonZeros() const { return m_nnz; } - - inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr) - : m_outerSize(convert_index(IsRowMajor?rows:cols)), m_innerSize(convert_index(IsRowMajor?cols:rows)), m_nnz(convert_index(nnz)), - m_outerIndex(outerIndexPtr), m_innerIndices(innerIndexPtr), m_values(valuePtr) + inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr) {} /** Empty destructor */ inline ~MappedSparseMatrix() {} }; -template<typename Scalar, int _Flags, typename _StorageIndex> -class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::InnerIterator -{ - public: - InnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(convert_index(outer)), - m_id(mat.outerIndexPtr()[outer]), - m_start(m_id), - m_end(mat.outerIndexPtr()[outer+1]) - {} - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); } - - inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id]; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; - const StorageIndex m_end; -}; - -template<typename Scalar, int _Flags, typename _StorageIndex> -class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer+1]), - m_start(mat.outerIndexPtr()[outer]), - m_end(m_id) - {} - - inline ReverseInnerIterator& operator--() { m_id--; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); } - - inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id-1]; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; - const StorageIndex m_end; -}; - namespace internal { -template<typename _Scalar, int _Options, typename _Index> -struct evaluator<MappedSparseMatrix<_Scalar,_Options,_Index> > - : evaluator_base<MappedSparseMatrix<_Scalar,_Options,_Index> > +template<typename _Scalar, int _Options, typename _StorageIndex> +struct evaluator<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > + : evaluator<SparseCompressedBase<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > > { - typedef MappedSparseMatrix<_Scalar,_Options,_Index> MappedSparseMatrixType; - typedef typename MappedSparseMatrixType::InnerIterator InnerIterator; - typedef typename MappedSparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = MappedSparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const MappedSparseMatrixType &mat) : m_matrix(&mat) {} - - operator MappedSparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const MappedSparseMatrixType&() const { return *m_matrix; } + typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType; + typedef evaluator<SparseCompressedBase<XprType> > Base; - const MappedSparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 8db4bbb75..dfdf4314a 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -74,7 +74,7 @@ namespace internal { template<typename SparseMatrixType, int BlockRows, int BlockCols>
class sparse_matrix_block_impl
- : public SparseMatrixBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
+ : public SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
{
typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
@@ -173,19 +173,24 @@ public: }
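// note the semantic change in this hunk: valuePtr() and innerIndexPtr() now
// alias the arrays of the whole underlying matrix instead of being pre-shifted
// by outerIndexPtr()[m_outerStart]; only outerIndexPtr() remains shifted, so
// the positions it yields index the underlying arrays directly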
inline const Scalar* valuePtr() const
- { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.valuePtr(); }
inline Scalar* valuePtr()
- { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.const_cast_derived().valuePtr(); }
inline const StorageIndex* innerIndexPtr() const
- { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.innerIndexPtr(); }
inline StorageIndex* innerIndexPtr()
- { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.const_cast_derived().innerIndexPtr(); }
inline const StorageIndex* outerIndexPtr() const
{ return m_matrix.outerIndexPtr() + m_outerStart; }
inline StorageIndex* outerIndexPtr()
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
+
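// like SparseMatrix, these return the null pointer 0 in compressed mode, so
// callers must check isCompressed() (added below) before dereferencing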
+ inline const StorageIndex* innerNonZeroPtr() const
+ { return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
+ inline StorageIndex* innerNonZeroPtr()
+ { return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
StorageIndex nonZeros() const
{
@@ -197,6 +202,8 @@ public: else
return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
}
+
+ bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
const Scalar& lastCoeff() const
{
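The two new files that follow, SparseCompressedBase.h and SparseMap.h, factor the raw-array interface (valuePtr(), innerIndexPtr(), outerIndexPtr(), innerNonZeroPtr()) out of SparseMatrix so that maps, refs, blocks and transposes can share it, and MappedSparseMatrix is reduced to a deprecated alias of Map<SparseMatrix<> >. A minimal usage sketch of the Map<SparseMatrix<> > class introduced below, wrapping an existing compressed column storage; the data and variable names are our own illustration, not part of the patch:

#include <Eigen/SparseCore>
using namespace Eigen;

int main()
{
  // a 3x3 compressed column-major matrix with 2 nonzeros: (0,0)=1 and (2,1)=2
  int    outer[4]  = {0, 1, 2, 2};   // column start offsets, outer[3]==nnz
  int    inner[2]  = {0, 2};         // row index of each stored value
  double values[2] = {1.0, 2.0};

  // wrap the existing arrays without copying them
  Map<SparseMatrix<double> > view(3, 3, 2, outer, inner, values);
  return view.coeff(2, 1) == 2.0 ? 0 : 1;
}

Since SparseMatrix now advertises CompressedAccessBit (see the traits change further down), such a map should also bind to a Ref<const SparseMatrix<double> > without creating a temporary.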
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h new file mode 100644 index 000000000..a5ba45e04 --- /dev/null +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H +#define EIGEN_SPARSE_COMPRESSED_BASE_H + +namespace Eigen { + +template<typename Derived> class SparseCompressedBase; + +namespace internal { + +template<typename Derived> +struct traits<SparseCompressedBase<Derived> > : traits<Derived> +{}; + +} // end namespace internal + +template<typename Derived> +class SparseCompressedBase + : public SparseMatrixBase<Derived> +{ + public: + typedef SparseMatrixBase<Derived> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase) + using Base::operator=; + using Base::IsRowMajor; + + class InnerIterator; + class ReverseInnerIterator; + + /** \returns a const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline const Scalar* valuePtr() const { return derived().valuePtr(); } + /** \returns a non-const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline Scalar* valuePtr() { return derived().valuePtr(); } + + /** \returns a const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); } + /** \returns a non-const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); } + + /** \returns a const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); } + /** \returns a non-const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); } + + /** \returns a const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); } + /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); } + + /** \returns whether \c *this is in compressed form. 
*/ + inline bool isCompressed() const { return innerNonZeroPtr()==0; } + +}; + +template<typename Derived> +class SparseCompressedBase<Derived>::InnerIterator +{ + public: + InnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_end = mat.outerIndexPtr()[outer+1]; + else + m_end = m_id + mat.innerNonZeroPtr()[outer]; + } + + inline InnerIterator& operator++() { m_id++; return *this; } + + inline const Scalar& value() const { return m_values[m_id]; } + inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); } + + inline StorageIndex index() const { return m_indices[m_id]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id < m_end); } + + protected: + const Scalar* m_values; + const StorageIndex* m_indices; + const Index m_outer; + Index m_id; + Index m_end; + private: + // If you get here, then you're not using the right InnerIterator type, e.g.: + // SparseMatrix<double,RowMajor> A; + // SparseMatrix<double>::InnerIterator it(A,0); + template<typename T> InnerIterator(const SparseMatrixBase<T>&, Index outer); +}; + +template<typename Derived> +class SparseCompressedBase<Derived>::ReverseInnerIterator +{ + public: + ReverseInnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_id = mat.outerIndexPtr()[outer+1]; + else + m_id = m_start + mat.innerNonZeroPtr()[outer]; + } + + inline ReverseInnerIterator& operator--() { --m_id; return *this; } + + inline const Scalar& value() const { return m_values[m_id-1]; } + inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); } + + inline StorageIndex index() const { return m_indices[m_id-1]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id > m_start); } + + protected: + const Scalar* m_values; + const StorageIndex* m_indices; + const Index m_outer; + Index m_id; + const Index m_start; +}; + +namespace internal { + +template<typename Derived> +struct evaluator<SparseCompressedBase<Derived> > + : evaluator_base<Derived> +{ + typedef typename Derived::Scalar Scalar; + typedef typename Derived::InnerIterator InnerIterator; + typedef typename Derived::ReverseInnerIterator ReverseInnerIterator; + + enum { + CoeffReadCost = NumTraits<Scalar>::ReadCost, + Flags = Derived::Flags + }; + + evaluator() : m_matrix(0) {} + explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + + operator Derived&() { return m_matrix->const_cast_derived(); } + operator const Derived&() const { return *m_matrix; } + + typedef typename DenseCoeffsBase<Derived,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType; + Scalar coeff(Index row, Index col) const + { return m_matrix->coeff(row,col); } + + Scalar& coeffRef(Index row, Index col) + { + eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? 
col : row; + + Index start = m_matrix->outerIndexPtr()[outer]; + Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; + eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) + - m_matrix->innerIndexPtr(); + eigen_assert((p<end) && (m_matrix->innerIndexPtr()[p]==inner) && "written coefficient does not exist"); + return m_matrix->const_cast_derived().valuePtr()[p]; + } + + const Derived *m_matrix; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_COMPRESSED_BASE_H diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h new file mode 100644 index 000000000..a6ff7d559 --- /dev/null +++ b/Eigen/src/SparseCore/SparseMap.h @@ -0,0 +1,239 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_MAP_H +#define EIGEN_SPARSE_MAP_H + +namespace Eigen { + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct traits<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef traits<PlainObjectType> TraitsBase; + enum { + Flags = TraitsBase::Flags & (~NestByRefBit) + }; +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef traits<PlainObjectType> TraitsBase; + enum { + Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit)) + }; +}; + +} // end namespace internal + +template<typename Derived, + int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors +> class SparseMapBase; + +template<typename Derived> +class SparseMapBase<Derived,ReadOnlyAccessors> + : public SparseCompressedBase<Derived> +{ + public: + typedef SparseCompressedBase<Derived> Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + enum { IsRowMajor = Base::IsRowMajor }; + using Base::operator=; + protected: + + typedef typename internal::conditional< + bool(internal::is_lvalue<Derived>::value), + Scalar *, const Scalar *>::type ScalarPointer; + typedef typename internal::conditional< + bool(internal::is_lvalue<Derived>::value), + StorageIndex *, const StorageIndex *>::type IndexPointer; + + Index m_outerSize; + Index m_innerSize; + Index m_nnz; + IndexPointer m_outerIndex; + IndexPointer m_innerIndices; + ScalarPointer m_values; + IndexPointer m_innerNonZeros; + + public: + + inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + inline Index cols() const { return IsRowMajor ? 
m_innerSize : m_outerSize; } + inline Index innerSize() const { return m_innerSize; } + inline Index outerSize() const { return m_outerSize; } + + bool isCompressed() const { return m_innerNonZeros==0; } + + //---------------------------------------- + // direct access interface + inline const Scalar* valuePtr() const { return m_values; } + inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } + inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } + inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; } + //---------------------------------------- + + inline Scalar coeff(Index row, Index col) const + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = m_outerIndex[outer]; + Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer]; + if (start==end) + return Scalar(0); + else if (end>0 && inner==m_innerIndices[end-1]) + return m_values[end-1]; + // ^^ optimization: let's first check if it is the last coefficient + // (very common in high level algorithms) + + const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); + const Index id = r-&m_innerIndices[0]; + return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); + } + + /** \returns the number of non zero coefficients */ + inline Index nonZeros() const { return m_nnz; } + + inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr, + ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0) + : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr), + m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template<typename Derived> +class SparseMapBase<Derived,WriteAccessors> + : public SparseMapBase<Derived,ReadOnlyAccessors> +{ + typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase; + + public: + typedef SparseMapBase<Derived, ReadOnlyAccessors> Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + enum { IsRowMajor = Base::IsRowMajor }; + + using Base::operator=; + + public: + + //---------------------------------------- + // direct access interface + using Base::valuePtr; + using Base::innerIndexPtr; + using Base::outerIndexPtr; + using Base::innerNonZeroPtr; + inline Scalar* valuePtr() { return Base::m_values; } + inline StorageIndex* innerIndexPtr() { return Base::m_innerIndices; } + inline StorageIndex* outerIndexPtr() { return Base::m_outerIndex; } + inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; } + //---------------------------------------- + + inline Scalar& coeffRef(Index row, Index col) + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = Base::m_outerIndex[outer]; + Index end = Base::isCompressed() ? 
Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer]; + eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); + eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); + Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner); + const Index id = r - &Base::m_innerIndices[0]; + eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient"); + return const_cast<Scalar*>(Base::m_values)[id]; + } + + inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, + Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + public: + typedef SparseMapBase<Map> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, + StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public SparseMapBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + public: + typedef SparseMapBase<Map> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr, + const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : 
Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_MAP_H diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index dd11fcafd..c049f9bcb 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -51,7 +51,7 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> > ColsAtCompileTime = Dynamic, MaxRowsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic, - Flags = _Options | NestByRefBit | LvalueBit, + Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; @@ -90,16 +90,20 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> template<typename _Scalar, int _Options, typename _Index> class SparseMatrix - : public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> > + : public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _Index> > { public: - EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) + typedef SparseCompressedBase<SparseMatrix> Base; + using Base::isCompressed; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) typedef MappedSparseMatrix<Scalar,Flags> Map; typedef Diagonal<SparseMatrix> DiagonalReturnType; typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType; + typedef typename Base::InnerIterator InnerIterator; + typedef typename Base::ReverseInnerIterator ReverseInnerIterator; using Base::IsRowMajor; @@ -124,9 +128,6 @@ class SparseMatrix public: - /** \returns whether \c *this is in compressed form. */ - inline bool isCompressed() const { return m_innerNonZeros==0; } - /** \returns the number of rows of the matrix */ inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } /** \returns the number of columns of the matrix */ @@ -180,7 +181,7 @@ class SparseMatrix /** \returns the value of the matrix at position \a i, \a j * This function returns Scalar(0) if the element is an explicit \em zero */ - inline const Scalar& coeff(Index row, Index col) const + inline Scalar coeff(Index row, Index col) const { eigen_assert(row>=0 && row<rows() && col>=0 && col<cols()); @@ -242,9 +243,6 @@ class SparseMatrix public: - class InnerIterator; - class ReverseInnerIterator; - /** Removes all non zeros but keep allocated memory */ inline void setZero() { @@ -874,77 +872,6 @@ private: }; }; -template<typename Scalar, int _Options, typename _Index> -class SparseMatrix<Scalar,_Options,_Index>::InnerIterator -{ - public: - InnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(convert_index(outer)), m_id(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_end = mat.m_outerIndex[outer+1]; - else - m_end = m_id + mat.m_innerNonZeros[outer]; - } - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline const Scalar& value() const { return m_values[m_id]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); } - - inline StorageIndex index() const { return m_indices[m_id]; } - inline StorageIndex outer() const { return m_outer; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end); } - - protected: - const Scalar* m_values; - const StorageIndex* m_indices; - const StorageIndex m_outer; - StorageIndex m_id; - StorageIndex m_end; - private: - // If you get here, then you're not using the right InnerIterator type, e.g.: - // SparseMatrix<double,RowMajor> A; - // SparseMatrix<double>::InnerIterator it(A,0); - template<typename T> InnerIterator(const SparseMatrixBase<T>&,Index outer); -}; - -template<typename Scalar, int _Options, typename _Index> -class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_id = mat.m_outerIndex[outer+1]; - else - m_id = m_start + mat.m_innerNonZeros[outer]; - } - - inline ReverseInnerIterator& operator--() { --m_id; return *this; } - - inline const Scalar& value() const { return m_values[m_id-1]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); } - - inline StorageIndex index() const { return m_indices[m_id-1]; } - inline StorageIndex outer() const { return m_outer; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id > m_start); } - - protected: - const Scalar* m_values; - const StorageIndex* m_indices; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; -}; - namespace internal { template<typename InputIterator, typename SparseMatrixType> @@ -1074,6 +1001,10 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit); if (needToTranspose) { @@ -1276,44 +1207,12 @@ namespace internal { template<typename _Scalar, int _Options, typename _Index> struct evaluator<SparseMatrix<_Scalar,_Options,_Index> > - : evaluator_base<SparseMatrix<_Scalar,_Options,_Index> > + : evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > { - typedef _Scalar Scalar; + typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > Base; typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType; - typedef typename SparseMatrixType::InnerIterator InnerIterator; - typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = SparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {} - - operator SparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const SparseMatrixType&() const { return *m_matrix; } - - typedef typename DenseCoeffsBase<SparseMatrixType,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType; - CoeffReturnType coeff(Index row, Index col) const - { return m_matrix->coeff(row,col); } - - Scalar& coeffRef(Index row, Index col) - { - eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && 
col<m_matrix->cols()); - - const Index outer = SparseMatrixType::IsRowMajor ? row : col; - const Index inner = SparseMatrixType::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = m_matrix->data().searchLowerIndex(start,end-1,inner); - eigen_assert((p<end) && (m_matrix->data().index(p)==inner) && "written coefficient does not exist"); - return m_matrix->const_cast_derived().data().value(p); - } - - const SparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const SparseMatrixType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h new file mode 100644 index 000000000..2ca039323 --- /dev/null +++ b/Eigen/src/SparseCore/SparseRef.h @@ -0,0 +1,192 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_REF_H +#define EIGEN_SPARSE_REF_H + +namespace Eigen { + +namespace internal { + +template<typename Derived> class SparseRefBase; + +template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType> +struct traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + enum { + Options = _Options, + Flags = traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit + }; + + template<typename Derived> struct match { + enum { + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch + }; + typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type; + }; + +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType> +struct traits<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > + : public traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > +{ + enum { + Flags = (traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + }; +}; + +template<typename Derived> +struct traits<SparseRefBase<Derived> > : public traits<Derived> {}; + +template<typename Derived> class SparseRefBase + : public SparseMapBase<Derived> +{ +public: + + typedef SparseMapBase<Derived> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase) + + SparseRefBase() + : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) + {} + +protected: + + + template<typename Expression> + void construct(Expression& expr) + { + ::new (static_cast<Base*>(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), 
expr.valuePtr(), expr.innerNonZeroPtr()); + } +}; + +} // namespace internal + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > + : public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef internal::traits<Ref> Traits; + template<int OtherOptions> + inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr); + template<int OtherOptions> + inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr); + public: + + typedef internal::SparseRefBase<Ref> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + + #ifndef EIGEN_PARSED_BY_DOXYGEN + template<int OtherOptions> + inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template<int OtherOptions> + inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template<typename Derived> + inline Ref(const SparseCompressedBase<Derived>& expr) + #else + template<typename Derived> + inline Ref(SparseCompressedBase<Derived>& expr) + #endif + { + EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); + EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.const_cast_derived()); + } +}; + +// this is the const ref version +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType; + typedef internal::traits<Ref> Traits; + public: + + typedef internal::SparseRefBase<Ref> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + template<typename Derived> + inline Ref(const SparseMatrixBase<Derived>& expr) + { + construct(expr.derived(), typename Traits::template match<Derived>::type()); + } + + inline Ref(const Ref& other) : Base(other) { + // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy + } + + template<typename OtherRef> + inline Ref(const RefBase<OtherRef>& other) { + construct(other.derived(), typename Traits::template match<OtherRef>::type()); + } + + protected: + + template<typename Expression> + void construct(const Expression& expr,internal::true_type) + { + Base::construct(expr); + } + + template<typename Expression> + void construct(const Expression& expr, internal::false_type) + { + m_object = expr; + Base::construct(m_object); + } + + protected: + TPlainObjectType m_object; +}; + + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, 
StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_REF_H diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index c74af46b3..84413c374 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -12,13 +12,41 @@ namespace Eigen { +namespace internal { + template<typename MatrixType,int CompressedAccess=int(MatrixType::Flags&CompressedAccessBit)> + class SparseTransposeImpl + : public SparseMatrixBase<Transpose<MatrixType> > + {}; + + template<typename MatrixType> + class SparseTransposeImpl<MatrixType,CompressedAccessBit> + : public SparseCompressedBase<Transpose<MatrixType> > + { + typedef SparseCompressedBase<Transpose<MatrixType> > Base; + public: + using Base::derived; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + + inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); } + inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); } + inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); } + inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); } + + inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); } + inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); } + inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); } + inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); } + }; +} + // Implement nonZeros() for transpose. I'm not sure that's the best approach for that. // Perhaps it should be implemented in Transpose<> itself. 
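// With CompressedAccessBit propagated, the transpose of a compressed sparse
// matrix now exposes the nested expression's raw arrays through the
// SparseTransposeImpl specialization above (the CSR view of a CSC matrix and
// vice versa). A sketch of what this is expected to enable, our illustration
// rather than part of the patch:
//   SparseMatrix<double> A(n,n);   // column-major, compressed
//   Ref<const SparseMatrix<double,RowMajor> > r(A.transpose());
// r should bind directly to A's arrays, with no temporary being created.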
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse> - : public SparseMatrixBase<Transpose<MatrixType> > + : public internal::SparseTransposeImpl<MatrixType> { protected: - typedef SparseMatrixBase<Transpose<MatrixType> > Base; + typedef internal::SparseTransposeImpl<MatrixType> Base; public: inline typename MatrixType::StorageIndex nonZeros() const { return Base::derived().nestedExpression().nonZeros(); } }; diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 5714150c2..ac2b4f10b 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -44,8 +44,7 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) // TODO this is mostly the same as EIGEN_GENERIC_PUBLIC_INTERFACE -#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \ - typedef BaseClass Base; \ +#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \ typedef typename Eigen::internal::nested<Derived >::type Nested; \ @@ -61,7 +60,8 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) using Base::convert_index; #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ - _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >) + typedef Eigen::SparseMatrixBase<Derived > Base; \ + _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) const int CoherentAccessPattern = 0x1; const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern; diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 0c48fef3e..380ba25c0 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -309,7 +309,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >, // Functions void initperfvalues() { - m_perfv.panel_size = 1; + m_perfv.panel_size = 16; m_perfv.relax = 1; m_perfv.maxsuper = 128; m_perfv.rowblk = 16; diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index f00bc3976..3a9b5fd74 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -627,8 +627,12 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> m_sluFerr.resize(rhsCols); m_sluBerr.resize(rhsCols); - m_sluB = SluMatrix::Map(b.const_cast_derived()); - m_sluX = SluMatrix::Map(x.derived()); + + Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b); + Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -651,6 +655,10 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &m_sluFerr[0], &m_sluBerr[0], &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); + + if(&x.coeffRef(0) != x_ref.data()) + x = x_ref; + m_info = info==0 ? 
Success : NumericalIssue; } @@ -938,8 +946,12 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest m_sluFerr.resize(rhsCols); m_sluBerr.resize(rhsCols); - m_sluB = SluMatrix::Map(b.const_cast_derived()); - m_sluX = SluMatrix::Map(x.derived()); + + Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b); + Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -962,6 +974,9 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest &recip_pivot_growth, &rcond, &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); + + if(&x.coeffRef(0) != x_ref.data()) + x = x_ref; m_info = info==0 ? Success : NumericalIssue; } diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index 982aa2fca..47e8b6304 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -403,11 +403,22 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); int errorCode; + Scalar* x_ptr = 0; + Matrix<Scalar,Dynamic,1> x_tmp; + if(x.innerStride()!=1) + { + x_tmp.resize(x.rows()); + x_ptr = x_tmp.data(); + } for (int j=0; j<rhsCols; ++j) { + if(x.innerStride()==1) + x_ptr = &x.col(j).coeffRef(0); errorCode = umfpack_solve(UMFPACK_A, m_outerIndexPtr, m_innerIndexPtr, m_valuePtr, - &x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0); + x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0); + if(x.innerStride()!=1) + x.col(j) = x_tmp; if (errorCode!=0) return false; } diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index b299d9899..9444b450c 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -97,6 +97,7 @@ ENABLE_TESTING() add_subdirectory(libs/eigen3) add_subdirectory(libs/eigen2) +add_subdirectory(libs/tensors) add_subdirectory(libs/BLAS) add_subdirectory(libs/ublas) add_subdirectory(libs/gmm) diff --git a/bench/btl/libs/tensors/CMakeLists.txt b/bench/btl/libs/tensors/CMakeLists.txt new file mode 100644 index 000000000..09d6d8e43 --- /dev/null +++ b/bench/btl/libs/tensors/CMakeLists.txt @@ -0,0 +1,44 @@ + + +if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR) + # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version + set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR}) + set(TENSOR_FOUND TRUE) +else() + find_package(Tensor) +endif() + +if (TENSOR_FOUND) + + include_directories(${TENSOR_INCLUDE_DIR}) + btl_add_bench(btl_tensor_linear main_linear.cpp) + btl_add_bench(btl_tensor_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + + option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF) + if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC) + btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp) + btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp) + + 
btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + endif() + + + if(NOT BTL_NOVEC) + btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF) + btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF) + btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF) + btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + + endif(NOT BTL_NOVEC) + +endif (TENSOR_FOUND) diff --git a/bench/btl/libs/tensors/main_linear.cpp b/bench/btl/libs/tensors/main_linear.cpp new file mode 100644 index 000000000..e257f1e72 --- /dev/null +++ b/bench/btl/libs/tensors/main_linear.cpp @@ -0,0 +1,23 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_axpy<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench<Action_axpby<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_matmat.cpp b/bench/btl/libs/tensors/main_matmat.cpp new file mode 100644 index 000000000..675fcfc6d --- /dev/null +++ b/bench/btl/libs/tensors/main_matmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_matrix_matrix_product<tensor_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_vecmat.cpp b/bench/btl/libs/tensors/main_vecmat.cpp new file mode 100644 index 000000000..1af00c81b --- /dev/null +++ b/bench/btl/libs/tensors/main_vecmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench<Action_matrix_vector_product<tensor_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/tensor_interface.hh b/bench/btl/libs/tensors/tensor_interface.hh new file mode 100644 index 000000000..97b8e0f0b --- /dev/null +++ b/bench/btl/libs/tensors/tensor_interface.hh @@ -0,0 +1,105 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#ifndef TENSOR_INTERFACE_HH +#define TENSOR_INTERFACE_HH + +#include <unsupported/Eigen/CXX11/Tensor> +#include <vector> +#include "btl.hh" + +using namespace Eigen; + +template<class real> +class tensor_interface +{ +public : + typedef real real_type; + typedef typename Eigen::Tensor<real,2>::Index Index; + + typedef std::vector<real> stl_vector; + typedef std::vector<stl_vector> stl_matrix; + + typedef Eigen::Tensor<real,2> gene_matrix; + typedef Eigen::Tensor<real,1> gene_vector; + + + static inline std::string name( void ) + { + return EIGEN_MAKESTRING(BTL_PREFIX); + } + + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} + + static void free_vector(gene_vector & /*B*/) {} + + static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(Eigen::array<Index,2>(A_stl[0].size(), A_stl.size())); + + for (unsigned int j=0; j<A_stl.size() ; j++){ + for (unsigned int i=0; i<A_stl[j].size() ; i++){ + A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i]; + } + } + } + + static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){ + B.resize(B_stl.size()); + + for (unsigned int i=0; i<B_stl.size() ; i++){ + B.coeffRef(i) = B_stl[i]; + } + } + + static BTL_DONT_INLINE void vector_to_stl(gene_vector & B, stl_vector & B_stl){ + for (unsigned int i=0; i<B_stl.size() ; i++){ + B_stl[i] = B.coeff(i); + } + } + + static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){ + int N=A_stl.size(); + + for (int j=0;j<N;j++){ + A_stl[j].resize(N); + for (int i=0;i<N;i++){ + A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j)); + } + } + } + + static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){ + typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ + typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ + Y += X.constant(coef) * X; + } + + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ + Y = X.constant(a)*X + Y.constant(b)*Y; + } + + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ + cible = source; + } + + static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, 
gene_vector & cible, int /*N*/){ + cible = source; + } +}; + +#endif diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h new file mode 100644 index 000000000..525b9acda --- /dev/null +++ b/bench/tensors/tensor_benchmarks.h @@ -0,0 +1,305 @@ +#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ +#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ + +typedef int TensorIndex; +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "testing/base/public/benchmark.h" + +using Eigen::Tensor; +using Eigen::TensorMap; + + +// TODO(bsteiner): also templatize on the input type since we have users +// for int8 as well as floats. +template <typename Device> class BenchmarkSuite { + public: + BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n) + : m_(m), k_(k), n_(n), device_(device) { + initialize(); + } + + BenchmarkSuite(const Device& device, size_t m) + : m_(m), k_(m), n_(m), device_(device) { + initialize(); + } + + ~BenchmarkSuite() { + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + } + + void memcpy(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void random(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = C.random(); + } + // Record the number of random numbers generated per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void slicing(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); + const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0)); + const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2)); + const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0)); + const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } + // Record the number of values copied from the rhs slice to the lhs slice + // each second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void shuffling(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(k_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + const 
Eigen::array<int, 2> shuffle(1, 0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } + // Record the number of values shuffled from A and copied to B each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void padding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_-3); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(k_, m_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings; + paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0); + paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.pad(paddings); + } + // Record the number of values copied from the padded tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void striding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array<TensorIndex, 2> size_a(m_, k_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b); + + const Eigen::array<TensorIndex, 2> strides(1, 2); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.stride(strides); + } + // Record the number of values copied from the strided tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void broadcasting(int num_iters) { + const Eigen::array<TensorIndex, 2> size_a(m_, 1); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a); + const Eigen::array<TensorIndex, 2> size_c(m_, n_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c); + +#if defined(__CUDACC__) + // nvcc doesn't support cxx11 + const Eigen::array<int, 2> broadcast(1, n_); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code.
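+    // In the branch below, Eigen::IndexList encodes the first broadcast factor
+    // as the compile-time constant type2index<1>, and set(1, n_) fills in the
+    // runtime-sized second factor, so the compiler statically knows that
+    // dimension 0 is never replicated.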
+ Eigen::IndexList<Eigen::type2index<1>, int> broadcast; + broadcast.set(1, n_); +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.broadcast(broadcast); + } + // Record the number of values broadcasted from A and copied to C each second + finalizeBenchmark(m_ * n_ * num_iters); + } + + void coeffWiseOp(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7); + } + // Record the number of FLOP executed per second (2 multiplications and + // 1 addition per value) + finalizeBenchmark(3 * m_ * m_ * num_iters); + } + + void algebraicFunc(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + void transcendentalFunc(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array<TensorIndex, 2> sizes(m_, m_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.exp() + B.log(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + // Simple reduction + void reduction(int num_iters) { + const Eigen::array<TensorIndex, 2> input_size(k_, n_); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size); + const Eigen::array<TensorIndex, 1> output_size(n_); + TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size); + + const Eigen::array<TensorIndex, 1> sum_along_dim(0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + // do a contraction which is equivalent to a matrix multiplication + void contraction(int num_iters) { + const Eigen::array<TensorIndex, 2> sizeA(m_, k_); + const Eigen::array<TensorIndex, 2> sizeB(k_, n_); + const Eigen::array<TensorIndex, 2> sizeC(m_, n_); + + const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<float, 2>::DimensionPair DimPair; + const Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + // Record the number of FLOP executed per second (size_ multiplications and + // 
additions for each value in the resulting tensor) + finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters); + } + + void convolution(int num_iters, int kernel_x, int kernel_y) { + const Eigen::array<TensorIndex, 2> input_sizes(m_, n_); + TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes); + const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y); + TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes); + const Eigen::array<TensorIndex, 2> result_sizes( + m_ - kernel_x + 1, n_ - kernel_y + 1); + TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes); + Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.convolve(B, dims); + } + // Record the number of FLOP executed per second (kernel_size + // multiplications and additions for each value in the resulting tensor) + finalizeBenchmark( + (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters); + } + + private: + void initialize() { + a_ = (float *) device_.allocate(m_ * k_ * sizeof(float)); + b_ = (float *) device_.allocate(k_ * n_ * sizeof(float)); + c_ = (float *) device_.allocate(m_ * n_ * sizeof(float)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(float)); + device_.memset(b_, 23, k_ * n_ * sizeof(float)); + device_.memset(c_, 31, m_ * n_ * sizeof(float)); + + BenchmarkUseRealTime(); + } + + inline void finalizeBenchmark(int64 num_items) { +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) { + device_.synchronize(); + } +#endif + StopBenchmarkTiming(); + SetBenchmarkItemsProcessed(num_items); + } + + + size_t m_; + size_t k_; + size_t n_; + float* a_; + float* b_; + float* c_; + Device device_; +}; +#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc new file mode 100644 index 000000000..68653ba15 --- /dev/null +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -0,0 +1,156 @@ +#define EIGEN_USE_THREADS + +#include "base/sysinfo.h" +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" +#include "thread/threadpool.h" + +#ifdef __ANDROID__ +#define CREATE_THREAD_POOL(threads) \ +Eigen::ThreadPoolDevice device(threads); +#else +#define CREATE_THREAD_POOL(threads) \ +ThreadPool tp(threads); \ +tp.StartWorkers(); \ +Eigen::ThreadPoolDevice device(&tp, threads); +#endif + +// Simple functions +#define BM_FuncCPU(FUNC, THREADS) \ + static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ + suite.FUNC(iters); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); + +BM_FuncCPU(memcpy, 4); +BM_FuncCPU(memcpy, 8); +BM_FuncCPU(memcpy, 12); + +BM_FuncCPU(random, 4); +BM_FuncCPU(random, 8); +BM_FuncCPU(random, 12); + +BM_FuncCPU(slicing, 4); +BM_FuncCPU(slicing, 8); +BM_FuncCPU(slicing, 12); + +BM_FuncCPU(shuffling, 4); +BM_FuncCPU(shuffling, 8); +BM_FuncCPU(shuffling, 12); + +BM_FuncCPU(padding, 4); +BM_FuncCPU(padding, 8); +BM_FuncCPU(padding, 12); + +BM_FuncCPU(striding, 4); +BM_FuncCPU(striding, 8); +BM_FuncCPU(striding, 12); + +BM_FuncCPU(broadcasting, 4); +BM_FuncCPU(broadcasting, 8); +BM_FuncCPU(broadcasting, 12); + 
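+// A rough sketch of what one of the registrations above expands to on a
+// non-Android build, e.g. BM_FuncCPU(memcpy, 4) (StrCat and BENCHMARK_RANGE
+// come from the strings and benchmark headers included at the top):
+//
+//   static void BM_memcpy_4T(int iters, int N) {
+//     StopBenchmarkTiming();
+//     ThreadPool tp(4);
+//     tp.StartWorkers();
+//     Eigen::ThreadPoolDevice device(&tp, 4);
+//     BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);
+//     suite.memcpy(iters);
+//     SetBenchmarkLabel(StrCat("using ", 4, " threads"));
+//   }
+//   BENCHMARK_RANGE(BM_memcpy_4T, 10, 5000);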
+BM_FuncCPU(coeffWiseOp, 4); +BM_FuncCPU(coeffWiseOp, 8); +BM_FuncCPU(coeffWiseOp, 12); + +BM_FuncCPU(algebraicFunc, 4); +BM_FuncCPU(algebraicFunc, 8); +BM_FuncCPU(algebraicFunc, 12); + +BM_FuncCPU(transcendentalFunc, 4); +BM_FuncCPU(transcendentalFunc, 8); +BM_FuncCPU(transcendentalFunc, 12); + +BM_FuncCPU(reduction, 4); +BM_FuncCPU(reduction, 8); +BM_FuncCPU(reduction, 12); + + +// Contractions +#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ + static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\ + StopBenchmarkTiming(); \ + if (THREADS == 1) { \ + Eigen::DefaultDevice device; \ + BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } else { \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); + + +BM_FuncWithInputDimsCPU(contraction, N, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); + + +// Convolutions +#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \ + static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); + +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12); + +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12); + +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12); + 
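+// The items-processed count for these convolution benchmarks comes straight
+// from BenchmarkSuite::convolution: (m - kernel_x + 1) * (n - kernel_y + 1) *
+// kernel_x * kernel_y * 2 per iteration. For instance, a 7x64 kernel over an
+// N x N input records (N - 6) * (N - 63) * 7 * 64 * 2 * iters items.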
+BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12); diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc new file mode 100644 index 000000000..adea754ad --- /dev/null +++ b/bench/tensors/tensor_benchmarks_gpu.cc @@ -0,0 +1,75 @@ +#define EIGEN_USE_GPU + +#include <cuda.h> +#include <cuda_runtime.h> +#include <iostream> +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" + + + +// Simple functions +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC, 10, 5000); + +BM_FuncGPU(memcpy); +BM_FuncGPU(random); +BM_FuncGPU(slicing); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); +BM_FuncGPU(broadcasting); +BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(reduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); + + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters, DIM1, DIM2); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index a9bc05137..d0efb4188 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -14,23 +14,18 @@ endif() add_custom_target(blas) -set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp) - -if(EIGEN_Fortran_COMPILER_WORKS) - -set(EigenBlas_SRCS ${EigenBlas_SRCS} - complexdots.f - srotm.f srotmg.f drotm.f drotmg.f - lsame.f dspmv.f ssbmv.f - chbmv.f sspmv.f - zhbmv.f chpmv.f dsbmv.f - zhpmv.f - dtbmv.f stbmv.f ctbmv.f ztbmv.f -) +set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp + f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c + f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c + f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c + f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c + f2c/ztbmv.c f2c/d_cnjg.c f2c/r_cnjg.c + ) + +if (EIGEN_Fortran_COMPILER_WORKS) + set(EigenBlas_SRCS ${EigenBlas_SRCS} fortran/complexdots.f) else() - -message(WARNING " No fortran compiler 
has been detected, the blas build will be incomplete.") - + set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c) endif() add_library(eigen_blas_static ${EigenBlas_SRCS}) diff --git a/blas/chbmv.f b/blas/chbmv.f deleted file mode 100644 index 1b1c330ea..000000000 --- a/blas/chbmv.f +++ /dev/null @@ -1,310 +0,0 @@ - SUBROUTINE CHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - COMPLEX ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - COMPLEX A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* CHBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - COMPLEX . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - COMPLEX array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. 
-* -* BETA - COMPLEX . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - COMPLEX array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ONE - PARAMETER (ONE= (1.0E+0,0.0E+0)) - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,MAX,MIN,REAL -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CHBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. 
(INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*REAL(A(1,J)) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*REAL(A(1,J)) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of CHBMV . -* - END diff --git a/blas/chpmv.f b/blas/chpmv.f deleted file mode 100644 index 158be5a7b..000000000 --- a/blas/chpmv.f +++ /dev/null @@ -1,272 +0,0 @@ - SUBROUTINE CHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - COMPLEX ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - COMPLEX AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* CHPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - COMPLEX . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - COMPLEX array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* X - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. 
-* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ONE - PARAMETER (ONE= (1.0E+0,0.0E+0)) - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,REAL -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CHPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*REAL(AP(KK)) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*REAL(AP(KK)) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of CHPMV . 
-* - END diff --git a/blas/common.h b/blas/common.h index c39cc63a8..5ecb153e2 100644 --- a/blas/common.h +++ b/blas/common.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -13,7 +13,6 @@ #include <Eigen/Core> #include <Eigen/Jacobi> -#include <iostream> #include <complex> #ifndef SCALAR diff --git a/blas/ctbmv.f b/blas/ctbmv.f deleted file mode 100644 index 5a879fa01..000000000 --- a/blas/ctbmv.f +++ /dev/null @@ -1,366 +0,0 @@ - SUBROUTINE CTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - COMPLEX A(LDA,*),X(*) -* .. -* -* Purpose -* ======= -* -* CTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, or x := conjg( A' )*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := conjg( A' )*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - COMPLEX array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. 
The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOCONJ,NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOCONJ = LSAME(TRANS,'T') - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. 
-* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x or x := conjg( A' )*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 110 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J)) - DO 100 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + CONJG(A(L+I,J))*X(I) - 100 CONTINUE - END IF - X(J) = TEMP - 110 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 140 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 120 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 120 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J)) - DO 130 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + CONJG(A(L+I,J))*X(IX) - IX = IX - INCX - 130 CONTINUE - END IF - X(JX) = TEMP - JX = JX - INCX - 140 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 170 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 150 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J)) - DO 160 I = J + 1,MIN(N,J+K) - TEMP = TEMP + CONJG(A(L+I,J))*X(I) - 160 CONTINUE - END IF - X(J) = TEMP - 170 CONTINUE - ELSE - JX = KX - DO 200 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 180 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 180 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J)) - DO 190 I = J + 1,MIN(N,J+K) - TEMP = TEMP + CONJG(A(L+I,J))*X(IX) - IX = IX + INCX - 190 CONTINUE - END IF - X(JX) = TEMP - JX = JX + INCX - 200 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of CTBMV . -* - END diff --git a/blas/drotm.f b/blas/drotm.f deleted file mode 100644 index 63a3b1134..000000000 --- a/blas/drotm.f +++ /dev/null @@ -1,147 +0,0 @@ - SUBROUTINE DROTM(N,DX,INCX,DY,INCY,DPARAM) -* .. Scalar Arguments .. - INTEGER INCX,INCY,N -* .. -* .. Array Arguments .. - DOUBLE PRECISION DPARAM(5),DX(*),DY(*) -* .. -* -* Purpose -* ======= -* -* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX -* -* (DX**T) , WHERE **T INDICATES TRANSPOSE. 
THE ELEMENTS OF DX ARE IN -* (DY**T) -* -* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE -* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. -* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. -* -* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 -* -* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) -* H=( ) ( ) ( ) ( ) -* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). -* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. -* -* Arguments -* ========= -* -* N (input) INTEGER -* number of elements in input vector(s) -* -* DX (input/output) DOUBLE PRECISION array, dimension N -* double precision vector with N elements -* -* INCX (input) INTEGER -* storage spacing between elements of DX -* -* DY (input/output) DOUBLE PRECISION array, dimension N -* double precision vector with N elements -* -* INCY (input) INTEGER -* storage spacing between elements of DY -* -* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 -* DPARAM(1)=DFLAG -* DPARAM(2)=DH11 -* DPARAM(3)=DH21 -* DPARAM(4)=DH12 -* DPARAM(5)=DH22 -* -* ===================================================================== -* -* .. Local Scalars .. - DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,TWO,W,Z,ZERO - INTEGER I,KX,KY,NSTEPS -* .. -* .. Data statements .. - DATA ZERO,TWO/0.D0,2.D0/ -* .. -* - DFLAG = DPARAM(1) - IF (N.LE.0 .OR. (DFLAG+TWO.EQ.ZERO)) GO TO 140 - IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70 -* - NSTEPS = N*INCX - IF (DFLAG) 50,10,30 - 10 CONTINUE - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DO 20 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W + Z*DH12 - DY(I) = W*DH21 + Z - 20 CONTINUE - GO TO 140 - 30 CONTINUE - DH11 = DPARAM(2) - DH22 = DPARAM(5) - DO 40 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W*DH11 + Z - DY(I) = -W + DH22*Z - 40 CONTINUE - GO TO 140 - 50 CONTINUE - DH11 = DPARAM(2) - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DH22 = DPARAM(5) - DO 60 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W*DH11 + Z*DH12 - DY(I) = W*DH21 + Z*DH22 - 60 CONTINUE - GO TO 140 - 70 CONTINUE - KX = 1 - KY = 1 - IF (INCX.LT.0) KX = 1 + (1-N)*INCX - IF (INCY.LT.0) KY = 1 + (1-N)*INCY -* - IF (DFLAG) 120,80,100 - 80 CONTINUE - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DO 90 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W + Z*DH12 - DY(KY) = W*DH21 + Z - KX = KX + INCX - KY = KY + INCY - 90 CONTINUE - GO TO 140 - 100 CONTINUE - DH11 = DPARAM(2) - DH22 = DPARAM(5) - DO 110 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W*DH11 + Z - DY(KY) = -W + DH22*Z - KX = KX + INCX - KY = KY + INCY - 110 CONTINUE - GO TO 140 - 120 CONTINUE - DH11 = DPARAM(2) - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DH22 = DPARAM(5) - DO 130 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W*DH11 + Z*DH12 - DY(KY) = W*DH21 + Z*DH22 - KX = KX + INCX - KY = KY + INCY - 130 CONTINUE - 140 CONTINUE - RETURN - END diff --git a/blas/drotmg.f b/blas/drotmg.f deleted file mode 100644 index 3ae647b08..000000000 --- a/blas/drotmg.f +++ /dev/null @@ -1,206 +0,0 @@ - SUBROUTINE DROTMG(DD1,DD2,DX1,DY1,DPARAM) -* .. Scalar Arguments .. - DOUBLE PRECISION DD1,DD2,DX1,DY1 -* .. -* .. Array Arguments .. - DOUBLE PRECISION DPARAM(5) -* .. -* -* Purpose -* ======= -* -* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS -* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* -* DY2)**T. -* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. 
-* -* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 -* -* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) -* H=( ) ( ) ( ) ( ) -* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). -* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 -* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE -* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) -* -* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE -* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE -* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. -* -* -* Arguments -* ========= -* -* DD1 (input/output) DOUBLE PRECISION -* -* DD2 (input/output) DOUBLE PRECISION -* -* DX1 (input/output) DOUBLE PRECISION -* -* DY1 (input) DOUBLE PRECISION -* -* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 -* DPARAM(1)=DFLAG -* DPARAM(2)=DH11 -* DPARAM(3)=DH21 -* DPARAM(4)=DH12 -* DPARAM(5)=DH22 -* -* ===================================================================== -* -* .. Local Scalars .. - DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,DP1,DP2,DQ1,DQ2,DTEMP, - + DU,GAM,GAMSQ,ONE,RGAMSQ,TWO,ZERO - INTEGER IGO -* .. -* .. Intrinsic Functions .. - INTRINSIC DABS -* .. -* .. Data statements .. -* - DATA ZERO,ONE,TWO/0.D0,1.D0,2.D0/ - DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/ -* .. - - IF (.NOT.DD1.LT.ZERO) GO TO 10 -* GO ZERO-H-D-AND-DX1.. - GO TO 60 - 10 CONTINUE -* CASE-DD1-NONNEGATIVE - DP2 = DD2*DY1 - IF (.NOT.DP2.EQ.ZERO) GO TO 20 - DFLAG = -TWO - GO TO 260 -* REGULAR-CASE.. - 20 CONTINUE - DP1 = DD1*DX1 - DQ2 = DP2*DY1 - DQ1 = DP1*DX1 -* - IF (.NOT.DABS(DQ1).GT.DABS(DQ2)) GO TO 40 - DH21 = -DY1/DX1 - DH12 = DP2/DP1 -* - DU = ONE - DH12*DH21 -* - IF (.NOT.DU.LE.ZERO) GO TO 30 -* GO ZERO-H-D-AND-DX1.. - GO TO 60 - 30 CONTINUE - DFLAG = ZERO - DD1 = DD1/DU - DD2 = DD2/DU - DX1 = DX1*DU -* GO SCALE-CHECK.. - GO TO 100 - 40 CONTINUE - IF (.NOT.DQ2.LT.ZERO) GO TO 50 -* GO ZERO-H-D-AND-DX1.. - GO TO 60 - 50 CONTINUE - DFLAG = ONE - DH11 = DP1/DP2 - DH22 = DX1/DY1 - DU = ONE + DH11*DH22 - DTEMP = DD2/DU - DD2 = DD1/DU - DD1 = DTEMP - DX1 = DY1*DU -* GO SCALE-CHECK - GO TO 100 -* PROCEDURE..ZERO-H-D-AND-DX1.. - 60 CONTINUE - DFLAG = -ONE - DH11 = ZERO - DH12 = ZERO - DH21 = ZERO - DH22 = ZERO -* - DD1 = ZERO - DD2 = ZERO - DX1 = ZERO -* RETURN.. - GO TO 220 -* PROCEDURE..FIX-H.. - 70 CONTINUE - IF (.NOT.DFLAG.GE.ZERO) GO TO 90 -* - IF (.NOT.DFLAG.EQ.ZERO) GO TO 80 - DH11 = ONE - DH22 = ONE - DFLAG = -ONE - GO TO 90 - 80 CONTINUE - DH21 = -ONE - DH12 = ONE - DFLAG = -ONE - 90 CONTINUE - GO TO IGO(120,150,180,210) -* PROCEDURE..SCALE-CHECK - 100 CONTINUE - 110 CONTINUE - IF (.NOT.DD1.LE.RGAMSQ) GO TO 130 - IF (DD1.EQ.ZERO) GO TO 160 - ASSIGN 120 TO IGO -* FIX-H.. - GO TO 70 - 120 CONTINUE - DD1 = DD1*GAM**2 - DX1 = DX1/GAM - DH11 = DH11/GAM - DH12 = DH12/GAM - GO TO 110 - 130 CONTINUE - 140 CONTINUE - IF (.NOT.DD1.GE.GAMSQ) GO TO 160 - ASSIGN 150 TO IGO -* FIX-H.. - GO TO 70 - 150 CONTINUE - DD1 = DD1/GAM**2 - DX1 = DX1*GAM - DH11 = DH11*GAM - DH12 = DH12*GAM - GO TO 140 - 160 CONTINUE - 170 CONTINUE - IF (.NOT.DABS(DD2).LE.RGAMSQ) GO TO 190 - IF (DD2.EQ.ZERO) GO TO 220 - ASSIGN 180 TO IGO -* FIX-H.. - GO TO 70 - 180 CONTINUE - DD2 = DD2*GAM**2 - DH21 = DH21/GAM - DH22 = DH22/GAM - GO TO 170 - 190 CONTINUE - 200 CONTINUE - IF (.NOT.DABS(DD2).GE.GAMSQ) GO TO 220 - ASSIGN 210 TO IGO -* FIX-H.. 
- GO TO 70 - 210 CONTINUE - DD2 = DD2/GAM**2 - DH21 = DH21*GAM - DH22 = DH22*GAM - GO TO 200 - 220 CONTINUE - IF (DFLAG) 250,230,240 - 230 CONTINUE - DPARAM(3) = DH21 - DPARAM(4) = DH12 - GO TO 260 - 240 CONTINUE - DPARAM(2) = DH11 - DPARAM(5) = DH22 - GO TO 260 - 250 CONTINUE - DPARAM(2) = DH11 - DPARAM(3) = DH21 - DPARAM(4) = DH12 - DPARAM(5) = DH22 - 260 CONTINUE - DPARAM(1) = DFLAG - RETURN - END diff --git a/blas/dsbmv.f b/blas/dsbmv.f deleted file mode 100644 index 8c82d1fa1..000000000 --- a/blas/dsbmv.f +++ /dev/null @@ -1,304 +0,0 @@ - SUBROUTINE DSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE PRECISION ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* DSBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - DOUBLE PRECISION. -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). 
-* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - DOUBLE PRECISION. -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - DOUBLE PRECISION array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ONE,ZERO - PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DSBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. 
(INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*A(1,J) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*A(1,J) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of DSBMV . -* - END diff --git a/blas/dspmv.f b/blas/dspmv.f deleted file mode 100644 index f6e121e76..000000000 --- a/blas/dspmv.f +++ /dev/null @@ -1,265 +0,0 @@ - SUBROUTINE DSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE PRECISION ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* DSPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - DOUBLE PRECISION. -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - DOUBLE PRECISION array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - DOUBLE PRECISION. -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. 
-* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ONE,ZERO - PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DSPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*AP(KK) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*AP(KK) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of DSPMV . -* - END diff --git a/blas/dtbmv.f b/blas/dtbmv.f deleted file mode 100644 index a87ffdeae..000000000 --- a/blas/dtbmv.f +++ /dev/null @@ -1,335 +0,0 @@ - SUBROUTINE DTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION A(LDA,*),X(*) -* .. 
-* -* Purpose -* ======= -* -* DTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := A'*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. 
INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER (ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x. 
-* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 100 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - X(J) = TEMP - 100 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 120 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 110 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 110 CONTINUE - X(JX) = TEMP - JX = JX - INCX - 120 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 140 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 130 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 130 CONTINUE - X(J) = TEMP - 140 CONTINUE - ELSE - JX = KX - DO 160 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 150 CONTINUE - X(JX) = TEMP - JX = JX + INCX - 160 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of DTBMV . -* - END diff --git a/blas/f2c/chbmv.c b/blas/f2c/chbmv.c new file mode 100644 index 000000000..f218fe3f5 --- /dev/null +++ b/blas/f2c/chbmv.c @@ -0,0 +1,487 @@ +/* chbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int chbmv_(char *uplo, integer *n, integer *k, complex * + alpha, complex *a, integer *lda, complex *x, integer *incx, complex * + beta, complex *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + real r__1; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + complex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CHBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. 
*/ + +/* A - COMPLEX array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - COMPLEX array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("CHBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && + beta->i == 0.f))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (beta->r != 1.f || beta->i != 0.f) { + if (*incy == 1) { + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0.f, y[i__2].i = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0.f, y[i__2].i = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0.f && alpha->i == 0.f) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. 
*/ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__2 = i__; + q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i = + q__3.r * x[i__2].i + q__3.i * x[i__2].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L50: */ + } + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + r__1 = a[i__3].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i = + alpha->r * x[i__4].i + alpha->i * x[i__4].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__3 = jy; + i__4 = jy; + i__2 = kplus1 + j * a_dim1; + r__1 = a[i__2].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
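Here the diagonal of column j sits in row 1 of the band array, again referenced through its real part only, and the stored sub-diagonal entries a( i, j ) for i = j + 1 .. min( n, j + k ) are used directly for y( i ) and conjugated when accumulated into TEMP2 for y( j ).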
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = j; + q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__3 = j; + i__4 = j; + i__2 = j * a_dim1 + 1; + r__1 = a[i__2].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__2 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L90: */ + } + i__3 = j; + i__4 = j; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = jx; + q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__3 = jy; + i__4 = jy; + i__2 = j * a_dim1 + 1; + r__1 = a[i__2].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L110: */ + } + i__3 = jy; + i__4 = jy; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of CHBMV . */ + +} /* chbmv_ */ + diff --git a/blas/f2c/chpmv.c b/blas/f2c/chpmv.c new file mode 100644 index 000000000..65bab1c7f --- /dev/null +++ b/blas/f2c/chpmv.c @@ -0,0 +1,438 @@ +/* chpmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int chpmv_(char *uplo, integer *n, complex *alpha, complex * + ap, complex *x, integer *incx, complex *beta, complex *y, integer * + incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + real r__1; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + complex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CHPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - COMPLEX array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* X - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. 
On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("CHPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && + beta->i == 0.f))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (beta->r != 1.f || beta->i != 0.f) { + if (*incy == 1) { + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0.f, y[i__2].i = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0.f, y[i__2].i = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0.f && alpha->i == 0.f) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
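Column j of the packed upper triangle occupies AP( KK ) .. AP( KK + J - 1 ), with the diagonal element a( j, j ) stored last at AP( KK + J - 1 ) and referenced through its real part only; after each column, KK advances by j.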
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = i__; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ++k; +/* L50: */ + } + i__2 = j; + i__3 = j; + i__4 = kk + j - 1; + r__1 = ap[i__4].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + i__3 = iy; + i__4 = iy; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = ix; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__2 = jy; + i__3 = jy; + i__4 = kk + j - 1; + r__1 = ap[i__4].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
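In the packed lower triangle the diagonal element a( j, j ) comes first, at AP( KK ), followed by the sub-diagonal entries a( j + 1, j ) .. a( n, j ), which are conjugated when accumulated into TEMP2; KK therefore advances by n - j + 1 per column.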
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__2 = j; + i__3 = j; + i__4 = kk; + r__1 = ap[i__4].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = i__; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ++k; +/* L90: */ + } + i__2 = j; + i__3 = j; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__2 = jy; + i__3 = jy; + i__4 = kk; + r__1 = ap[i__4].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + i__3 = iy; + i__4 = iy; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = ix; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L110: */ + } + i__2 = jy; + i__3 = jy; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of CHPMV . */ + +} /* chpmv_ */ + diff --git a/blas/f2c/complexdots.c b/blas/f2c/complexdots.c new file mode 100644 index 000000000..a856a231c --- /dev/null +++ b/blas/f2c/complexdots.c @@ -0,0 +1,84 @@ +/* This file has been modified to use the standard gfortran calling + convention, rather than the f2c calling convention. + + It does not require -ff2c when compiled with gfortran. +*/ + +/* complexdots.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +complex cdotc_(integer *n, complex *cx, integer + *incx, complex *cy, integer *incy) +{ + complex res; + extern /* Subroutine */ int cdotcw_(integer *, complex *, integer *, + complex *, integer *, complex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + cdotcw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* cdotc_ */ + +complex cdotu_(integer *n, complex *cx, integer + *incx, complex *cy, integer *incy) +{ + complex res; + extern /* Subroutine */ int cdotuw_(integer *, complex *, integer *, + complex *, integer *, complex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + cdotuw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* cdotu_ */ + +doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx, + doublecomplex *cy, integer *incy) +{ + doublecomplex res; + extern /* Subroutine */ int zdotcw_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + zdotcw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* zdotc_ */ + +doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx, + doublecomplex *cy, integer *incy) +{ + doublecomplex res; + extern /* Subroutine */ int zdotuw_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + zdotuw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* zdotu_ */ + diff --git a/blas/f2c/ctbmv.c b/blas/f2c/ctbmv.c new file mode 100644 index 000000000..790fd581f --- /dev/null +++ b/blas/f2c/ctbmv.c @@ -0,0 +1,647 @@ +/* ctbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ctbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, complex *a, integer *lda, complex *x, integer *incx, + ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + complex q__1, q__2, q__3; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + complex temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical noconj, nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. 
*/ + +/* Purpose */ +/* ======= */ + +/* CTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := conjg( A' )*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. */ +/* On entry with UPLO = 'L' or 'l', K specifies the number of */ +/* sub-diagonals of the matrix A. */ +/* K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* A - COMPLEX array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer an upper */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer a lower */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that when DIAG = 'U' or 'u' the elements of the array A */ +/* corresponding to the diagonal elements of the matrix are not */ +/* referenced, but are assumed to be unity. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. 
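Note that the routine references at most k + 1 rows of A, so a tightly packed band array with LDA = k + 1 is sufficient.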
*/ + +/* X - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. On exit, X is overwritten with the */ +/* transformed vector x. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, + "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("CTBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1); + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. 
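The product is formed in place, one column at a time: whenever x( j ) is nonzero it is scattered through the stored part of column j, and x( j ) itself is scaled by the diagonal entry unless DIAG = 'U'; columns with x( j ) = 0 are skipped, and no conjugation is involved in this branch.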
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + if (x[i__2].r != 0.f || x[i__2].i != 0.f) { + i__2 = j; + temp.r = x[i__2].r, temp.i = x[i__2].i; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + q__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + + q__2.i; + x[i__2].r = q__1.r, x[i__2].i = q__1.i; +/* L10: */ + } + if (nounit) { + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[ + i__3].i, q__1.i = x[i__2].r * a[i__3].i + + x[i__2].i * a[i__3].r; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + if (x[i__4].r != 0.f || x[i__4].i != 0.f) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = ix; + i__2 = ix; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + q__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i + + q__2.i; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + ix += *incx; +/* L30: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__2 = kplus1 + j * a_dim1; + q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[ + i__2].i, q__1.i = x[i__4].r * a[i__2].i + + x[i__4].i * a[i__2].r; + x[i__3].r = q__1.r, x[i__3].i = q__1.i; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__1 = j; + if (x[i__1].r != 0.f || x[i__1].i != 0.f) { + i__1 = j; + temp.r = x[i__1].r, temp.i = x[i__1].i; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + i__1 = i__; + i__3 = i__; + i__2 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + q__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + + q__2.i; + x[i__1].r = q__1.r, x[i__1].i = q__1.i; +/* L50: */ + } + if (nounit) { + i__4 = j; + i__1 = j; + i__3 = j * a_dim1 + 1; + q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[ + i__3].i, q__1.i = x[i__1].r * a[i__3].i + + x[i__1].i * a[i__3].r; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__4 = jx; + if (x[i__4].r != 0.f || x[i__4].i != 0.f) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + i__4 = ix; + i__1 = ix; + i__2 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + q__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i + + q__2.i; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + ix -= *incx; +/* L70: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__1 = j * a_dim1 + 1; + q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[ + i__1].i, q__1.i = x[i__4].r * a[i__1].i + + x[i__4].i * a[i__1].r; + x[i__3].r = q__1.r, x[i__3].i = 
q__1.i; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x or x := conjg( A' )*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__3 = j; + temp.r = x[i__3].r, temp.i = x[i__3].i; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + q__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = i__; + q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, q__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L90: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, + q__2.i = q__3.r * x[i__4].i + q__3.i * x[ + i__4].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L100: */ + } + } + i__3 = j; + x[i__3].r = temp.r, x[i__3].i = temp.i; +/* L110: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__3 = jx; + temp.r = x[i__3].r, temp.i = x[i__3].i; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + q__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = ix; + q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, q__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix -= *incx; +/* L120: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, + q__2.i = q__3.r * x[i__4].i + q__3.i * x[ + i__4].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix -= *incx; +/* L130: */ + } + } + i__3 = jx; + x[i__3].r = temp.r, x[i__3].i = temp.i; + jx -= *incx; +/* L140: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = j; + temp.r = x[i__4].r, temp.i = x[i__4].i; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + q__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; 
++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = i__; + q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, q__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L150: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[j * a_dim1 + 1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__1 = i__; + q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, + q__2.i = q__3.r * x[i__1].i + q__3.i * x[ + i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L160: */ + } + } + i__4 = j; + x[i__4].r = temp.r, x[i__4].i = temp.i; +/* L170: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + kx += *incx; + ix = kx; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + q__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = ix; + q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, q__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix += *incx; +/* L180: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[j * a_dim1 + 1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__1 = ix; + q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, + q__2.i = q__3.r * x[i__1].i + q__3.i * x[ + i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix += *incx; +/* L190: */ + } + } + i__4 = jx; + x[i__4].r = temp.r, x[i__4].i = temp.i; + jx += *incx; +/* L200: */ + } + } + } + } + + return 0; + +/* End of CTBMV . */ + +} /* ctbmv_ */ + diff --git a/blas/f2c/d_cnjg.c b/blas/f2c/d_cnjg.c new file mode 100644 index 000000000..623090c6b --- /dev/null +++ b/blas/f2c/d_cnjg.c @@ -0,0 +1,6 @@ +#include "datatypes.h" + +void d_cnjg(doublecomplex *r, doublecomplex *z) { + r->r = z->r; + r->i = -(z->i); +} diff --git a/blas/f2c/datatypes.h b/blas/f2c/datatypes.h new file mode 100644 index 000000000..63232b246 --- /dev/null +++ b/blas/f2c/datatypes.h @@ -0,0 +1,24 @@ +/* This contains a limited subset of the typedefs exposed by f2c + for use by the Eigen BLAS C-only implementation. +*/ + +#ifndef __EIGEN_DATATYPES_H__ +#define __EIGEN_DATATYPES_H__ + +typedef int integer; +typedef unsigned int uinteger; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +typedef int ftnlen; +typedef int logical; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (doublereal)abs(x) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (doublereal)min(a,b) +#define dmax(a,b) (doublereal)max(a,b) + +#endif diff --git a/blas/f2c/drotm.c b/blas/f2c/drotm.c new file mode 100644 index 000000000..17a779b74 --- /dev/null +++ b/blas/f2c/drotm.c @@ -0,0 +1,215 @@ +/* drotm.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int drotm_(integer *n, doublereal *dx, integer *incx, + doublereal *dy, integer *incy, doublereal *dparam) +{ + /* Initialized data */ + + static doublereal zero = 0.; + static doublereal two = 2.; + + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__; + doublereal w, z__; + integer kx, ky; + doublereal dh11, dh12, dh21, dh22, dflag; + integer nsteps; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */ + +/* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */ +/* (DY**T) */ + +/* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */ +/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */ +/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */ + +/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */ +/* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */ + +/* Arguments */ +/* ========= */ + +/* N (input) INTEGER */ +/* number of elements in input vector(s) */ + +/* DX (input/output) DOUBLE PRECISION array, dimension N */ +/* double precision vector with N elements */ + +/* INCX (input) INTEGER */ +/* storage spacing between elements of DX */ + +/* DY (input/output) DOUBLE PRECISION array, dimension N */ +/* double precision vector with N elements */ + +/* INCY (input) INTEGER */ +/* storage spacing between elements of DY */ + +/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */ +/* DPARAM(1)=DFLAG */ +/* DPARAM(2)=DH11 */ +/* DPARAM(3)=DH21 */ +/* DPARAM(4)=DH12 */ +/* DPARAM(5)=DH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --dparam; + --dy; + --dx; + + /* Function Body */ +/* .. */ + + dflag = dparam[1]; + if (*n <= 0 || dflag + two == zero) { + goto L140; + } + if (! (*incx == *incy && *incx > 0)) { + goto L70; + } + + nsteps = *n * *incx; + if (dflag < 0.) { + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w + z__ * dh12; + dy[i__] = w * dh21 + z__; +/* L20: */ + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = *incx; + for (i__ = 1; i__1 < 0 ? 
i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__; + dy[i__] = -w + dh22 * z__; +/* L40: */ + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__ * dh12; + dy[i__] = w * dh21 + z__ * dh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (*incx < 0) { + kx = (1 - *n) * *incx + 1; + } + if (*incy < 0) { + ky = (1 - *n) * *incy + 1; + } + + if (dflag < 0.) { + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w + z__ * dh12; + dy[ky] = w * dh21 + z__; + kx += *incx; + ky += *incy; +/* L90: */ + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__; + dy[ky] = -w + dh22 * z__; + kx += *incx; + ky += *incy; +/* L110: */ + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__ * dh12; + dy[ky] = w * dh21 + z__ * dh22; + kx += *incx; + ky += *incy; +/* L130: */ + } +L140: + return 0; +} /* drotm_ */ + diff --git a/blas/f2c/drotmg.c b/blas/f2c/drotmg.c new file mode 100644 index 000000000..a63eb1083 --- /dev/null +++ b/blas/f2c/drotmg.c @@ -0,0 +1,293 @@ +/* drotmg.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int drotmg_(doublereal *dd1, doublereal *dd2, doublereal * + dx1, doublereal *dy1, doublereal *dparam) +{ + /* Initialized data */ + + static doublereal zero = 0.; + static doublereal one = 1.; + static doublereal two = 2.; + static doublereal gam = 4096.; + static doublereal gamsq = 16777216.; + static doublereal rgamsq = 5.9604645e-8; + + /* Format strings */ + static char fmt_120[] = ""; + static char fmt_150[] = ""; + static char fmt_180[] = ""; + static char fmt_210[] = ""; + + /* System generated locals */ + doublereal d__1; + + /* Local variables */ + doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22; + integer igo; + doublereal dflag, dtemp; + + /* Assigned format variables */ + static char *igo_fmt; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */ +/* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */ +/* DY1)**T. */ +/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */ + +/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */ +/* LOCATIONS 2-5 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */ +/* RESPECTIVELY. 
(VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */ +/* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */ + +/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */ +/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */ +/* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */ + + +/* Arguments */ +/* ========= */ + +/* DD1 (input/output) DOUBLE PRECISION */ + +/* DD2 (input/output) DOUBLE PRECISION */ + +/* DX1 (input/output) DOUBLE PRECISION */ + +/* DY1 (input) DOUBLE PRECISION */ + +/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */ +/* DPARAM(1)=DFLAG */ +/* DPARAM(2)=DH11 */ +/* DPARAM(3)=DH21 */ +/* DPARAM(4)=DH12 */ +/* DPARAM(5)=DH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Data statements .. */ + + /* Parameter adjustments */ + --dparam; + + /* Function Body */ +/* .. */ + if (! (*dd1 < zero)) { + goto L10; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L10: +/* CASE-DD1-NONNEGATIVE */ + dp2 = *dd2 * *dy1; + if (! (dp2 == zero)) { + goto L20; + } + dflag = -two; + goto L260; +/* REGULAR-CASE.. */ +L20: + dp1 = *dd1 * *dx1; + dq2 = dp2 * *dy1; + dq1 = dp1 * *dx1; + + if (! (abs(dq1) > abs(dq2))) { + goto L40; + } + dh21 = -(*dy1) / *dx1; + dh12 = dp2 / dp1; + + du = one - dh12 * dh21; + + if (! (du <= zero)) { + goto L30; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L30: + dflag = zero; + *dd1 /= du; + *dd2 /= du; + *dx1 *= du; +/* GO SCALE-CHECK.. */ + goto L100; +L40: + if (! (dq2 < zero)) { + goto L50; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L50: + dflag = one; + dh11 = dp1 / dp2; + dh22 = *dx1 / *dy1; + du = one + dh11 * dh22; + dtemp = *dd2 / du; + *dd2 = *dd1 / du; + *dd1 = dtemp; + *dx1 = *dy1 * du; +/* GO SCALE-CHECK */ + goto L100; +/* PROCEDURE..ZERO-H-D-AND-DX1.. */ +L60: + dflag = -one; + dh11 = zero; + dh12 = zero; + dh21 = zero; + dh22 = zero; + + *dd1 = zero; + *dd2 = zero; + *dx1 = zero; +/* RETURN.. */ + goto L220; +/* PROCEDURE..FIX-H.. */ +L70: + if (! (dflag >= zero)) { + goto L90; + } + + if (! (dflag == zero)) { + goto L80; + } + dh11 = one; + dh22 = one; + dflag = -one; + goto L90; +L80: + dh21 = -one; + dh12 = one; + dflag = -one; +L90: + switch (igo) { + case 0: goto L120; + case 1: goto L150; + case 2: goto L180; + case 3: goto L210; + } +/* PROCEDURE..SCALE-CHECK */ +L100: +L110: + if (! (*dd1 <= rgamsq)) { + goto L130; + } + if (*dd1 == zero) { + goto L160; + } + igo = 0; + igo_fmt = fmt_120; +/* FIX-H.. */ + goto L70; +L120: +/* Computing 2nd power */ + d__1 = gam; + *dd1 *= d__1 * d__1; + *dx1 /= gam; + dh11 /= gam; + dh12 /= gam; + goto L110; +L130: +L140: + if (! (*dd1 >= gamsq)) { + goto L160; + } + igo = 1; + igo_fmt = fmt_150; +/* FIX-H.. */ + goto L70; +L150: +/* Computing 2nd power */ + d__1 = gam; + *dd1 /= d__1 * d__1; + *dx1 *= gam; + dh11 *= gam; + dh12 *= gam; + goto L140; +L160: +L170: + if (! (abs(*dd2) <= rgamsq)) { + goto L190; + } + if (*dd2 == zero) { + goto L220; + } + igo = 2; + igo_fmt = fmt_180; +/* FIX-H.. */ + goto L70; +L180: +/* Computing 2nd power */ + d__1 = gam; + *dd2 *= d__1 * d__1; + dh21 /= gam; + dh22 /= gam; + goto L170; +L190: +L200: + if (! (abs(*dd2) >= gamsq)) { + goto L220; + } + igo = 3; + igo_fmt = fmt_210; +/* FIX-H.. */ + goto L70; +L210: +/* Computing 2nd power */ + d__1 = gam; + *dd2 /= d__1 * d__1; + dh21 *= gam; + dh22 *= gam; + goto L200; +L220: + if (dflag < 0.) 
{ + goto L250; + } else if (dflag == 0) { + goto L230; + } else { + goto L240; + } +L230: + dparam[3] = dh21; + dparam[4] = dh12; + goto L260; +L240: + dparam[2] = dh11; + dparam[5] = dh22; + goto L260; +L250: + dparam[2] = dh11; + dparam[3] = dh21; + dparam[4] = dh12; + dparam[5] = dh22; +L260: + dparam[1] = dflag; + return 0; +} /* drotmg_ */ + diff --git a/blas/f2c/dsbmv.c b/blas/f2c/dsbmv.c new file mode 100644 index 000000000..c6b4b21d6 --- /dev/null +++ b/blas/f2c/dsbmv.c @@ -0,0 +1,366 @@ +/* dsbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dsbmv_(char *uplo, integer *n, integer *k, doublereal * + alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, + doublereal *beta, doublereal *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + doublereal temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DSBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - DOUBLE PRECISION. */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. 
*/ +/* The following program segment will transfer the upper */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - DOUBLE PRECISION array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - DOUBLE PRECISION. */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - DOUBLE PRECISION array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("DSBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0. && *beta == 1.)) { + return 0; + } + +/* Set up the start points in X and Y. 
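When INCX is negative the vector is traversed backwards, so its first logical element sits at position 1 - ( n - 1 )*INCX rather than at 1; KX and KY below record these start points for X and Y. 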
*/ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (*beta != 1.) { + if (*incy == 1) { + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. */ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L50: */ + } + y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * + temp2; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + y[j] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + y[j] += *alpha * temp2; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + y[jy] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of DSBMV . */ + +} /* dsbmv_ */ + diff --git a/blas/f2c/dspmv.c b/blas/f2c/dspmv.c new file mode 100644 index 000000000..0b4e92d5c --- /dev/null +++ b/blas/f2c/dspmv.c @@ -0,0 +1,316 @@ +/* dspmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dspmv_(char *uplo, integer *n, doublereal *alpha, + doublereal *ap, doublereal *x, integer *incx, doublereal *beta, + doublereal *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + doublereal temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DSPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - DOUBLE PRECISION. */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - DOUBLE PRECISION array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Unchanged on exit. */ + +/* X - DOUBLE PRECISION array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - DOUBLE PRECISION. */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - DOUBLE PRECISION array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. 
*/ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("DSPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0. && *beta == 1.)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (*beta != 1.) { + if (*incy == 1) { + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L50: */ + } + y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
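Here KK always indexes the packed position of the diagonal, AP( KK ) = a( j, j ); column j of the lower triangle occupies AP( KK ) through AP( KK + N - J ), which is why KK advances by N - J + 1 at the bottom of each loop. 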
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + y[j] += temp1 * ap[kk]; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L90: */ + } + y[j] += *alpha * temp2; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + y[jy] += temp1 * ap[kk]; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of DSPMV . */ + +} /* dspmv_ */ + diff --git a/blas/f2c/dtbmv.c b/blas/f2c/dtbmv.c new file mode 100644 index 000000000..fdf73ebb5 --- /dev/null +++ b/blas/f2c/dtbmv.c @@ -0,0 +1,428 @@ +/* dtbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dtbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, doublereal *a, integer *lda, doublereal *x, integer *incx, + ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + doublereal temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := A'*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. 
*/ +/* On entry with UPLO = 'L' or 'l', K specifies the number of */ +/* sub-diagonals of the matrix A. */ +/* K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer an upper */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer a lower */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that when DIAG = 'U' or 'u' the elements of the array A */ +/* corresponding to the diagonal elements of the matrix are not */ +/* referenced, but are assumed to be unity. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - DOUBLE PRECISION array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. On exit, X is overwritten with the */ +/* transformed vector x. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, + "T", (ftnlen)1, (ftnlen)1) && ! 
lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("DTBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[j] != 0.) { + temp = x[j]; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L10: */ + } + if (nounit) { + x[j] *= a[kplus1 + j * a_dim1]; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[jx] != 0.) { + temp = x[jx]; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix += *incx; +/* L30: */ + } + if (nounit) { + x[jx] *= a[kplus1 + j * a_dim1]; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + if (x[j] != 0.) { + temp = x[j]; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L50: */ + } + if (nounit) { + x[j] *= a[j * a_dim1 + 1]; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + if (x[jx] != 0.) { + temp = x[jx]; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix -= *incx; +/* L70: */ + } + if (nounit) { + x[jx] *= a[j * a_dim1 + 1]; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x. 
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + temp = x[j]; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + x[j] = temp; +/* L100: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + temp = x[jx]; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix -= *incx; +/* L110: */ + } + x[jx] = temp; + jx -= *incx; +/* L120: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[j]; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L130: */ + } + x[j] = temp; +/* L140: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[jx]; + kx += *incx; + ix = kx; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; +/* L150: */ + } + x[jx] = temp; + jx += *incx; +/* L160: */ + } + } + } + } + + return 0; + +/* End of DTBMV . */ + +} /* dtbmv_ */ + diff --git a/blas/f2c/lsame.c b/blas/f2c/lsame.c new file mode 100644 index 000000000..46324d916 --- /dev/null +++ b/blas/f2c/lsame.c @@ -0,0 +1,117 @@ +/* lsame.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +logical lsame_(char *ca, char *cb, ftnlen ca_len, ftnlen cb_len) +{ + /* System generated locals */ + logical ret_val; + + /* Local variables */ + integer inta, intb, zcode; + + +/* -- LAPACK auxiliary routine (version 3.1) -- */ +/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ +/* November 2006 */ + +/* .. Scalar Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */ +/* case. */ + +/* Arguments */ +/* ========= */ + +/* CA (input) CHARACTER*1 */ + +/* CB (input) CHARACTER*1 */ +/* CA and CB specify the single characters to be compared. */ + +/* ===================================================================== */ + +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ + +/* Test if the characters are equal */ + + ret_val = *(unsigned char *)ca == *(unsigned char *)cb; + if (ret_val) { + return ret_val; + } + +/* Now test for equivalence if both characters are alphabetic. */ + + zcode = 'Z'; + +/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */ +/* machines, on which ICHAR returns a value with bit 8 set. 
*/ +/* ICHAR('A') on Prime machines returns 193 which is the same as */ +/* ICHAR('A') on an EBCDIC machine. */ + + inta = *(unsigned char *)ca; + intb = *(unsigned char *)cb; + + if (zcode == 90 || zcode == 122) { + +/* ASCII is assumed - ZCODE is the ASCII code of either lower or */ +/* upper case 'Z'. */ + + if (inta >= 97 && inta <= 122) { + inta += -32; + } + if (intb >= 97 && intb <= 122) { + intb += -32; + } + + } else if (zcode == 233 || zcode == 169) { + +/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */ +/* upper case 'Z'. */ + + if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || + (inta >= 162 && inta <= 169)) { + inta += 64; + } + if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || + (intb >= 162 && intb <= 169)) { + intb += 64; + } + + } else if (zcode == 218 || zcode == 250) { + +/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */ +/* plus 128 of either lower or upper case 'Z'. */ + + if (inta >= 225 && inta <= 250) { + inta += -32; + } + if (intb >= 225 && intb <= 250) { + intb += -32; + } + } + ret_val = inta == intb; + +/* RETURN */ + +/* End of LSAME */ + + return ret_val; +} /* lsame_ */ + diff --git a/blas/f2c/r_cnjg.c b/blas/f2c/r_cnjg.c new file mode 100644 index 000000000..c08182f88 --- /dev/null +++ b/blas/f2c/r_cnjg.c @@ -0,0 +1,6 @@ +#include "datatypes.h" + +void r_cnjg(complex *r, complex *z) { + r->r = z->r; + r->i = -(z->i); +} diff --git a/blas/f2c/srotm.c b/blas/f2c/srotm.c new file mode 100644 index 000000000..bd5944a99 --- /dev/null +++ b/blas/f2c/srotm.c @@ -0,0 +1,216 @@ +/* srotm.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int srotm_(integer *n, real *sx, integer *incx, real *sy, + integer *incy, real *sparam) +{ + /* Initialized data */ + + static real zero = 0.f; + static real two = 2.f; + + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__; + real w, z__; + integer kx, ky; + real sh11, sh12, sh21, sh22, sflag; + integer nsteps; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */ + +/* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */ +/* (SY**T) */ + +/* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */ +/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */ +/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */ + +/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */ +/* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. 
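A minimal calling sketch (illustrative only; the data values are made up, and SFLAG = -2.E0 is the identity case from the table above): */
+/*        integer n = 2, inc = 1;                                     */
+/*        real sx[2] = {1.f, 2.f}, sy[2] = {3.f, 4.f};                */
+/*        real sparam[5] = {-2.f, 0.f, 0.f, 0.f, 0.f};                */
+/*        srotm_(&n, sx, &inc, sy, &inc, sparam);                     */
+/* With SFLAG = -2.E0 the call returns immediately, leaving SX and SY unchanged. 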
*/ + + +/* Arguments */ +/* ========= */ + +/* N (input) INTEGER */ +/* number of elements in input vector(s) */ + +/* SX (input/output) REAL array, dimension N */ +/* double precision vector with N elements */ + +/* INCX (input) INTEGER */ +/* storage spacing between elements of SX */ + +/* SY (input/output) REAL array, dimension N */ +/* double precision vector with N elements */ + +/* INCY (input) INTEGER */ +/* storage spacing between elements of SY */ + +/* SPARAM (input/output) REAL array, dimension 5 */ +/* SPARAM(1)=SFLAG */ +/* SPARAM(2)=SH11 */ +/* SPARAM(3)=SH21 */ +/* SPARAM(4)=SH12 */ +/* SPARAM(5)=SH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --sparam; + --sy; + --sx; + + /* Function Body */ +/* .. */ + + sflag = sparam[1]; + if (*n <= 0 || sflag + two == zero) { + goto L140; + } + if (! (*incx == *incy && *incx > 0)) { + goto L70; + } + + nsteps = *n * *incx; + if (sflag < 0.f) { + goto L50; + } else if (sflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + sh12 = sparam[4]; + sh21 = sparam[3]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w + z__ * sh12; + sy[i__] = w * sh21 + z__; +/* L20: */ + } + goto L140; +L30: + sh11 = sparam[2]; + sh22 = sparam[5]; + i__2 = nsteps; + i__1 = *incx; + for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w * sh11 + z__; + sy[i__] = -w + sh22 * z__; +/* L40: */ + } + goto L140; +L50: + sh11 = sparam[2]; + sh12 = sparam[4]; + sh21 = sparam[3]; + sh22 = sparam[5]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w * sh11 + z__ * sh12; + sy[i__] = w * sh21 + z__ * sh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (*incx < 0) { + kx = (1 - *n) * *incx + 1; + } + if (*incy < 0) { + ky = (1 - *n) * *incy + 1; + } + + if (sflag < 0.f) { + goto L120; + } else if (sflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + sh12 = sparam[4]; + sh21 = sparam[3]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w + z__ * sh12; + sy[ky] = w * sh21 + z__; + kx += *incx; + ky += *incy; +/* L90: */ + } + goto L140; +L100: + sh11 = sparam[2]; + sh22 = sparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w * sh11 + z__; + sy[ky] = -w + sh22 * z__; + kx += *incx; + ky += *incy; +/* L110: */ + } + goto L140; +L120: + sh11 = sparam[2]; + sh12 = sparam[4]; + sh21 = sparam[3]; + sh22 = sparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w * sh11 + z__ * sh12; + sy[ky] = w * sh21 + z__ * sh22; + kx += *incx; + ky += *incy; +/* L130: */ + } +L140: + return 0; +} /* srotm_ */ + diff --git a/blas/f2c/srotmg.c b/blas/f2c/srotmg.c new file mode 100644 index 000000000..75f789fe2 --- /dev/null +++ b/blas/f2c/srotmg.c @@ -0,0 +1,295 @@ +/* srotmg.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real + *sparam) +{ + /* Initialized data */ + + static real zero = 0.f; + static real one = 1.f; + static real two = 2.f; + static real gam = 4096.f; + static real gamsq = 16777200.f; + static real rgamsq = 5.96046e-8f; + + /* Format strings */ + static char fmt_120[] = ""; + static char fmt_150[] = ""; + static char fmt_180[] = ""; + static char fmt_210[] = ""; + + /* System generated locals */ + real r__1; + + /* Local variables */ + real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22; + integer igo; + real sflag, stemp; + + /* Assigned format variables */ + static char *igo_fmt; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */ +/* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */ +/* SY1)**T. */ +/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */ + +/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */ +/* LOCATIONS 2-5 OF SPARAM CONTAIN SH11, SH21, SH12, AND SH22 */ +/* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */ +/* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */ + +/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */ +/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */ +/* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */ + + +/* Arguments */ +/* ========= */ + + +/* SD1 (input/output) REAL */ + +/* SD2 (input/output) REAL */ + +/* SX1 (input/output) REAL */ + +/* SY1 (input) REAL */ + + +/* SPARAM (input/output) REAL array, dimension 5 */ +/* SPARAM(1)=SFLAG */ +/* SPARAM(2)=SH11 */ +/* SPARAM(3)=SH21 */ +/* SPARAM(4)=SH12 */ +/* SPARAM(5)=SH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Data statements .. */ + + /* Parameter adjustments */ + --sparam; + + /* Function Body */ +/* .. */ + if (! (*sd1 < zero)) { + goto L10; + } +/* GO ZERO-H-D-AND-SX1.. */ + goto L60; +L10: +/* CASE-SD1-NONNEGATIVE */ + sp2 = *sd2 * *sy1; + if (! (sp2 == zero)) { + goto L20; + } + sflag = -two; + goto L260; +/* REGULAR-CASE.. */ +L20: + sp1 = *sd1 * *sx1; + sq2 = sp2 * *sy1; + sq1 = sp1 * *sx1; + + if (! (dabs(sq1) > dabs(sq2))) { + goto L40; + } + sh21 = -(*sy1) / *sx1; + sh12 = sp2 / sp1; + + su = one - sh12 * sh21; + + if (! (su <= zero)) { + goto L30; + } +/* GO ZERO-H-D-AND-SX1.. */ + goto L60; +L30: + sflag = zero; + *sd1 /= su; + *sd2 /= su; + *sx1 *= su; +/* GO SCALE-CHECK.. */ + goto L100; +L40: + if (! (sq2 < zero)) { + goto L50; + } +/* GO ZERO-H-D-AND-SX1.. 
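(this branch is taken when SQ2 = SD2*SY1**2 is negative, so the update would require a negative scale; H, SD1, SD2 and SX1 are therefore all zeroed at L60) 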
*/ + goto L60; +L50: + sflag = one; + sh11 = sp1 / sp2; + sh22 = *sx1 / *sy1; + su = one + sh11 * sh22; + stemp = *sd2 / su; + *sd2 = *sd1 / su; + *sd1 = stemp; + *sx1 = *sy1 * su; +/* GO SCALE-CHECK */ + goto L100; +/* PROCEDURE..ZERO-H-D-AND-SX1.. */ +L60: + sflag = -one; + sh11 = zero; + sh12 = zero; + sh21 = zero; + sh22 = zero; + + *sd1 = zero; + *sd2 = zero; + *sx1 = zero; +/* RETURN.. */ + goto L220; +/* PROCEDURE..FIX-H.. */ +L70: + if (! (sflag >= zero)) { + goto L90; + } + + if (! (sflag == zero)) { + goto L80; + } + sh11 = one; + sh22 = one; + sflag = -one; + goto L90; +L80: + sh21 = -one; + sh12 = one; + sflag = -one; +L90: + switch (igo) { + case 0: goto L120; + case 1: goto L150; + case 2: goto L180; + case 3: goto L210; + } +/* PROCEDURE..SCALE-CHECK */ +L100: +L110: + if (! (*sd1 <= rgamsq)) { + goto L130; + } + if (*sd1 == zero) { + goto L160; + } + igo = 0; + igo_fmt = fmt_120; +/* FIX-H.. */ + goto L70; +L120: +/* Computing 2nd power */ + r__1 = gam; + *sd1 *= r__1 * r__1; + *sx1 /= gam; + sh11 /= gam; + sh12 /= gam; + goto L110; +L130: +L140: + if (! (*sd1 >= gamsq)) { + goto L160; + } + igo = 1; + igo_fmt = fmt_150; +/* FIX-H.. */ + goto L70; +L150: +/* Computing 2nd power */ + r__1 = gam; + *sd1 /= r__1 * r__1; + *sx1 *= gam; + sh11 *= gam; + sh12 *= gam; + goto L140; +L160: +L170: + if (! (dabs(*sd2) <= rgamsq)) { + goto L190; + } + if (*sd2 == zero) { + goto L220; + } + igo = 2; + igo_fmt = fmt_180; +/* FIX-H.. */ + goto L70; +L180: +/* Computing 2nd power */ + r__1 = gam; + *sd2 *= r__1 * r__1; + sh21 /= gam; + sh22 /= gam; + goto L170; +L190: +L200: + if (! (dabs(*sd2) >= gamsq)) { + goto L220; + } + igo = 3; + igo_fmt = fmt_210; +/* FIX-H.. */ + goto L70; +L210: +/* Computing 2nd power */ + r__1 = gam; + *sd2 /= r__1 * r__1; + sh21 *= gam; + sh22 *= gam; + goto L200; +L220: + if (sflag < 0.f) { + goto L250; + } else if (sflag == 0) { + goto L230; + } else { + goto L240; + } +L230: + sparam[3] = sh21; + sparam[4] = sh12; + goto L260; +L240: + sparam[2] = sh11; + sparam[5] = sh22; + goto L260; +L250: + sparam[2] = sh11; + sparam[3] = sh21; + sparam[4] = sh12; + sparam[5] = sh22; +L260: + sparam[1] = sflag; + return 0; +} /* srotmg_ */ + diff --git a/blas/f2c/ssbmv.c b/blas/f2c/ssbmv.c new file mode 100644 index 000000000..8599325f2 --- /dev/null +++ b/blas/f2c/ssbmv.c @@ -0,0 +1,368 @@ +/* ssbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ssbmv_(char *uplo, integer *n, integer *k, real *alpha, + real *a, integer *lda, real *x, integer *incx, real *beta, real *y, + integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + real temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. 
*/ + +/* Purpose */ +/* ======= */ + +/* SSBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - REAL . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - REAL array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - REAL array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - REAL . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - REAL array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. 
*/ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("SSBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (*beta != 1.f) { + if (*incy == 1) { + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.f) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. */ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L50: */ + } + y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * + temp2; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
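In lower band storage the diagonal a( j, j ) lives in row 1 of the array, i.e. A( 1, j ); with L = 1 - J the element a( i, j ) is then found at A( L + I, J ) = A( I - J + 1, J ). 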
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + y[j] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + y[j] += *alpha * temp2; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + y[jy] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of SSBMV . */ + +} /* ssbmv_ */ + diff --git a/blas/f2c/sspmv.c b/blas/f2c/sspmv.c new file mode 100644 index 000000000..47858ec6c --- /dev/null +++ b/blas/f2c/sspmv.c @@ -0,0 +1,316 @@ +/* sspmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int sspmv_(char *uplo, integer *n, real *alpha, real *ap, + real *x, integer *incx, real *beta, real *y, integer *incy, ftnlen + uplo_len) +{ + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + real temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* SSPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - REAL . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - REAL array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. 
*/ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Unchanged on exit. */ + +/* X - REAL array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - REAL . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - REAL array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("SSPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (*beta != 1.f) { + if (*incy == 1) { + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.f) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
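*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: with the upper triangle packed by columns, */
+/*     a( i, j ), i .le. j, is stored at AP( i + ( j - 1 )*j/2 ). The */
+/*     loops below never evaluate that formula; they keep kk at the */
+/*     start of packed column j, so kk = 1 + ( j - 1 )*j/2, and step k */
+/*     once per row, which is equivalent to: */
+
+/*           K = KK */
+/*           DO 10, I = 1, J */
+/*              AP( K ) holds a( I, J ) */
+/*              K = K + 1 */
+/*    10     CONTINUE */
+/*           KK = KK + J */
+
+/* (end of editor's note)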
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L50: */ + } + y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + y[j] += temp1 * ap[kk]; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L90: */ + } + y[j] += *alpha * temp2; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + y[jy] += temp1 * ap[kk]; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of SSPMV . */ + +} /* sspmv_ */ + diff --git a/blas/f2c/stbmv.c b/blas/f2c/stbmv.c new file mode 100644 index 000000000..fcf9ce336 --- /dev/null +++ b/blas/f2c/stbmv.c @@ -0,0 +1,428 @@ +/* stbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int stbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, real *a, integer *lda, real *x, integer *incx, ftnlen + uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + real temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* STBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. 
*/
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   x := A*x. */
+
+/*              TRANS = 'T' or 't'   x := A'*x. */
+
+/*              TRANS = 'C' or 'c'   x := A'*x. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit */
+/*           triangular as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry with UPLO = 'U' or 'u', K specifies the number of */
+/*           super-diagonals of the matrix A. */
+/*           On entry with UPLO = 'L' or 'l', K specifies the number of */
+/*           sub-diagonals of the matrix A. */
+/*           K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  A      - REAL array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - REAL array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office.
*/ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, + "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("STBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[j] != 0.f) { + temp = x[j]; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L10: */ + } + if (nounit) { + x[j] *= a[kplus1 + j * a_dim1]; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[jx] != 0.f) { + temp = x[jx]; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix += *incx; +/* L30: */ + } + if (nounit) { + x[jx] *= a[kplus1 + j * a_dim1]; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + if (x[j] != 0.f) { + temp = x[j]; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L50: */ + } + if (nounit) { + x[j] *= a[j * a_dim1 + 1]; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + if (x[jx] != 0.f) { + temp = x[jx]; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix -= *incx; +/* L70: */ + } + if (nounit) { + x[jx] *= a[j * a_dim1 + 1]; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x. 
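*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: the product is formed in place, so the */
+/*     traversal order matters. For the upper triangle, the new x( j ) */
+/*     reads only old x( i ) with i .le. j, so j must run from n down */
+/*     to 1 here; the non-transposed loops above ascend for the mirror */
+/*     reason. In band storage each step amounts to: */
+
+/*           TEMP = X( J ) */
+/*           IF( NOUNIT ) TEMP = TEMP*A( K + 1, J ) */
+/*           DO 10, I = J - 1, MAX( 1, J - K ), -1 */
+/*              TEMP = TEMP + A( K + 1 + I - J, J )*X( I ) */
+/*    10     CONTINUE */
+/*           X( J ) = TEMP */
+
+/* (end of editor's note)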
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + temp = x[j]; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + x[j] = temp; +/* L100: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + temp = x[jx]; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix -= *incx; +/* L110: */ + } + x[jx] = temp; + jx -= *incx; +/* L120: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[j]; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L130: */ + } + x[j] = temp; +/* L140: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[jx]; + kx += *incx; + ix = kx; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; +/* L150: */ + } + x[jx] = temp; + jx += *incx; +/* L160: */ + } + } + } + } + + return 0; + +/* End of STBMV . */ + +} /* stbmv_ */ + diff --git a/blas/f2c/zhbmv.c b/blas/f2c/zhbmv.c new file mode 100644 index 000000000..42da13dbb --- /dev/null +++ b/blas/f2c/zhbmv.c @@ -0,0 +1,488 @@ +/* zhbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int zhbmv_(char *uplo, integer *n, integer *k, doublecomplex + *alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer * + incx, doublecomplex *beta, doublecomplex *y, integer *incy, ftnlen + uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + doublereal d__1; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + doublecomplex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZHBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. 
*/ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX*16 . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - COMPLEX*16 array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX*16 . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX*16 array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. 
*/ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("ZHBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && + beta->i == 0.))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (beta->r != 1. || beta->i != 0.) { + if (*incy == 1) { + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0., y[i__2].i = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0., y[i__2].i = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0. && alpha->i == 0.) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. 
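*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: f2c lowers every complex operation into */
+/*     explicit arithmetic on the .r/.i fields, so each cluster of z__ */
+/*     temporaries below is one complex expression. In the notation of */
+/*     the Fortran original the inner loop is simply: */
+
+/*           Y( I )  = Y( I ) + TEMP1*A( L + I, J ) */
+/*           TEMP2   = TEMP2  + DCONJG( A( L + I, J ) )*X( I ) */
+
+/*     with d_cnjg as the f2c spelling of DCONJG. On the diagonal only */
+/*     the real part is read ( d__1 = a[...].r ), since the imaginary */
+/*     parts of the diagonal are assumed to be zero. */
+
+/* (end of editor's note)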
*/ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__2 = i__; + z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i = + z__3.r * x[i__2].i + z__3.i * x[i__2].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L50: */ + } + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + d__1 = a[i__3].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i = + alpha->r * x[i__4].i + alpha->i * x[i__4].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__3 = jy; + i__4 = jy; + i__2 = kplus1 + j * a_dim1; + d__1 = a[i__2].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = j; + z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__3 = j; + i__4 = j; + i__2 = j * a_dim1 + 1; + d__1 = a[i__2].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__2 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L90: */ + } + i__3 = j; + i__4 = j; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = jx; + z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__3 = jy; + i__4 = jy; + i__2 = j * a_dim1 + 1; + d__1 = a[i__2].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L110: */ + } + i__3 = jy; + i__4 = jy; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of ZHBMV . */ + +} /* zhbmv_ */ + diff --git a/blas/f2c/zhpmv.c b/blas/f2c/zhpmv.c new file mode 100644 index 000000000..fbe2f42b3 --- /dev/null +++ b/blas/f2c/zhpmv.c @@ -0,0 +1,438 @@ +/* zhpmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int zhpmv_(char *uplo, integer *n, doublecomplex *alpha, + doublecomplex *ap, doublecomplex *x, integer *incx, doublecomplex * + beta, doublecomplex *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + doublereal d__1; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + doublecomplex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZHPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX*16 . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - COMPLEX*16 array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* X - COMPLEX*16 array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX*16 . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX*16 array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). 
*/ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("ZHPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && + beta->i == 0.))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (beta->r != 1. || beta->i != 0.) { + if (*incy == 1) { + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0., y[i__2].i = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0., y[i__2].i = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0. && alpha->i == 0.) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
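*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: kk marks the start of packed column j, so */
+/*     k = kk, ..., kk + j - 2 walks a( 1, j ) through a( j - 1, j ) and */
+/*     ap[kk + j - 1] is the diagonal a( j, j ). As in ZHBMV, only the */
+/*     real part of the diagonal is used and the conjugated mirror term */
+/*     is accumulated as TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( I ). */
+
+/* (end of editor's note)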
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = i__; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ++k; +/* L50: */ + } + i__2 = j; + i__3 = j; + i__4 = kk + j - 1; + d__1 = ap[i__4].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + i__3 = iy; + i__4 = iy; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = ix; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__2 = jy; + i__3 = jy; + i__4 = kk + j - 1; + d__1 = ap[i__4].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__2 = j; + i__3 = j; + i__4 = kk; + d__1 = ap[i__4].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = i__; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ++k; +/* L90: */ + } + i__2 = j; + i__3 = j; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__2 = jy; + i__3 = jy; + i__4 = kk; + d__1 = ap[i__4].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + i__3 = iy; + i__4 = iy; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = ix; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L110: */ + } + i__2 = jy; + i__3 = jy; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of ZHPMV . */ + +} /* zhpmv_ */ + diff --git a/blas/f2c/ztbmv.c b/blas/f2c/ztbmv.c new file mode 100644 index 000000000..4cdcd7f88 --- /dev/null +++ b/blas/f2c/ztbmv.c @@ -0,0 +1,647 @@ +/* ztbmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ztbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, doublecomplex *a, integer *lda, doublecomplex *x, integer + *incx, ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + doublecomplex z__1, z__2, z__3; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + doublecomplex temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical noconj, nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := conjg( A' )*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. */ +/* On entry with UPLO = 'L' or 'l', K specifies the number of */ +/* sub-diagonals of the matrix A. */ +/* K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. 
*/
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX*16 array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+            ftnlen)1, (ftnlen)1)) {
+        info = 1;
+    } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
+            "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
+            ftnlen)1)) {
+        info = 2;
+    } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
+            "N", (ftnlen)1, (ftnlen)1)) {
+        info = 3;
+    } else if (*n < 0) {
+        info = 4;
+    } else if (*k < 0) {
+        info = 5;
+    } else if (*lda < *k + 1) {
+        info = 7;
+    } else if (*incx == 0) {
+        info = 9;
+    }
+    if (info != 0) {
+        xerbla_("ZTBMV ", &info, (ftnlen)6);
+        return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+        return 0;
+    }
+
+    noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
+
+/*     Set up the start point in X if the increment is not unity. This */
+/*     will be ( N - 1 )*INCX too small for descending loops.
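*/
+
+/*     Editor's note -- an illustrative sketch, not part of the */
+/*     reference BLAS source: when INCX < 0 the vector runs backwards */
+/*     through memory, so logical element i of x sits at */
+/*     KX + ( I - 1 )*INCX with KX = 1 - ( N - 1 )*INCX, while for */
+/*     INCX > 0 the same formula holds with KX = 1. The descending */
+/*     loops further down first add ( N - 1 )*INCX back onto kx so that */
+/*     it points at logical element n before stepping downwards. */
+
+/* (end of editor's note)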
*/ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + if (x[i__2].r != 0. || x[i__2].i != 0.) { + i__2 = j; + temp.r = x[i__2].r, temp.i = x[i__2].i; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + z__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + + z__2.i; + x[i__2].r = z__1.r, x[i__2].i = z__1.i; +/* L10: */ + } + if (nounit) { + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[ + i__3].i, z__1.i = x[i__2].r * a[i__3].i + + x[i__2].i * a[i__3].r; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + if (x[i__4].r != 0. || x[i__4].i != 0.) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = ix; + i__2 = ix; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + z__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + z__1.r = x[i__2].r + z__2.r, z__1.i = x[i__2].i + + z__2.i; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + ix += *incx; +/* L30: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__2 = kplus1 + j * a_dim1; + z__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[ + i__2].i, z__1.i = x[i__4].r * a[i__2].i + + x[i__4].i * a[i__2].r; + x[i__3].r = z__1.r, x[i__3].i = z__1.i; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__1 = j; + if (x[i__1].r != 0. || x[i__1].i != 0.) { + i__1 = j; + temp.r = x[i__1].r, temp.i = x[i__1].i; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + i__1 = i__; + i__3 = i__; + i__2 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + z__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + + z__2.i; + x[i__1].r = z__1.r, x[i__1].i = z__1.i; +/* L50: */ + } + if (nounit) { + i__4 = j; + i__1 = j; + i__3 = j * a_dim1 + 1; + z__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[ + i__3].i, z__1.i = x[i__1].r * a[i__3].i + + x[i__1].i * a[i__3].r; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__4 = jx; + if (x[i__4].r != 0. || x[i__4].i != 0.) 
{ + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + i__4 = ix; + i__1 = ix; + i__2 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + z__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + z__1.r = x[i__1].r + z__2.r, z__1.i = x[i__1].i + + z__2.i; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + ix -= *incx; +/* L70: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__1 = j * a_dim1 + 1; + z__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[ + i__1].i, z__1.i = x[i__4].r * a[i__1].i + + x[i__4].i * a[i__1].r; + x[i__3].r = z__1.r, x[i__3].i = z__1.i; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x or x := conjg( A' )*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__3 = j; + temp.r = x[i__3].r, temp.i = x[i__3].i; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + z__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = i__; + z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, z__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L90: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, + z__2.i = z__3.r * x[i__4].i + z__3.i * x[ + i__4].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L100: */ + } + } + i__3 = j; + x[i__3].r = temp.r, x[i__3].i = temp.i; +/* L110: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__3 = jx; + temp.r = x[i__3].r, temp.i = x[i__3].i; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + z__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = ix; + z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, z__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix -= *incx; +/* L120: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, + z__2.i = 
z__3.r * x[i__4].i + z__3.i * x[ + i__4].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix -= *incx; +/* L130: */ + } + } + i__3 = jx; + x[i__3].r = temp.r, x[i__3].i = temp.i; + jx -= *incx; +/* L140: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = j; + temp.r = x[i__4].r, temp.i = x[i__4].i; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + z__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = i__; + z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, z__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L150: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[j * a_dim1 + 1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__1 = i__; + z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, + z__2.i = z__3.r * x[i__1].i + z__3.i * x[ + i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L160: */ + } + } + i__4 = j; + x[i__4].r = temp.r, x[i__4].i = temp.i; +/* L170: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + kx += *incx; + ix = kx; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + z__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = ix; + z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, z__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix += *incx; +/* L180: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[j * a_dim1 + 1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__1 = ix; + z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, + z__2.i = z__3.r * x[i__1].i + z__3.i * x[ + i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix += *incx; +/* L190: */ + } + } + i__4 = jx; + x[i__4].r = temp.r, x[i__4].i = temp.i; + jx += *incx; +/* L200: */ + } + } + } + } + + return 0; + +/* End of ZTBMV . 
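(a mechanical f2c-style translation of the reference Fortran ZTBMV, whose source blas/ztbmv.f is deleted further below)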
*/ + +} /* ztbmv_ */ + diff --git a/blas/complexdots.f b/blas/fortran/complexdots.f index a7da51d16..a7da51d16 100644 --- a/blas/complexdots.f +++ b/blas/fortran/complexdots.f diff --git a/blas/level2_impl.h b/blas/level2_impl.h index 233c7b753..e604fe611 100644 --- a/blas/level2_impl.h +++ b/blas/level2_impl.h @@ -9,6 +9,20 @@ #include "common.h" +template<typename Index, typename Scalar, int StorageOrder, bool ConjugateLhs, bool ConjugateRhs> +struct general_matrix_vector_product_wrapper +{ + static void run(Index rows, Index cols,const Scalar *lhs, Index lhsStride, const Scalar *rhs, Index rhsIncr, Scalar* res, Index resIncr, Scalar alpha) + { + typedef internal::const_blas_data_mapper<Scalar,Index,StorageOrder> LhsMapper; + typedef internal::const_blas_data_mapper<Scalar,Index,RowMajor> RhsMapper; + + internal::general_matrix_vector_product + <Index,Scalar,LhsMapper,StorageOrder,ConjugateLhs,Scalar,RhsMapper,ConjugateRhs>::run( + rows, cols, LhsMapper(lhs, lhsStride), RhsMapper(rhs, rhsIncr), res, resIncr, alpha); + } +}; + int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc) { typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar); @@ -20,9 +34,9 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca for(int k=0; k<4; ++k) func[k] = 0; - func[NOTR] = (internal::general_matrix_vector_product<int,Scalar,ColMajor,false,Scalar,false>::run); - func[TR ] = (internal::general_matrix_vector_product<int,Scalar,RowMajor,false,Scalar,false>::run); - func[ADJ ] = (internal::general_matrix_vector_product<int,Scalar,RowMajor,Conj, Scalar,false>::run); + func[NOTR] = (general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run); + func[TR ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run); + func[ADJ ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run); init = true; } diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..37a803ced 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal else matrix(c, *m, *n, *ldc) *= beta; } - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,true); + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,1,true); int code = OP(*opa) | (OP(*opb) << 2); func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0); @@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m); + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false); func[code](*m, *n, a, *lda, b, *ldb, blocking); } else { - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n); + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false); func[code](*n, *m, a, *lda, b, *ldb, blocking); } @@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m); + 
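/* note: the two new trailing constructor arguments are presumably the thread count (1) and an l3-blocking flag */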
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false); func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); } else { - internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n); + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false); func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); } return 1; @@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7; - else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 9; + else if(*ldb<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 9; else if(*ldc<std::max(1,*n)) info = 12; if(info) return xerbla_(SCALAR_SUFFIX_UP"HER2K",&info,6); diff --git a/blas/lsame.f b/blas/lsame.f deleted file mode 100644 index f53690268..000000000 --- a/blas/lsame.f +++ /dev/null @@ -1,85 +0,0 @@ - LOGICAL FUNCTION LSAME(CA,CB) -* -* -- LAPACK auxiliary routine (version 3.1) -- -* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. -* November 2006 -* -* .. Scalar Arguments .. - CHARACTER CA,CB -* .. -* -* Purpose -* ======= -* -* LSAME returns .TRUE. if CA is the same letter as CB regardless of -* case. -* -* Arguments -* ========= -* -* CA (input) CHARACTER*1 -* -* CB (input) CHARACTER*1 -* CA and CB specify the single characters to be compared. -* -* ===================================================================== -* -* .. Intrinsic Functions .. - INTRINSIC ICHAR -* .. -* .. Local Scalars .. - INTEGER INTA,INTB,ZCODE -* .. -* -* Test if the characters are equal -* - LSAME = CA .EQ. CB - IF (LSAME) RETURN -* -* Now test for equivalence if both characters are alphabetic. -* - ZCODE = ICHAR('Z') -* -* Use 'Z' rather than 'A' so that ASCII can be detected on Prime -* machines, on which ICHAR returns a value with bit 8 set. -* ICHAR('A') on Prime machines returns 193 which is the same as -* ICHAR('A') on an EBCDIC machine. -* - INTA = ICHAR(CA) - INTB = ICHAR(CB) -* - IF (ZCODE.EQ.90 .OR. ZCODE.EQ.122) THEN -* -* ASCII is assumed - ZCODE is the ASCII code of either lower or -* upper case 'Z'. -* - IF (INTA.GE.97 .AND. INTA.LE.122) INTA = INTA - 32 - IF (INTB.GE.97 .AND. INTB.LE.122) INTB = INTB - 32 -* - ELSE IF (ZCODE.EQ.233 .OR. ZCODE.EQ.169) THEN -* -* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or -* upper case 'Z'. -* - IF (INTA.GE.129 .AND. INTA.LE.137 .OR. - + INTA.GE.145 .AND. INTA.LE.153 .OR. - + INTA.GE.162 .AND. INTA.LE.169) INTA = INTA + 64 - IF (INTB.GE.129 .AND. INTB.LE.137 .OR. - + INTB.GE.145 .AND. INTB.LE.153 .OR. - + INTB.GE.162 .AND. INTB.LE.169) INTB = INTB + 64 -* - ELSE IF (ZCODE.EQ.218 .OR. ZCODE.EQ.250) THEN -* -* ASCII is assumed, on Prime machines - ZCODE is the ASCII code -* plus 128 of either lower or upper case 'Z'. -* - IF (INTA.GE.225 .AND. INTA.LE.250) INTA = INTA - 32 - IF (INTB.GE.225 .AND. INTB.LE.250) INTB = INTB - 32 - END IF - LSAME = INTA .EQ. INTB -* -* RETURN -* -* End of LSAME -* - END diff --git a/blas/srotm.f b/blas/srotm.f deleted file mode 100644 index fc5a59333..000000000 --- a/blas/srotm.f +++ /dev/null @@ -1,148 +0,0 @@ - SUBROUTINE SROTM(N,SX,INCX,SY,INCY,SPARAM) -* .. Scalar Arguments .. - INTEGER INCX,INCY,N -* .. -* .. Array Arguments .. - REAL SPARAM(5),SX(*),SY(*) -* .. 
-* -* Purpose -* ======= -* -* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX -* -* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN -* (DX**T) -* -* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE -* LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. -* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. -* -* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 -* -* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) -* H=( ) ( ) ( ) ( ) -* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). -* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. -* -* -* Arguments -* ========= -* -* N (input) INTEGER -* number of elements in input vector(s) -* -* SX (input/output) REAL array, dimension N -* double precision vector with N elements -* -* INCX (input) INTEGER -* storage spacing between elements of SX -* -* SY (input/output) REAL array, dimension N -* double precision vector with N elements -* -* INCY (input) INTEGER -* storage spacing between elements of SY -* -* SPARAM (input/output) REAL array, dimension 5 -* SPARAM(1)=SFLAG -* SPARAM(2)=SH11 -* SPARAM(3)=SH21 -* SPARAM(4)=SH12 -* SPARAM(5)=SH22 -* -* ===================================================================== -* -* .. Local Scalars .. - REAL SFLAG,SH11,SH12,SH21,SH22,TWO,W,Z,ZERO - INTEGER I,KX,KY,NSTEPS -* .. -* .. Data statements .. - DATA ZERO,TWO/0.E0,2.E0/ -* .. -* - SFLAG = SPARAM(1) - IF (N.LE.0 .OR. (SFLAG+TWO.EQ.ZERO)) GO TO 140 - IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70 -* - NSTEPS = N*INCX - IF (SFLAG) 50,10,30 - 10 CONTINUE - SH12 = SPARAM(4) - SH21 = SPARAM(3) - DO 20 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W + Z*SH12 - SY(I) = W*SH21 + Z - 20 CONTINUE - GO TO 140 - 30 CONTINUE - SH11 = SPARAM(2) - SH22 = SPARAM(5) - DO 40 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W*SH11 + Z - SY(I) = -W + SH22*Z - 40 CONTINUE - GO TO 140 - 50 CONTINUE - SH11 = SPARAM(2) - SH12 = SPARAM(4) - SH21 = SPARAM(3) - SH22 = SPARAM(5) - DO 60 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W*SH11 + Z*SH12 - SY(I) = W*SH21 + Z*SH22 - 60 CONTINUE - GO TO 140 - 70 CONTINUE - KX = 1 - KY = 1 - IF (INCX.LT.0) KX = 1 + (1-N)*INCX - IF (INCY.LT.0) KY = 1 + (1-N)*INCY -* - IF (SFLAG) 120,80,100 - 80 CONTINUE - SH12 = SPARAM(4) - SH21 = SPARAM(3) - DO 90 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W + Z*SH12 - SY(KY) = W*SH21 + Z - KX = KX + INCX - KY = KY + INCY - 90 CONTINUE - GO TO 140 - 100 CONTINUE - SH11 = SPARAM(2) - SH22 = SPARAM(5) - DO 110 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W*SH11 + Z - SY(KY) = -W + SH22*Z - KX = KX + INCX - KY = KY + INCY - 110 CONTINUE - GO TO 140 - 120 CONTINUE - SH11 = SPARAM(2) - SH12 = SPARAM(4) - SH21 = SPARAM(3) - SH22 = SPARAM(5) - DO 130 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W*SH11 + Z*SH12 - SY(KY) = W*SH21 + Z*SH22 - KX = KX + INCX - KY = KY + INCY - 130 CONTINUE - 140 CONTINUE - RETURN - END diff --git a/blas/srotmg.f b/blas/srotmg.f deleted file mode 100644 index 7b3bd4272..000000000 --- a/blas/srotmg.f +++ /dev/null @@ -1,208 +0,0 @@ - SUBROUTINE SROTMG(SD1,SD2,SX1,SY1,SPARAM) -* .. Scalar Arguments .. - REAL SD1,SD2,SX1,SY1 -* .. -* .. Array Arguments .. - REAL SPARAM(5) -* .. -* -* Purpose -* ======= -* -* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS -* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* -* SY2)**T. -* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. 
-* -* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 -* -* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) -* H=( ) ( ) ( ) ( ) -* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). -* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 -* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE -* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) -* -* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE -* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE -* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. -* -* -* Arguments -* ========= -* -* -* SD1 (input/output) REAL -* -* SD2 (input/output) REAL -* -* SX1 (input/output) REAL -* -* SY1 (input) REAL -* -* -* SPARAM (input/output) REAL array, dimension 5 -* SPARAM(1)=SFLAG -* SPARAM(2)=SH11 -* SPARAM(3)=SH21 -* SPARAM(4)=SH12 -* SPARAM(5)=SH22 -* -* ===================================================================== -* -* .. Local Scalars .. - REAL GAM,GAMSQ,ONE,RGAMSQ,SFLAG,SH11,SH12,SH21,SH22,SP1,SP2,SQ1, - + SQ2,STEMP,SU,TWO,ZERO - INTEGER IGO -* .. -* .. Intrinsic Functions .. - INTRINSIC ABS -* .. -* .. Data statements .. -* - DATA ZERO,ONE,TWO/0.E0,1.E0,2.E0/ - DATA GAM,GAMSQ,RGAMSQ/4096.E0,1.67772E7,5.96046E-8/ -* .. - - IF (.NOT.SD1.LT.ZERO) GO TO 10 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 10 CONTINUE -* CASE-SD1-NONNEGATIVE - SP2 = SD2*SY1 - IF (.NOT.SP2.EQ.ZERO) GO TO 20 - SFLAG = -TWO - GO TO 260 -* REGULAR-CASE.. - 20 CONTINUE - SP1 = SD1*SX1 - SQ2 = SP2*SY1 - SQ1 = SP1*SX1 -* - IF (.NOT.ABS(SQ1).GT.ABS(SQ2)) GO TO 40 - SH21 = -SY1/SX1 - SH12 = SP2/SP1 -* - SU = ONE - SH12*SH21 -* - IF (.NOT.SU.LE.ZERO) GO TO 30 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 30 CONTINUE - SFLAG = ZERO - SD1 = SD1/SU - SD2 = SD2/SU - SX1 = SX1*SU -* GO SCALE-CHECK.. - GO TO 100 - 40 CONTINUE - IF (.NOT.SQ2.LT.ZERO) GO TO 50 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 50 CONTINUE - SFLAG = ONE - SH11 = SP1/SP2 - SH22 = SX1/SY1 - SU = ONE + SH11*SH22 - STEMP = SD2/SU - SD2 = SD1/SU - SD1 = STEMP - SX1 = SY1*SU -* GO SCALE-CHECK - GO TO 100 -* PROCEDURE..ZERO-H-D-AND-SX1.. - 60 CONTINUE - SFLAG = -ONE - SH11 = ZERO - SH12 = ZERO - SH21 = ZERO - SH22 = ZERO -* - SD1 = ZERO - SD2 = ZERO - SX1 = ZERO -* RETURN.. - GO TO 220 -* PROCEDURE..FIX-H.. - 70 CONTINUE - IF (.NOT.SFLAG.GE.ZERO) GO TO 90 -* - IF (.NOT.SFLAG.EQ.ZERO) GO TO 80 - SH11 = ONE - SH22 = ONE - SFLAG = -ONE - GO TO 90 - 80 CONTINUE - SH21 = -ONE - SH12 = ONE - SFLAG = -ONE - 90 CONTINUE - GO TO IGO(120,150,180,210) -* PROCEDURE..SCALE-CHECK - 100 CONTINUE - 110 CONTINUE - IF (.NOT.SD1.LE.RGAMSQ) GO TO 130 - IF (SD1.EQ.ZERO) GO TO 160 - ASSIGN 120 TO IGO -* FIX-H.. - GO TO 70 - 120 CONTINUE - SD1 = SD1*GAM**2 - SX1 = SX1/GAM - SH11 = SH11/GAM - SH12 = SH12/GAM - GO TO 110 - 130 CONTINUE - 140 CONTINUE - IF (.NOT.SD1.GE.GAMSQ) GO TO 160 - ASSIGN 150 TO IGO -* FIX-H.. - GO TO 70 - 150 CONTINUE - SD1 = SD1/GAM**2 - SX1 = SX1*GAM - SH11 = SH11*GAM - SH12 = SH12*GAM - GO TO 140 - 160 CONTINUE - 170 CONTINUE - IF (.NOT.ABS(SD2).LE.RGAMSQ) GO TO 190 - IF (SD2.EQ.ZERO) GO TO 220 - ASSIGN 180 TO IGO -* FIX-H.. - GO TO 70 - 180 CONTINUE - SD2 = SD2*GAM**2 - SH21 = SH21/GAM - SH22 = SH22/GAM - GO TO 170 - 190 CONTINUE - 200 CONTINUE - IF (.NOT.ABS(SD2).GE.GAMSQ) GO TO 220 - ASSIGN 210 TO IGO -* FIX-H.. 
- GO TO 70 - 210 CONTINUE - SD2 = SD2/GAM**2 - SH21 = SH21*GAM - SH22 = SH22*GAM - GO TO 200 - 220 CONTINUE - IF (SFLAG) 250,230,240 - 230 CONTINUE - SPARAM(3) = SH21 - SPARAM(4) = SH12 - GO TO 260 - 240 CONTINUE - SPARAM(2) = SH11 - SPARAM(5) = SH22 - GO TO 260 - 250 CONTINUE - SPARAM(2) = SH11 - SPARAM(3) = SH21 - SPARAM(4) = SH12 - SPARAM(5) = SH22 - 260 CONTINUE - SPARAM(1) = SFLAG - RETURN - END diff --git a/blas/ssbmv.f b/blas/ssbmv.f deleted file mode 100644 index 16893a295..000000000 --- a/blas/ssbmv.f +++ /dev/null @@ -1,306 +0,0 @@ - SUBROUTINE SSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - REAL ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - REAL A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* SSBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - REAL . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - REAL array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - REAL array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. 
-* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - REAL . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - REAL array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ONE,ZERO - PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('SSBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. 
(INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*A(1,J) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*A(1,J) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of SSBMV . -* - END diff --git a/blas/sspmv.f b/blas/sspmv.f deleted file mode 100644 index 0b8449824..000000000 --- a/blas/sspmv.f +++ /dev/null @@ -1,265 +0,0 @@ - SUBROUTINE SSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - REAL ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - REAL AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* SSPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - REAL . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - REAL array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Unchanged on exit. -* -* X - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - REAL . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. 
-* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ONE,ZERO - PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('SSPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*AP(KK) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*AP(KK) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of SSPMV . -* - END diff --git a/blas/stbmv.f b/blas/stbmv.f deleted file mode 100644 index c0b8f1136..000000000 --- a/blas/stbmv.f +++ /dev/null @@ -1,335 +0,0 @@ - SUBROUTINE STBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - REAL A(LDA,*),X(*) -* .. 
-* -* Purpose -* ======= -* -* STBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := A'*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - REAL array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. 
-* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ZERO - PARAMETER (ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('STBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x. 
-* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 100 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - X(J) = TEMP - 100 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 120 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 110 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 110 CONTINUE - X(JX) = TEMP - JX = JX - INCX - 120 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 140 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 130 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 130 CONTINUE - X(J) = TEMP - 140 CONTINUE - ELSE - JX = KX - DO 160 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 150 CONTINUE - X(JX) = TEMP - JX = JX + INCX - 160 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of STBMV . -* - END diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp index 0422f79b7..8775b88cd 100644 --- a/blas/xerbla.cpp +++ b/blas/xerbla.cpp @@ -1,5 +1,5 @@ -#include <iostream> +#include <stdio.h> #if (defined __GNUC__) && (!defined __MINGW32__) #define EIGEN_WEAK_LINKING __attribute__ ((weak)) @@ -14,7 +14,7 @@ extern "C" EIGEN_WEAK_LINKING int xerbla_(const char * msg, int *info, int) { - std::cerr << "Eigen BLAS ERROR #" << *info << ": " << msg << "\n"; + printf("Eigen BLAS ERROR #%i: %s\n", *info, msg ); return 0; } diff --git a/blas/zhbmv.f b/blas/zhbmv.f deleted file mode 100644 index bca0da5fc..000000000 --- a/blas/zhbmv.f +++ /dev/null @@ -1,310 +0,0 @@ - SUBROUTINE ZHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE COMPLEX ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* ZHBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - COMPLEX*16 . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - COMPLEX*16 array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. 
-* The following program segment will transfer the upper -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX*16 array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX*16 . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - COMPLEX*16 array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ONE - PARAMETER (ONE= (1.0D+0,0.0D+0)) - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DBLE,DCONJG,MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZHBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. 
In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*DBLE(A(KPLUS1,J)) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*DBLE(A(KPLUS1,J)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*DBLE(A(1,J)) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*DBLE(A(1,J)) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of ZHBMV . -* - END diff --git a/blas/zhpmv.f b/blas/zhpmv.f deleted file mode 100644 index b686108b3..000000000 --- a/blas/zhpmv.f +++ /dev/null @@ -1,272 +0,0 @@ - SUBROUTINE ZHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE COMPLEX ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* ZHPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - COMPLEX*16 . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - COMPLEX*16 array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). 
-* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* X - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX*16 . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ONE - PARAMETER (ONE= (1.0D+0,0.0D+0)) - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DBLE,DCONJG -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZHPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. 
-* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*DBLE(AP(KK+J-1)) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*DBLE(AP(KK+J-1)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*DBLE(AP(KK)) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*DBLE(AP(KK)) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of ZHPMV . -* - END diff --git a/blas/ztbmv.f b/blas/ztbmv.f deleted file mode 100644 index 7c85c1b55..000000000 --- a/blas/ztbmv.f +++ /dev/null @@ -1,366 +0,0 @@ - SUBROUTINE ZTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX A(LDA,*),X(*) -* .. -* -* Purpose -* ======= -* -* ZTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, or x := conjg( A' )*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := conjg( A' )*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - COMPLEX*16 array of DIMENSION ( LDA, n ). 
-* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOCONJ,NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DCONJG,MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOCONJ = LSAME(TRANS,'T') - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. 
This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x or x := conjg( A' )*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 110 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(KPLUS1,J)) - DO 100 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + DCONJG(A(L+I,J))*X(I) - 100 CONTINUE - END IF - X(J) = TEMP - 110 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 140 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 120 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 120 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(KPLUS1,J)) - DO 130 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + DCONJG(A(L+I,J))*X(IX) - IX = IX - INCX - 130 CONTINUE - END IF - X(JX) = TEMP - JX = JX - INCX - 140 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 170 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 150 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(1,J)) - DO 160 I = J + 1,MIN(N,J+K) - TEMP = TEMP + DCONJG(A(L+I,J))*X(I) - 160 CONTINUE - END IF - X(J) = TEMP - 170 CONTINUE - ELSE - JX = KX - DO 200 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 180 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 180 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(1,J)) - DO 190 I = J + 1,MIN(N,J+K) - TEMP = TEMP + DCONJG(A(L+I,J))*X(IX) - IX = IX + INCX - 190 CONTINUE - END IF - X(JX) = TEMP - JX = JX + INCX - 200 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of ZTBMV . 
-*
-      END
diff --git a/cmake/EigenUninstall.cmake b/cmake/EigenUninstall.cmake
new file mode 100644
index 000000000..4dae8c85c
--- /dev/null
+++ b/cmake/EigenUninstall.cmake
@@ -0,0 +1,40 @@
+################ CMake Uninstall Template #######################
+# CMake Template file for uninstallation of files
+# mentioned in 'install_manifest.txt'
+#
+# Used by uninstall target
+#################################################################
+
+set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")
+
+if(EXISTS ${MANIFEST})
+  message(STATUS "============== Uninstalling Eigen ===================")
+
+  file(STRINGS ${MANIFEST} files)
+  foreach(file ${files})
+    if(EXISTS ${file})
+      message(STATUS "Removing file: '${file}'")
+
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E remove ${file}
+        OUTPUT_VARIABLE rm_out
+        RESULT_VARIABLE rm_retval
+      )
+
+      if(NOT "${rm_retval}" STREQUAL 0)
+        message(FATAL_ERROR "Failed to remove file: '${file}'.")
+      endif()
+    else()
+      message(STATUS "File '${file}' does not exist.")
+    endif()
+  endforeach(file)
+
+  message(STATUS "========== Finished Uninstalling Eigen ==============")
+else()
+  message(STATUS "Cannot find install manifest: '${MANIFEST}'")
+  message(STATUS "Probably make install has not been performed")
+  message(STATUS "  or install_manifest.txt has been deleted.")
+endif()
+
+
diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt
index c4d021624..b5bdfa1f4 100644
--- a/doc/AsciiQuickReference.txt
+++ b/doc/AsciiQuickReference.txt
@@ -67,10 +67,10 @@ P.rightCols<cols>()   // P(:, end-cols+1:end)
P.rightCols(cols)     // P(:, end-cols+1:end)
P.topRows<rows>()     // P(1:rows, :)
P.topRows(rows)       // P(1:rows, :)
-P.middleRows<rows>(i) // P(:, i+1:i+rows)
-P.middleRows(i, rows) // P(:, i+1:i+rows)
-P.bottomRows<rows>()  // P(:, end-rows+1:end)
-P.bottomRows(rows)    // P(:, end-rows+1:end)
+P.middleRows<rows>(i) // P(i+1:i+rows, :)
+P.middleRows(i, rows) // P(i+1:i+rows, :)
+P.bottomRows<rows>()  // P(end-rows+1:end, :)
+P.bottomRows(rows)    // P(end-rows+1:end, :)
P.topLeftCorner(rows, cols)     // P(1:rows, 1:cols)
P.topRightCorner(rows, cols)    // P(1:rows, end-cols+1:end)
P.bottomLeftCorner(rows, cols)  // P(end-rows+1:end, 1:cols)
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox
index 4a33d0cc9..d04ac35c5 100644
--- a/doc/SparseQuickReference.dox
+++ b/doc/SparseQuickReference.dox
@@ -71,11 +71,10 @@ i.e either row major or column major. The default is column major. Most arithmet
<td> Constant or Random Insertion</td>
<td>
\code
-sm1.setZero();   // Set the matrix with zero elements
-sm1.setConstant(val); //Replace all the nonzero values with val
+sm1.setZero();
\endcode
</td>
-<td> The matrix sm1 should have been created before ???</td>
+<td>Remove all non-zero coefficients</td>
</tr>
</table>
diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt
index d2fea7bdc..c8795a344 100644
--- a/failtest/CMakeLists.txt
+++ b/failtest/CMakeLists.txt
@@ -41,6 +41,12 @@ ei_add_failtest("ref_5")
ei_add_failtest("swap_1")
ei_add_failtest("swap_2")

+ei_add_failtest("sparse_ref_1")
+ei_add_failtest("sparse_ref_2")
+ei_add_failtest("sparse_ref_3")
+ei_add_failtest("sparse_ref_4")
+ei_add_failtest("sparse_ref_5")
+
if (EIGEN_FAILTEST_FAILURE_COUNT)
  message(FATAL_ERROR "${EIGEN_FAILTEST_FAILURE_COUNT} out of ${EIGEN_FAILTEST_COUNT} failtests FAILED.
" diff --git a/failtest/sparse_ref_1.cpp b/failtest/sparse_ref_1.cpp new file mode 100644 index 000000000..d78d1f9b1 --- /dev/null +++ b/failtest/sparse_ref_1.cpp @@ -0,0 +1,18 @@ +#include "../Eigen/Sparse" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define CV_QUALIFIER const +#else +#define CV_QUALIFIER +#endif + +using namespace Eigen; + +void call_ref(Ref<SparseMatrix<float> > a) { } + +int main() +{ + SparseMatrix<float> a(10,10); + CV_QUALIFIER SparseMatrix<float>& ac(a); + call_ref(ac); +} diff --git a/failtest/sparse_ref_2.cpp b/failtest/sparse_ref_2.cpp new file mode 100644 index 000000000..46c9440c2 --- /dev/null +++ b/failtest/sparse_ref_2.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref<SparseMatrix<float> > a) { } + +int main() +{ + SparseMatrix<float> A(10,10); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(A.row(3)); +#else + call_ref(A.col(3)); +#endif +} diff --git a/failtest/sparse_ref_3.cpp b/failtest/sparse_ref_3.cpp new file mode 100644 index 000000000..a9949b552 --- /dev/null +++ b/failtest/sparse_ref_3.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +void call_ref(Ref<SparseMatrix<float> > a) { } +#else +void call_ref(const Ref<const SparseMatrix<float> > &a) { } +#endif + +int main() +{ + SparseMatrix<float> a(10,10); + call_ref(a+a); +} diff --git a/failtest/sparse_ref_4.cpp b/failtest/sparse_ref_4.cpp new file mode 100644 index 000000000..57bb6a1fc --- /dev/null +++ b/failtest/sparse_ref_4.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref<SparseMatrix<float> > a) {} + +int main() +{ + SparseMatrix<float> A(10,10); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(A.transpose()); +#else + call_ref(A); +#endif +} diff --git a/failtest/sparse_ref_5.cpp b/failtest/sparse_ref_5.cpp new file mode 100644 index 000000000..4478f6f2f --- /dev/null +++ b/failtest/sparse_ref_5.cpp @@ -0,0 +1,16 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref<SparseMatrix<float> > a) { } + +int main() +{ + SparseMatrix<float> a(10,10); + SparseMatrixBase<SparseMatrix<float> > &ac(a); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(ac); +#else + call_ref(ac.derived()); +#endif +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f57d8ce36..168749634 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -228,6 +228,7 @@ ei_add_test(stddeque) ei_add_test(sparse_basic) ei_add_test(sparse_vector) ei_add_test(sparse_product) +ei_add_test(sparse_ref) ei_add_test(sparse_solvers) ei_add_test(sparse_permutations) ei_add_test(simplicial_cholesky) diff --git a/test/adjoint.cpp b/test/adjoint.cpp index ea36f7841..3b2a53c91 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -64,6 +64,7 @@ template<typename MatrixType> void adjoint(const MatrixType& m) typedef typename NumTraits<Scalar>::Real RealScalar; typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType; typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType; + const Index PacketSize = internal::packet_traits<Scalar>::size; Index rows = m.rows(); Index cols = m.cols(); @@ -108,6 +109,17 @@ template<typename MatrixType> void adjoint(const MatrixType& m) VERIFY_IS_APPROX(m3,m1.transpose()); m3.transposeInPlace(); VERIFY_IS_APPROX(m3,m1); + + if(PacketSize<m3.rows() && PacketSize<m3.cols()) + { + m3 = m1; + Index i = internal::random<Index>(0,m3.rows()-PacketSize); + Index j = 
internal::random<Index>(0,m3.cols()-PacketSize); + m3.template block<PacketSize,PacketSize>(i,j).transposeInPlace(); + VERIFY_IS_APPROX( (m3.template block<PacketSize,PacketSize>(i,j)), (m1.template block<PacketSize,PacketSize>(i,j).transpose()) ); + m3.template block<PacketSize,PacketSize>(i,j).transposeInPlace(); + VERIFY_IS_APPROX(m3,m1); + } // check inplace adjoint m3 = m1; @@ -129,9 +141,19 @@ void test_adjoint() CALL_SUBTEST_1( adjoint(Matrix<float, 1, 1>()) ); CALL_SUBTEST_2( adjoint(Matrix3d()) ); CALL_SUBTEST_3( adjoint(Matrix4f()) ); + CALL_SUBTEST_4( adjoint(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( adjoint(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( adjoint(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); + + // Complement for 128 bits vectorization: + CALL_SUBTEST_8( adjoint(Matrix2d()) ); + CALL_SUBTEST_9( adjoint(Matrix<int,4,4>()) ); + + // 256 bits vectorization: + CALL_SUBTEST_10( adjoint(Matrix<float,8,8>()) ); + CALL_SUBTEST_11( adjoint(Matrix<double,4,4>()) ); + CALL_SUBTEST_12( adjoint(Matrix<int,8,8>()) ); } // test a large static matrix only once CALL_SUBTEST_7( adjoint(Matrix<float, 100, 100>()) ); diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 869051b31..019cc4d64 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -12,13 +12,15 @@ template<typename T> void test_conjugate_gradient_T() { - ConjugateGradient<SparseMatrix<T>, Lower> cg_colmajor_lower_diag; - ConjugateGradient<SparseMatrix<T>, Upper> cg_colmajor_upper_diag; + ConjugateGradient<SparseMatrix<T>, Lower > cg_colmajor_lower_diag; + ConjugateGradient<SparseMatrix<T>, Upper > cg_colmajor_upper_diag; + ConjugateGradient<SparseMatrix<T>, Lower|Upper> cg_colmajor_loup_diag; ConjugateGradient<SparseMatrix<T>, Lower, IdentityPreconditioner> cg_colmajor_lower_I; ConjugateGradient<SparseMatrix<T>, Upper, IdentityPreconditioner> cg_colmajor_upper_I; CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_loup_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_I) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I) ); } diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index 149f1db2f..0227ba577 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ -84,6 +84,13 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX(m1 * (rdm1 * s1), (m1 * rdm1) * s1); VERIFY_IS_APPROX(m1 * (s1 * rdm1), (m1 * rdm1) * s1); + + // Diagonal to dense + sq_m1.setRandom(); + sq_m2 = sq_m1; + VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() ); } void test_diagonalmatrices() diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index e189217eb..042dd0329 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -99,10 +99,16 @@ template<typename Scalar, int Mode, int Options> void transformations() Scalar a = internal::random<Scalar>(-Scalar(M_PI), Scalar(M_PI)); Scalar 
s0 = internal::random<Scalar>(), s1 = internal::random<Scalar>(); + + while(v0.norm() < test_precision<Scalar>()) v0 = Vector3::Random(); + while(v1.norm() < test_precision<Scalar>()) v1 = Vector3::Random(); VERIFY_IS_APPROX(v0, AngleAxisx(a, v0.normalized()) * v0); VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(M_PI), v0.unitOrthogonal()) * v0); - VERIFY_IS_APPROX(cos(a)*v0.squaredNorm(), v0.dot(AngleAxisx(a, v0.unitOrthogonal()) * v0)); + if(abs(cos(a)) > test_precision<Scalar>()) + { + VERIFY_IS_APPROX(cos(a)*v0.squaredNorm(), v0.dot(AngleAxisx(a, v0.unitOrthogonal()) * v0)); + } m = AngleAxisx(a, v0.normalized()).toRotationMatrix().adjoint(); VERIFY_IS_APPROX(Matrix3::Identity(), m * AngleAxisx(a, v0.normalized())); VERIFY_IS_APPROX(Matrix3::Identity(), AngleAxisx(a, v0.normalized()) * m); @@ -123,11 +129,18 @@ template<typename Scalar, int Mode, int Options> void transformations() // angle-axis conversion AngleAxisx aa = AngleAxisx(q1); VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1); - VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1); + + if(abs(aa.angle()) > NumTraits<Scalar>::dummy_precision()) + { + VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) ); + } aa.fromRotationMatrix(aa.toRotationMatrix()); VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1); - VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1); + if(abs(aa.angle()) > NumTraits<Scalar>::dummy_precision()) + { + VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) ); + } // AngleAxis VERIFY_IS_APPROX(AngleAxisx(a,v1.normalized()).toRotationMatrix(), @@ -347,7 +360,9 @@ template<typename Scalar, int Mode, int Options> void transformations() // test transform inversion t0.setIdentity(); t0.translate(v0); - t0.linear().setRandom(); + do { + t0.linear().setRandom(); + } while(t0.linear().jacobiSvd().singularValues()(2)<test_precision<Scalar>()); Matrix4 t044 = Matrix4::Zero(); t044(3,3) = 1; t044.block(0,0,t0.matrix().rows(),4) = t0.matrix(); diff --git a/test/main.h b/test/main.h index a1ccd28b5..ab0b9187e 100644 --- a/test/main.h +++ b/test/main.h @@ -47,8 +47,8 @@ // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
-#define min(A,B) please_protect_your_min_with_parentheses -#define max(A,B) please_protect_your_max_with_parentheses +//#define min(A,B) please_protect_your_min_with_parentheses +//#define max(A,B) please_protect_your_max_with_parentheses #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h @@ -237,6 +237,7 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a)) #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) +#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(test_isApprox(a, b)) #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_isApprox(a, b)) #define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_isMuchSmallerThan(a, b)) diff --git a/test/nullary.cpp b/test/nullary.cpp index a19c674e4..8344855df 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -80,7 +80,9 @@ void testVectorType(const VectorType& base) Matrix<Scalar,1,Dynamic> col_vector(size); row_vector.setLinSpaced(size,low,high); col_vector.setLinSpaced(size,low,high); - VERIFY( row_vector.isApprox(col_vector.transpose(), NumTraits<Scalar>::epsilon())); + // when using the extended precision (e.g., FPU) the relative error might exceed 1 bit + // when computing the squared sum in isApprox, thus the 2x factor. + VERIFY( row_vector.isApprox(col_vector.transpose(), Scalar(2)*NumTraits<Scalar>::epsilon())); Matrix<Scalar,Dynamic,1> size_changer(size+50); size_changer.setLinSpaced(size,low,high); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index ee0502f69..49f601907 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -261,6 +261,22 @@ template<typename Scalar> void packetmath() VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose"); } } + + if (internal::packet_traits<Scalar>::HasBlend) { + Packet thenPacket = internal::pload<Packet>(data1); + Packet elsePacket = internal::pload<Packet>(data2); + EIGEN_ALIGN_DEFAULT internal::Selector<PacketSize> selector; + for (int i = 0; i < PacketSize; ++i) { + selector.select[i] = i; + } + + Packet blend = internal::pblend(selector, thenPacket, elsePacket); + EIGEN_ALIGN_DEFAULT Scalar result[size]; + internal::pstore(result, blend); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue)); + } + } } template<typename Scalar> void packetmath_real() diff --git a/test/product_large.cpp b/test/product_large.cpp index 11531aa1d..ffb8b7bf2 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -39,15 +39,16 @@ void test_product_large() // check the functions to setup blocking sizes compile and do not segfault // FIXME check they do what they are supposed to do !! 
std::ptrdiff_t l1 = internal::random<int>(10000,20000);
-    std::ptrdiff_t l2 = internal::random<int>(1000000,2000000);
-    setCpuCacheSizes(l1,l2);
+    std::ptrdiff_t l2 = internal::random<int>(100000,200000);
+    std::ptrdiff_t l3 = internal::random<int>(1000000,2000000);
+    setCpuCacheSizes(l1,l2,l3);
VERIFY(l1==l1CacheSize());
VERIFY(l2==l2CacheSize());
std::ptrdiff_t k1 = internal::random<int>(10,100)*16;
std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
// only makes sure it compiles fine
-    internal::computeProductBlockingSizes<float,float>(k1,m1,n1);
+    internal::computeProductBlockingSizes<float,float>(k1,m1,n1,1);
}
{
diff --git a/test/product_small.cpp b/test/product_small.cpp
index 8b132abb6..091955a0f 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -9,6 +9,7 @@
#define EIGEN_NO_STATIC_ASSERT
#include "product.h"
+#include <Eigen/LU>
// regression test for bug 447
void product1x1()
@@ -46,5 +47,14 @@ void test_product_small()
Vector3f v = Vector3f::Random();
VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v);
}
+
+  {
+    // regression test for pull-request #93
+    Eigen::Matrix<double, 1, 1> A; A.setRandom();
+    Eigen::Matrix<double, 18, 1> B; B.setRandom();
+    Eigen::Matrix<double, 1, 18> C; C.setRandom();
+    VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]);
+    VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C);
+  }
#endif
}
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index a26fd5dcd..8fd759c93 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -408,6 +408,26 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
m.setFromTriplets(triplets.begin(), triplets.end());
VERIFY_IS_APPROX(m, refMat);
}
+
+  // test Map
+  {
+    DenseMatrix refMat2(rows, cols), refMat3(rows, cols);
+    SparseMatrixType m2(rows, cols), m3(rows, cols);
+    initSparse<Scalar>(density, refMat2, m2);
+    initSparse<Scalar>(density, refMat3, m3);
+    {
+      Map<SparseMatrixType> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
+      Map<SparseMatrixType> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+    }
+    {
+      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr());
+      MappedSparseMatrix<Scalar,SparseMatrixType::Options,StorageIndex> mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr());
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+      VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3);
+    }
+  }
// test triangularView
{
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
new file mode 100644
index 000000000..e7380ba21
--- /dev/null
+++ b/test/sparse_ref.cpp
@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +// This unit test cannot be easily written to work with EIGEN_DEFAULT_TO_ROW_MAJOR +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#undef EIGEN_DEFAULT_TO_ROW_MAJOR +#endif + +static long int nb_temporaries; + +inline void on_temporary_creation() { + // here's a great place to set a breakpoint when debugging failures in this test! + nb_temporaries++; +} + +#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); } + +#include "main.h" +#include <Eigen/SparseCore> + +#define VERIFY_EVALUATION_COUNT(XPR,N) {\ + nb_temporaries = 0; \ + XPR; \ + if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \ + VERIFY( (#XPR) && nb_temporaries==N ); \ + } + +template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&) +{ + // verify that ref-to-const don't have LvalueBit + typedef typename internal::add_const<PlainObjectType>::type ConstPlainObjectType; + VERIFY( !(internal::traits<Ref<ConstPlainObjectType> >::Flags & LvalueBit) ); + VERIFY( !(internal::traits<Ref<ConstPlainObjectType, Aligned> >::Flags & LvalueBit) ); + VERIFY( !(Ref<ConstPlainObjectType>::Flags & LvalueBit) ); + VERIFY( !(Ref<ConstPlainObjectType, Aligned>::Flags & LvalueBit) ); +} + +template<typename B> +EIGEN_DONT_INLINE void call_ref_1(Ref<SparseMatrix<float> > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +template<typename B> +EIGEN_DONT_INLINE void call_ref_2(const Ref<const SparseMatrix<float> >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +void call_ref() +{ +// SparseVector<std::complex<float> > ca = VectorXcf::Random(10).sparseView(); +// SparseVector<float> a = VectorXf::Random(10).sparseView(); + SparseMatrix<float> A = MatrixXf::Random(10,10).sparseView(0.5,1); + SparseMatrix<float,RowMajor> B = MatrixXf::Random(10,10).sparseView(0.5,1); + const SparseMatrix<float>& Ac(A); + Block<SparseMatrix<float> > Ab(A,0,1, 3,3); + const Block<SparseMatrix<float> > Abc(A,0,1,3,3); + + + VERIFY_EVALUATION_COUNT( call_ref_1(A, A), 0); +// VERIFY_EVALUATION_COUNT( call_ref_1(Ac, Ac), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(A, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A.transpose(), A.transpose()), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Ac,Ac), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A+A,2*Ac), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B, B), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1); + + Ref<SparseMatrix<float> > Ar(A); + VERIFY_IS_APPROX(Ar+Ar, A+A); + VERIFY_EVALUATION_COUNT( call_ref_1(Ar, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Ar, A), 0); + + Ref<SparseMatrix<float,RowMajor> > Br(B); + VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Br, Br), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Br.transpose(), Br.transpose()), 0); + + Ref<const SparseMatrix<float> > Arc(A); +// VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.col(2), A.col(2)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only) +} + +void test_sparse_ref() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( check_const_correctness(SparseMatrix<float>()) ); + CALL_SUBTEST_1( 
check_const_correctness(SparseMatrix<double,RowMajor>()) ); + CALL_SUBTEST_2( call_ref() ); + } +} diff --git a/test/sparse_solver.h b/test/sparse_solver.h index afda26b93..42b365eaa 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -32,7 +32,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, x = solver.solve(b); if (solver.info() != Success) { - std::cerr << "sparse solver testing: solving failed\n"; + std::cerr << "sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n"; return; } VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!"); @@ -75,7 +75,8 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, xm = solver.solve(bm); if (solver.info() != Success) { - std::cerr << "sparse solver testing: solving failed\n"; + std::cerr << "sparse solver testing: solving with a Map failed\n"; + exit(0); return; } VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!"); @@ -194,7 +195,10 @@ int generate_sparse_spd_problem(Solver& , typename Solver::MatrixType& A, typena dA = dM * dM.adjoint(); halfA.resize(size,size); - halfA.template selfadjointView<Solver::UpLo>().rankUpdate(M); + if(Solver::UpLo==(Lower|Upper)) + halfA = A; + else + halfA.template selfadjointView<Solver::UpLo>().rankUpdate(M); return size; } diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core index 4dc4ab224..292f09564 100644 --- a/unsupported/Eigen/CXX11/Core +++ b/unsupported/Eigen/CXX11/Core @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2013 Christian Seiler <christian@iwakd.de> +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -21,20 +22,26 @@ * module. Note that at this stage, you should not need to include * this module directly. * + * It also provides a limited fallback for compilers that don't support + * CXX11 yet, such as nvcc. + * * \code * #include <Eigen/CXX11/Core> * \endcode */ -#include <array> +#include <vector> +// Emulate the cxx11 functionality that we need if the compiler doesn't support it. +#if __cplusplus <= 199711L +#include "src/Core/util/EmulateCXX11Meta.h" +#else +#include <array> #include "src/Core/util/CXX11Workarounds.h" #include "src/Core/util/CXX11Meta.h" +#endif #include <Eigen/src/Core/util/ReenableStupidWarnings.h> #endif // EIGEN_CXX11_CORE_MODULE -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 049ce5596..34107ae71 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> // Copyright (C) 2013 Christian Seiler <christian@iwakd.de> // // This Source Code Form is subject to the terms of the Mozilla @@ -10,9 +11,10 @@ #ifndef EIGEN_CXX11_TENSOR_MODULE #define EIGEN_CXX11_TENSOR_MODULE -#include <unsupported/Eigen/CXX11/Core> +#include "Eigen/src/Core/util/StaticAssert.h" +#include "unsupported/Eigen/CXX11/Core" -#include <Eigen/src/Core/util/DisableStupidWarnings.h> +#include "Eigen/src/Core/util/DisableStupidWarnings.h" /** \defgroup CXX11_Tensor_Module Tensor Module * @@ -26,14 +28,69 @@ #include <cstddef> #include <cstring> +#include <stdint.h> -#include "src/Tensor/TensorStorage.h" -#include "src/Tensor/Tensor.h" +#if __cplusplus > 199711 +#include <random> +#endif -#include <Eigen/src/Core/util/ReenableStupidWarnings.h> +#ifdef EIGEN_USE_THREADS +#include <future> +#endif -#endif // EIGEN_CXX11_TENSOR_MODULE +#ifdef EIGEN_USE_GPU +#include <cuda_runtime.h> +#if defined(__CUDACC__) +#include <curand_kernel.h> +#endif +#endif + +#include "Eigen/Core" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" +#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include 
"unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" + +#include "Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 0e274b801..3a08628be 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -317,7 +317,7 @@ constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts template<typename Array, int... n> constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>) { - return {{std_array_get<sizeof...(n) - n - 1>(arr)...}}; + return {{array_get<sizeof...(n) - n - 1>(arr)...}}; } template<typename T, std::size_t N> @@ -335,9 +335,9 @@ constexpr inline std::array<T, N> array_reverse(std::array<T, N> arr) // an infinite loop) template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1> struct h_array_reduce { - constexpr static inline auto run(std::array<T, N> arr) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), std_array_get<n>(arr))) + constexpr static inline auto run(std::array<T, N> arr) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr))) { - return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), std_array_get<n>(arr)); + return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr)); } }; @@ -346,7 +346,7 @@ struct h_array_reduce<Reducer, T, N, 0> { constexpr static inline T run(std::array<T, N> arr) { - return std_array_get<0>(arr); + return array_get<0>(arr); } }; @@ -370,12 +370,20 @@ constexpr inline auto array_prod(std::array<T, N> arr) -> decltype(array_reduce< return array_reduce<product_op, T, N>(arr); } +template<typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + /* zip an array */ template<typename Op, typename A, typename B, std::size_t N, int... n> constexpr inline std::array<decltype(Op::run(A(), B())),N> h_array_zip(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) { - return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(std_array_get<n>(a), std_array_get<n>(b))... }}; + return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }}; } template<typename Op, typename A, typename B, std::size_t N> @@ -387,9 +395,9 @@ constexpr inline std::array<decltype(Op::run(A(), B())),N> array_zip(std::array< /* zip an array and reduce the result */ template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... 
n> -constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(std_array_get<n>(a), std_array_get<n>(b))...)) +constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...)) { - return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(std_array_get<n>(a), std_array_get<n>(b))...); + return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...); } template<typename Reducer, typename Op, typename A, typename B, std::size_t N> @@ -403,7 +411,7 @@ constexpr inline auto array_zip_and_reduce(std::array<A, N> a, std::array<B, N> template<typename Op, typename A, std::size_t N, int... n> constexpr inline std::array<decltype(Op::run(A())),N> h_array_apply(std::array<A, N> a, numeric_list<int, n...>) { - return std::array<decltype(Op::run(A())),N>{{ Op::run(std_array_get<n>(a))... }}; + return std::array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }}; } template<typename Op, typename A, std::size_t N> @@ -415,9 +423,9 @@ constexpr inline std::array<decltype(Op::run(A())),N> array_apply(std::array<A, /* apply stuff to an array and reduce */ template<typename Reducer, typename Op, typename A, std::size_t N, int... n> -constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(std_array_get<n>(arr))...)) +constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...)) { - return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(std_array_get<n>(arr))...); + return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...); } template<typename Reducer, typename Op, typename A, std::size_t N> @@ -497,7 +505,3 @@ InstType instantiate_by_c_array(ArrType* arr) } // end namespace Eigen #endif // EIGEN_CXX11META_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index d71a67590..a590cf4e1 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -17,9 +17,6 @@ #error Intel Compiler only supports required C++ features since version 13.1. // note that most stuff in principle works with 13.0 but when combining // some features, at some point 13.0 will just fail with an internal assertion -#elif defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 1)) -// note that it _should_ work with 3.1 but it was only tested with 3.2 -#error Clang C++ Compiler (clang++) only supports required C++ features since version 3.1. 
#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) // G++ < 4.6 by default will continue processing the source files - even if we use #error to make // it error out. For this reason, we use the pragma to make sure G++ aborts at the first error @@ -42,32 +39,46 @@ namespace Eigen { +// Use std::array as Eigen array +template <typename T, std::size_t N> using array = std::array<T, N>; + namespace internal { /* std::get is only constexpr in C++14, not yet in C++11 * - libstdc++ from version 4.7 onwards has it nevertheless, * so use that * - libstdc++ older versions: use _M_instance directly - * - libc++ from version 3.4 onwards has it IF compiled with - * -std=c++1y - * - libc++ older versions or -std=c++11: use __elems_ directly + * - libc++ all versions so far: use __elems_ directly * - all other libs: use std::get to be portable, but * this may not be constexpr */ #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 #define STD_GET_ARR_HACK a._M_instance[I] -#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_STD_VER) || _LIBCPP_STD_VER <= 11) +#elif defined(_LIBCPP_VERSION) #define STD_GET_ARR_HACK a.__elems_[I] #else #define STD_GET_ARR_HACK std::template get<I, T, N>(a) #endif -template<std::size_t I, class T, std::size_t N> constexpr inline T& std_array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; } -template<std::size_t I, class T, std::size_t N> constexpr inline T&& std_array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; } -template<std::size_t I, class T, std::size_t N> constexpr inline T const& std_array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; } +template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; } + +template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; } +template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; } +template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; } #undef STD_GET_ARR_HACK +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const std::array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<std::array<T,N> > { + static const size_t value = N; +}; + /* Suppose you have a template of the form * template<typename T> struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h new file mode 100644 index 000000000..494f95690 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -0,0 +1,435 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_EMULATE_CXX11_META_H +#define EIGEN_EMULATE_CXX11_META_H + + + +namespace Eigen { + +// The array class is only available starting with cxx11. Emulate our own here +// if needed +template <typename T, size_t n> class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + + static const std::size_t size = n; + + T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + explicit EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list<T> l) { + eigen_assert(l.size() == n); + std::copy(l.begin(), l.end(), values); + } +#endif +}; + + +namespace internal { + +/** \internal + * \file CXX11/Core/util/EmulateCXX11Meta.h + * This file emulates a subset of the functionality provided by CXXMeta.h for + * compilers that don't yet support cxx11 such as nvcc. 
+ */ + +struct empty_list { static const std::size_t count = 0; }; + +template<typename T, typename Tail=empty_list> struct type_list { + typedef T HeadType; + typedef Tail TailType; + static const T head; + static const Tail tail; + static const std::size_t count = 1 + Tail::count; +}; + +struct null_type { }; + +template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type, + typename T4 = null_type, typename T5 = null_type, typename T6 = null_type, + typename T7 = null_type, typename T8 = null_type> +struct make_type_list { + typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult; + + typedef type_list<T1, tailresult> type; +}; + +template<> struct make_type_list<> { + typedef empty_list type; +}; + + +template <std::size_t index, class TList> struct get_type; + +template <class Head, class Tail> +struct get_type<0, type_list<Head, Tail> > +{ + typedef Head type; +}; + +template <std::size_t i, class Head, class Tail> +struct get_type<i, type_list<Head, Tail> > +{ + typedef typename get_type<i-1, Tail>::type type; +}; + + +/* numeric list */ +template <typename T, T n> +struct type2val { + typedef T type; + static const T value = n; +}; + + +template<typename T, size_t n, T V> struct gen_numeric_list_repeated; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> { + typedef typename make_type_list<type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V> >::type type; +}; + +template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> { + typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V>, type2val<T, V>, + type2val<T, V>, type2val<T, V> >::type type; +}; + + +template <std::size_t index, class NList> struct get; + +template <std::size_t i> +struct get<i, empty_list> +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template <std::size_t i, class Head> +struct get<i, type_list<Head, empty_list> > +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template <class Head> +struct get<0, type_list<Head, empty_list> > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template <class Head, class Tail> +struct get<0, type_list<Head, Tail> > +{ + typedef typename Head::type type; + 
static const type value = Head::value; +}; + +template <std::size_t i, class Head, class Tail> +struct get<i, type_list<Head, Tail> > +{ + typedef typename Tail::HeadType::type type; + static const type value = get<i-1, Tail>::value; +}; + + +template <class NList> struct arg_prod { + static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value; +}; +template <> struct arg_prod<empty_list> { + static const int value = 1; +}; + + +template<int n, typename t> +array<t, n> repeat(t v) { + array<t, n> array; + array.fill(v); + return array; +} + +template<std::size_t I, class Head, class Tail> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) { + return get<I, type_list<Head, Tail> >::value; +} +template<std::size_t I, class Head, class Tail> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) { + return get<I, type_list<Head, Tail> >::value; +} + +template <class NList> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { + return arg_prod<NList>::value; +}; + +template<std::size_t n, typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) { + t prod = 1; + for (size_t i = 0; i < n; ++i) { prod *= a[i]; } + return prod; +} +template<typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) { + return 0; +} + +template<typename t> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + +template<std::size_t I, class T, std::size_t N> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) { + return a[I]; +} +template<std::size_t I, class T, std::size_t N> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) { + return a[I]; +} + +template<std::size_t I, class T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) { + return a[I]; +} +template<std::size_t I, class T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) { + return a[I]; +} + +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<array<T,N>& > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const array<T,N>& > { + static const size_t value = N; +}; + +struct sum_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a + b; } +}; +struct product_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a * b; } +}; + +struct logical_and_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a && b; } +}; +struct logical_or_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a || b; } +}; + +struct equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a == b; } +}; +struct not_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a != b; } +}; +struct lesser_op { + 
template<typename A, typename B> static inline bool run(A a, B b) { return a < b; } +}; +struct lesser_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; } +}; + +struct greater_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a > b; } +}; +struct greater_equal_op { + template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; } +}; + +struct not_op { + template<typename A> static inline bool run(A a) { return !a; } +}; +struct negation_op { + template<typename A> static inline bool run(A a) { return -a; } +}; +struct greater_equal_zero_op { + template<typename A> static inline bool run(A a) { return a >= 0; } +}; + + +template<typename Reducer, typename Op, typename A, std::size_t N> +struct ArrayApplyAndReduce { + static inline bool run(const array<A, N>& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; + } +}; + +template<typename Reducer, typename Op, typename A> +struct ArrayApplyAndReduce<Reducer, Op, A, 1> { + static inline bool run(const array<A, 1>& a) { + return Op::run(a[0]); + } +}; + +template<typename Reducer, typename Op, typename A, std::size_t N> +inline bool array_apply_and_reduce(const array<A, N>& a) { + return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a); +} + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +struct ArrayZipAndReduce { + static inline bool run(const array<A, N>& a, const array<B, N>& b) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i], b[i])); + } + return result; + } +}; + +template<typename Reducer, typename Op, typename A, typename B> +struct ArrayZipAndReduce<Reducer, Op, A, B, 1> { + static inline bool run(const array<A, 1>& a, const array<B, 1>& b) { + return Op::run(a[0], b[0]); + } +}; + +template<typename Reducer, typename Op, typename A, typename B, std::size_t N> +inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) { + return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b); +} + +} // end namespace internal + +} // end namespace Eigen + + + +#endif // EIGEN_EMULATE_CXX11_META_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md new file mode 100644 index 000000000..6a4d52cc3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -0,0 +1,1446 @@ +# Eigen Tensors + +Tensors are multidimensional arrays of elements. Elements are typically scalars, +but more complex types such as strings are also supported. + +[TOC] + +## Tensor Classes + +You can manipulate a tensor with one of the following classes. They all are in +the namespace ```::Eigen.``` + + +### Class Tensor<data_type, rank> + +This is the class to use to create a tensor and allocate memory for it. The +class is templatized with the tensor datatype, such as float or int, and the +tensor rank. The rank is the number of dimensions, for example rank 2 is a +matrix. + +Tensors of this class are resizable. For example, if you assign a tensor of a +different size to a Tensor, that tensor is resized to match its new value. + +#### Constructor Tensor<data_type, rank>(size0, size1, ...) + +Constructor for a Tensor. 
The constructor must be passed ```rank``` integers
+indicating the sizes of the instance along each of the ```rank```
+dimensions.
+
+    // Create a tensor of rank 3 of sizes 2, 3, 4.  This tensor owns
+    // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+    Tensor<float, 3> t_3d(2, 3, 4);
+
+    // Resize t_3d by assigning a tensor of different sizes, but same rank.
+    t_3d = Tensor<float, 3>(3, 4, 3);
+
+#### Constructor Tensor<data_type, rank>(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicit list of parameters.  The array type to use is
+```Eigen::array<Eigen::Index>```.  The array can be constructed automatically
+from an initializer list.
+
+    // Create a tensor of strings of rank 2 with sizes 5, 7.
+    Tensor<string, 2> t_2d({5, 7});
+
+
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
+
+Class to use for tensors of fixed size, where the size is known at compile
+time.  Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler.  FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough, the
+tensor data is held on the stack and does not cause heap allocation and free.
+
+    // Create a 4 x 3 tensor of floats.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+
+### Class TensorMap<Tensor<data_type, rank>>
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code.  It lets you view any piece of allocated
+memory as a Tensor.  Instances of this class do not own the memory where the
+data are stored.
+
+A TensorMap is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
+
+Constructor for a TensorMap.  The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes.  The storage has to be
+large enough to hold all the data.
+
+    // Map a tensor of ints on top of stack-allocated storage.
+    int storage[128];  // 2 x 4 x 2 x 8 = 128
+    TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
+
+    // The same storage can be viewed as a different tensor.
+    // You can also pass the sizes as an array.
+    TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
+
+    // You can also map fixed-size tensors.  Here we get a 1d view of
+    // the 2d fixed-size tensor.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+    TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
+
+
+### Class TensorRef
+
+See Assigning to a TensorRef below.
+
+## Accessing Tensor Elements
+
+#### <data_type> tensor(index0, index1...)
+
+Return the element at position ```(index0, index1...)``` in tensor
+```tensor```.  You must pass as many parameters as the rank of ```tensor```.
+The expression can be used as an l-value to set the value of the element at the
+specified position.  The value returned is of the datatype of the tensor.
+
+    // Set the value of the element at position (0, 1, 0).
+    Tensor<float, 3> t_3d(2, 3, 4);
+    t_3d(0, 1, 0) = 12.0f;
+
+    // Initialize all elements to random values.
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 4; ++k) {
+          t_3d(i, j, k) = ...some random value...;
+        }
+      }
+    }
+
+    // Print elements of a tensor.
+    for (int i = 0; i < 2; ++i) {
+      LOG(INFO) << t_3d(i, 0, 0);
+    }
+
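+Putting the pieces above together, here is a minimal end-to-end sketch:
+create a tensor, write one coefficient, and read it back. The include and
+```using``` lines are one possible way to pull in the module; everything else
+follows the constructors and accessors described above.
+
+    #include <unsupported/Eigen/CXX11/Tensor>
+    using Eigen::Tensor;
+
+    // A rank-2 tensor with 3 x 4 = 12 coefficients, all set to 0.
+    Tensor<float, 2> t(3, 4);
+    t.setConstant(0.0f);
+
+    // operator() is an l-value: write a coefficient, then read it back.
+    t(1, 2) = 7.0f;
+    float x = t(1, 2);  // x == 7.0f
+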
Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. If not +specified explicitly column major is assumed. + + Tensor<float, 3, ColMajor> col_major; // equivalent to Tensor<float, 3> + TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +```swap_layout()``` method. Note that this will also reverse the order of the +dimensions. + + Tensor<float, 2, ColMajor> col_major(2, 4); + Tensor<float, 2, RowMajor> row_major(2, 4); + + Tensor<float, 2> col_major_result = col_major; // ok, layouts match + Tensor<float, 2> col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array<int, 2> shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor<float, 3> t1(2, 3, 4); + ...set some values in t1... + Tensor<float, 3> t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor<float, 3> t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression ```t1 + t2``` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors +```t1``` and ```t2```. This is a small C++ object that knows how to add +```t1``` and ```t2```. It is only when the value of the expression is assigned +to the tensor ```t3``` that the addition is actually performed. Technically, +this happens through the overloading of ```operator=()``` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. + +Of course, the tensor operators do nest, and the expression ```t1 + t2 * +0.3f``` is actually represented with the (approximate) tree of operators: + + TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f)) + + +### Tensor Operations and C++ "auto" + +Because Tensor operations create tensor operators, the C++ ```auto``` keyword +does not have its intuitive meaning. Consider these 2 lines of code: + + Tensor<float, 3> t3 = t1 + t2; + auto t4 = t1 + t2; + +In the first line we allocate the tensor ```t3``` and it will contain the +result of the addition of ```t1``` and ```t2```. In the second line, ```t4``` +is actually the tree of tensor operators that will compute the addition of +```t1``` and ```t2```. 
In fact, ```t4``` is *not* a tensor and you cannot get
+the values of its elements:
+
+    Tensor<float, 3> t3 = t1 + t2;
+    cout << t3(0, 0, 0);  // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
+
+    auto t4 = t1 + t2;
+    cout << t4(0, 0, 0);  // Compilation error!
+
+When you use ```auto``` you do not get a Tensor as a result but instead a
+non-evaluated expression. So only use ```auto``` to delay evaluation.
+
+Unfortunately, there is no single underlying concrete type for holding
+non-evaluated expressions, hence you have to use auto in the case when you do
+want to hold non-evaluated expressions.
+
+When you need the results of a set of tensor computations you have to assign the
+result to a Tensor that will be capable of holding onto them. This can be
+either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing
+piece of memory. All the following will work:
+
+    auto t4 = t1 + t2;
+
+    Tensor<float, 3> result = t4;  // Could also be: result(t4);
+    cout << result(0, 0, 0);
+
+    TensorMap<Tensor<float, 4>> result(<a float* with enough space>, <size0>, ...) = t4;
+    cout << result(0, 0, 0);
+
+    TensorFixedSize<float, Sizes<size0, ...>> result = t4;
+    cout << result(0, 0, 0);
+
+Until you need the results, you can keep the operation around, and even reuse
+it for additional operations. As long as you keep the expression as an
+operation, no computation is performed.
+
+    // One way to compute exp((t1 + t2) * 0.2f);
+    auto t3 = t1 + t2;
+    auto t4 = t3 * 0.2f;
+    auto t5 = t4.exp();
+    Tensor<float, 3> result = t5;
+
+    // Another way, exactly as efficient as the previous one:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+### Controlling When Expressions Are Evaluated
+
+There are several ways to control when expressions are evaluated:
+* Assignment to a Tensor, TensorFixedSize, or TensorMap.
+* Use of the eval() method.
+* Assignment to a TensorRef.
+
+#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
+
+The most common way to evaluate an expression is to assign it to a Tensor. In
+the example below, the ```auto``` declarations make the intermediate values
+"Operations", not Tensors, and do not cause the expressions to be evaluated.
+The assignment to the Tensor ```result``` causes the evaluation of all the
+operations.
+
+    auto t3 = t1 + t2;             // t3 is an Operation.
+    auto t4 = t3 * 0.2f;           // t4 is an Operation.
+    auto t5 = t4.exp();            // t5 is an Operation.
+    Tensor<float, 3> result = t5;  // The operations are evaluated.
+
+If you know the ranks and sizes of the Operation value you can assign the
+Operation to a TensorFixedSize instead of a Tensor, which is a bit more
+efficient.
+
+    // We know that the result is a 4x4x2 tensor!
+    TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
+
+Similarly, assigning an expression to a TensorMap causes its evaluation. Like
+tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
+have the rank and sizes of the expression that is assigned to them.
+
+#### Calling eval().
+
+When you compute large composite expressions, you sometimes want to tell Eigen
+that an intermediate value in the expression tree is worth evaluating ahead of
+time. This is done by inserting a call to the ```eval()``` method of the
+expression Operation.
+
+    // The previous example could have been written:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+    // If you want to compute (t1 + t2) once ahead of time you can write:
+    Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
+
+Semantically, calling ```eval()``` is equivalent to materializing the value of
+the expression in a temporary Tensor of the right size. The code above in
+effect does:
+
+    // .eval() knows the size!
+    TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
+    Tensor<float, 3> result = (tmp * 0.2f).exp();
+
+Note that the return value of ```eval()``` is itself an Operation, so the
+following code does not do what you may think:
+
+    // Here t3 is an evaluation Operation. t3 has not been evaluated yet.
+    auto t3 = (t1 + t2).eval();
+
+    // You can use t3 in another expression. Still no evaluation.
+    auto t4 = (t3 * 0.2f).exp();
+
+    // The value is evaluated when you assign the Operation to a Tensor, using
+    // an intermediate tensor to represent t3.
+    Tensor<float, 3> result = t4;
+
+While in the examples above calling ```eval()``` does not make a difference in
+performance, in other cases it can make a huge difference. In the expression
+below the ```broadcast()``` expression causes the ```X.maximum()``` expression
+to be evaluated many times:
+
+    Tensor<...> X ...;
+    Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
+                     * beta).exp();
+
+Inserting a call to ```eval()``` between the ```maximum()``` and
+```reshape()``` calls guarantees that maximum() is only computed once and
+greatly speeds up execution:
+
+    Tensor<...> Y =
+      ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
+       * beta).exp();
+
+In the other example below, the tensor ```Y``` is used both in the expression
+and in its assignment. This is an aliasing problem and if the evaluation is not
+done in the right order Y will be updated incrementally during the evaluation
+resulting in bogus results:
+
+    Tensor<...> Y ...;
+    Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
+
+Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()```
+expressions ensures that the sum is computed before any updates to ```Y``` are
+done.
+
+    Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+Note that an eval around the full right hand side expression is not needed
+because the generated code has to compute the i-th value of the right hand side
+before assigning it to the left hand side.
+
+However, if you were assigning the expression value to a shuffle of ```Y```
+then you would need to force an eval for correctness by adding an ```eval()```
+call for the right hand side:
+
+    Y.shuffle(...) =
+      (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
+
+
+#### Assigning to a TensorRef.
+
+If you need to access only a few elements from the value of an expression you
+can avoid materializing the value in a full tensor by using a TensorRef.
+
+A TensorRef is a small wrapper class for any Eigen Operation. It provides
+overloads for the ```()``` operator that let you access individual values in
+the expression. TensorRef is convenient, because the Operations themselves do
+not provide a way to access individual elements.
+
+    // Create a TensorRef for the expression. The expression is not
+    // evaluated yet.
+    TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp();
+
+    // Use "ref" to access individual elements. The expression is evaluated
+    // on the fly.
+    float at_0 = ref(0, 0, 0);
+    cout << ref(0, 1, 0);
+
+Only use TensorRef when you need a subset of the values of the expression.
+TensorRef only computes the values you access. However note that if you are
+going to access all the values it will be much faster to materialize the
+results in a Tensor first.
+
+In some cases, if the full Tensor result would be very large, you may save
+memory by accessing it as a TensorRef. But not always. So don't count on it.
+
+
+### Controlling How Expressions Are Evaluated
+
+The tensor library provides several implementations of the various operations
+such as contractions and convolutions. The implementations are optimized for
+different environments: single threaded on CPU, multi-threaded on CPU, or on a
+GPU using CUDA. Additional implementations may be added later.
+
+You can choose which implementation to use with the ```device()``` call. If
+you do not choose an implementation explicitly the default implementation that
+uses a single thread on the CPU is used.
+
+The default implementation has been optimized for recent Intel CPUs, taking
+advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the
+library on ARM CPUs. Note that you need to pass compiler-dependent flags
+to enable the use of SSE, AVX, and other instructions.
+
+For example, the following code adds two tensors using the default
+single-threaded CPU implementation:
+
+    Tensor<float, 2> a(30, 40);
+    Tensor<float, 2> b(30, 40);
+    Tensor<float, 2> c = a + b;
+
+To choose a different implementation you have to insert a ```device()``` call
+before the assignment of the result. For technical C++ reasons this requires
+that the Tensor for the result be declared on its own. This means that you
+have to know the size of the result.
+
+    Eigen::Tensor<float, 2> c(30, 40);
+    c.device(...) = a + b;
+
+The call to ```device()``` must be the last call on the left of the operator=.
+
+You must pass to the ```device()``` call an Eigen device object. There are
+presently three devices you can use: DefaultDevice, ThreadPoolDevice and
+GpuDevice.
+
+
+#### Evaluating With the DefaultDevice
+
+This is exactly the same as not inserting a ```device()``` call.
+
+    DefaultDevice my_device;
+    c.device(my_device) = a + b;
+
+#### Evaluating with a Thread Pool
+
+    // Create the Eigen ThreadPoolDevice.
+    Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */);
+
+    // Now just use the device when evaluating expressions.
+    Eigen::Tensor<float, 2> c(30, 50);
+    c.device(my_device) = a.contract(b, dot_product_dims);
+
+
+#### Evaluating On GPU
+
+This is presently a bit more complicated than just using a thread pool device.
+You need to create a GPU device but you also need to explicitly allocate the
+memory for tensors with CUDA.
+
+
+## API Reference
+
+### Datatypes
+
+In the documentation of the tensor methods and Operations we mention datatypes
+that are tensor-type specific:
+
+#### <Tensor-Type>::Dimensions
+
+Acts like an array of ints. Has an ```int size``` attribute, and can be
+indexed like an array to access individual values. Used to represent the
+dimensions of a tensor. See ```dimensions()```.
+
+#### <Tensor-Type>::Index
+
+Acts like an ```int```. Used for indexing tensors along their dimensions. See
+```operator()```, ```dimension()```, and ```size()```.
+
+#### <Tensor-Type>::Scalar
+
+Represents the datatype of individual tensor elements. For example, for a
+```Tensor<float>```, ```Scalar``` is the type ```float```. See
+```setConstant()```.
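+
+As a quick illustration, here is a minimal sketch (not part of the original
+reference, and assuming only the ```Tensor``` class and the three datatypes
+described above) that uses them together:
+
+    Eigen::Tensor<float, 2> t(3, 4);
+    // Dimensions acts like an array of ints.
+    const Eigen::Tensor<float, 2>::Dimensions& d = t.dimensions();
+    // Index acts like an int.
+    Eigen::Tensor<float, 2>::Index n = t.size();  // 12
+    // Scalar is the element type, here float.
+    Eigen::Tensor<float, 2>::Scalar first = t(0, 0);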
+ +#### <Operation> + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. + +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. They are not +Operations which provide delayed evaluation of their results. Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### int NumDimensions + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". + + Eigen::Tensor<float, 2> a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### Dimensions dimensions() + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the dimensions() result is <Tensor-Type>::Dimensions. + + Eigen::Tensor<float, 2> a(3, 4); + const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use ```auto``` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### Index dimension(Index n) + +Returns the n-th dimension of the tensor. The actual type of the +```dimension()``` result is ```<Tensor-Type>::Index```, but you can +always use it like an int. + + Eigen::Tensor<float, 2> a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### Index size() + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the ```size()``` result is +```<Tensor-Type>::Index```, but you can always use it like an int. + + Eigen::Tensor<float, 2> a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide ```dimensions()``` directly, +e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides ```dimensions()``` and ```dimension()``` as +above. + +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors and Copies + +TODO. + + Tensor(...) + TensorFixedSize(...) + TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... 
otherDimensions)
+    TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+    TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+    Self& operator=(const Self& other)
+    Self& operator=(const OtherDerived& other)
+
+
+## Contents Initialization
+
+When a new Tensor or a new TensorFixedSize is created, memory is allocated to
+hold all the tensor elements, but the memory is not initialized. Similarly,
+when a new TensorMap is created on top of non-initialized memory, its contents
+are not initialized.
+
+You can use one of the methods below to initialize the tensor memory. These
+have an immediate effect on the tensor and return the tensor itself as a
+result. These are not tensor Operations which delay evaluation.
+
+### <Tensor-Type> setConstant(const Scalar& val)
+
+Sets all elements of the tensor to the constant value ```val```. ```Scalar```
+is the type of data stored in the tensor. You can pass any value that is
+convertible to that type.
+
+Returns the tensor itself in case you want to chain another call.
+
+    a.setConstant(12.3f);
+    cout << "Constant: " << endl << a << endl << endl;
+    =>
+    Constant:
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+
+Note that ```setConstant()``` can be used on any tensor where the element type
+has a copy constructor and an ```operator=()```:
+
+    Eigen::Tensor<string, 2> a(2, 3);
+    a.setConstant("yolo");
+    cout << "String tensor: " << endl << a << endl << endl;
+    =>
+    String tensor:
+    yolo yolo yolo
+    yolo yolo yolo
+
+
+### <Tensor-Type> setZero()
+
+Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```.
+Returns the tensor itself in case you want to chain another call.
+
+    a.setZero();
+    cout << "Zeros: " << endl << a << endl << endl;
+    =>
+    Zeros:
+    0 0 0 0
+    0 0 0 0
+    0 0 0 0
+
+
+### <Tensor-Type> setValues({..initializer_list})
+
+Fills the tensor with explicit values specified in a std::initializer_list.
+The type of the initializer list depends on the type and rank of the tensor.
+
+If the tensor has rank N, the initializer list must be nested N times. The
+most deeply nested lists must contain P scalars of the Tensor type where P is
+the size of the last dimension of the Tensor.
+
+For example, for a ```TensorFixedSize<float, Sizes<2, 3>>``` the initializer
+list must contain 2 lists of 3 floats each.
+
+```setValues()``` returns the tensor itself in case you want to chain another
+call.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+If a list is too short, the corresponding elements of the tensor will not be
+changed. This is valid at each level of nesting. For example the following
+code only sets the values of the first row of the tensor.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setConstant(1000);
+    a.setValues({{10, 20, 30}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    10 20 30
+    1000 1000 1000
+
+### <Tensor-Type> setRandom()
+
+Fills the tensor with random values. Returns the tensor itself in case you
+want to chain another call.
+
+    a.setRandom();
+    cout << "Random: " << endl << a << endl << endl;
+    =>
+    Random:
+    0.680375 0.59688 -0.329554 0.10794
+    -0.211234 0.823295 0.536459 -0.0452059
+    0.566198 -0.604897 -0.444451 0.257742
+
+You can customize ```setRandom()``` by providing your own random number
+generator as a template argument:
+
+    a.setRandom<MyRandomGenerator>();
+
+Here, ```MyRandomGenerator``` must be a struct with the following member
+functions, where Scalar and Index are the same as ```<Tensor-Type>::Scalar```
+and ```<Tensor-Type>::Index```.
+
+See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example.
+
+    // Custom number generator for use with setRandom().
+    struct MyRandomGenerator {
+      // Default and copy constructors. Both are needed
+      MyRandomGenerator() { }
+      MyRandomGenerator(const MyRandomGenerator& ) { }
+
+      // Return a random value to be used. "element_location" is the
+      // location of the entry to set in the tensor, it can typically
+      // be ignored.
+      Scalar operator()(Eigen::DenseIndex element_location,
+                        Eigen::DenseIndex /*unused*/ = 0) const {
+        return <randomly generated value of type Scalar>;
+      }
+
+      // Same as above but generates several numbers at a time.
+      typename internal::packet_traits<Scalar>::type packetOp(
+          Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+        return <a packet of randomly generated values>;
+      }
+    };
+
+You can also use one of the 2 random number generators that are part of the
+tensor library:
+* UniformRandomGenerator
+* NormalRandomGenerator
+
+
+## Data Access
+
+TODO
+
+    const Scalar& operator()(const array<Index, NumIndices>& indices)
+    const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    Scalar& operator()(const array<Index, NumIndices>& indices)
+    Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    Scalar& operator[](Index index)
+    ??? mention coeff() and coeffRef() ???
+
+### Scalar* data()
+### const Scalar* data() const
+
+Returns a pointer to the storage for the tensor. The pointer is const if the
+tensor was const. This allows direct access to the data. The layout of the
+data depends on the tensor layout: RowMajor or ColMajor.
+
+This access is usually only needed for special cases, for example when mixing
+Eigen Tensor code with other libraries.
+
+Scalar is the type of data stored in the tensor.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    float* a_data = a.data();
+    a_data[0] = 123.45f;
+    cout << "a(0, 0): " << a(0, 0);
+    => a(0, 0): 123.45
+
+
+## Tensor Operations
+
+All the methods documented below return non-evaluated tensor ```Operations```.
+These can be chained: you can apply another Tensor Operation to the value
+returned by the method.
+
+The chain of Operations is evaluated lazily, typically when it is assigned to a
+tensor. See "Controlling When Expressions Are Evaluated" for more details about
+their evaluation.
+
+### <Operation> constant(const Scalar& val)
+
+Returns a tensor of the same type and dimensions as the original tensor but
+where all elements have the value ```val```.
+
+This is useful, for example, when you want to add or subtract a constant from a
+tensor, or multiply every element of a tensor by a scalar.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
+    Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    cout << "c" << endl << c << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    3 3 3
+    3 3 3
+
+    c
+    0.6 0.6 0.6
+    0.6 0.6 0.6
+
+### <Operation> random()
+
+Returns a tensor of the same type and dimensions as the current tensor
+but where all elements have random values.
+
+This is useful, for example, to add random values to an existing tensor.
+The generation of random values can be customized in the same manner
+as for ```setRandom()```.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.random();
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    1.68038 1.5662 1.82329
+    0.788766 1.59688 0.395103
+
+
+## Unary Element Wise Operations
+
+All these operations take a single input tensor as argument and return a tensor
+of the same type and dimensions as the tensor to which they are applied. The
+requested operations are applied to each element independently.
+
+### <Operation> operator-()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the opposite values of the original tensor.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = -a;
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    -1 -1 -1
+    -1 -1 -1
+
+### <Operation> sqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the square roots of the original tensor.
+
+### <Operation> rsqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse square roots of the original tensor.
+
+### <Operation> square()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the squares of the original tensor values.
+
+### <Operation> inverse()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse of the original tensor values.
+
+### <Operation> exp()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the exponential of the original tensor.
+
+### <Operation> log()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the natural logarithms of the original tensor.
+
+### <Operation> abs()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the absolute values of the original tensor.
+
+### <Operation> pow(Scalar exponent)
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the coefficients of the original tensor to the power of the
+exponent.
+
+The type of the exponent, Scalar, is always the same as the type of the
+tensor coefficients. For example, only integer exponents can be used in
+conjunction with tensors of integer values.
+
+You can use cast() to lift this restriction.
For example, this computes
+cubic roots of an int Tensor:
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 1, 8}, {27, 64, 125}});
+    Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    0 1 8
+    27 64 125
+
+    b
+    0 1 2
+    3 4 5
+
+### <Operation> operator * (Scalar scale)
+TODO
+
+### <Operation> cwiseMax(Scalar threshold)
+TODO
+
+### <Operation> cwiseMin(Scalar threshold)
+TODO
+
+### <Operation> unaryExpr(const CustomUnaryOp& func)
+TODO
+
+
+## Binary Element Wise Operations
+
+These operations take two input tensors as arguments. The 2 input tensors should
+be of the same type and dimensions. The result is a tensor of the same
+dimensions as the tensors to which they are applied, and unless otherwise
+specified it is also of the same type. The requested operations are applied to
+each pair of elements independently.
+
+### <Operation> operator+(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise sums of the inputs.
+
+### <Operation> operator-(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise differences of the inputs.
+
+### <Operation> operator*(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise products of the inputs.
+
+### <Operation> operator/(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise quotients of the inputs.
+
+This operator is not supported for integer types.
+
+### <Operation> cwiseMax(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise maximums of the inputs.
+
+### <Operation> cwiseMin(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise minimums of the inputs.
+
+### <Operation> Logical operators
+
+The following logical operators are supported as well:
+
+* operator&&(const OtherDerived& other)
+
+* operator||(const OtherDerived& other)
+
+* operator<(const OtherDerived& other)
+
+* operator<=(const OtherDerived& other)
+
+* operator>(const OtherDerived& other)
+
+* operator>=(const OtherDerived& other)
+
+* operator==(const OtherDerived& other)
+
+* operator!=(const OtherDerived& other)
+
+They all return a tensor of boolean values.
+
+
+## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor))
+
+Selection is a coefficient-wise ternary operator that is the tensor equivalent
+to the if-then-else operation.
+
+    Tensor<bool, 3> if_tensor = ...;
+    Tensor<float, 3> then_tensor = ...;
+    Tensor<float, 3> else_tensor = ...;
+    Tensor<float, 3> result = if_tensor.select(then_tensor, else_tensor);
+
+The 3 arguments must be of the same dimensions, which will also be the dimension
+of the result. The 'if' tensor must be of type boolean, the 'then' and the
+'else' tensor must be of the same type, which will also be the type of the
+result.
+
+Each coefficient in the result is equal to the corresponding coefficient in the
+'then' tensor if the corresponding value in the 'if' tensor is true. If not, the
+resulting coefficient will come from the 'else' tensor.
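+
+Here is a small worked sketch of ```select()``` (the values are made up for
+illustration; ```setValues()``` and ```setConstant()``` are described in the
+Contents Initialization section above):
+
+    Eigen::Tensor<bool, 1> cond(3);
+    cond.setValues({true, false, true});
+    Eigen::Tensor<int, 1> ones(3);
+    Eigen::Tensor<int, 1> zeros(3);
+    ones.setConstant(1);
+    zeros.setConstant(0);
+    // Picks from 'ones' where cond is true, from 'zeros' otherwise.
+    Eigen::Tensor<int, 1> result = cond.select(ones, zeros);
+    // result contains: 1 0 1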
+
+
+## Contractions
+
+TODO
+    contract(const OtherDerived& other, const Dimensions& dims)
+
+
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor. The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor. You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as ```maximum()``` and ```sum()``` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+```<TensorType>::Dimensions``` which can always be specified as an array of
+ints. These are called the "reduction dimensions." The values are the indices
+of the dimensions of the input tensor over which the reduction is done. The
+parameter can have at most as many elements as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+
+    // Create a tensor of 2 dimensions: 2, 3
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{1, 2, 3}, {6, 5, 4}});
+    // Reduce it along the second dimension (1)...
+    Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+    // ...using the "maximum" operator.
+    // The result is a tensor with one dimension. The size of
+    // that dimension is the same as the first (non-reduced) dimension of a.
+    Eigen::Tensor<int, 1> b = a.maximum(dims);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    1 2 3
+    6 5 4
+
+    b
+    3
+    6
+
+Example: Reduction along two dimensions.
+
+    Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // The tensor a has 3 dimensions. We reduce along the
+    // first 2, resulting in a tensor with a single dimension
+    // of size 4 (the last dimension of a.)
+    // Note that we pass the array of reduction dimensions
+    // directly to the maximum() call.
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+        a.maximum(Eigen::array<int, 2>({0, 1}));
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    20
+    21
+    22
+    23
+
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions. The result is a
+one-dimensional tensor with a single value.
+
+    Eigen::Tensor<float, 3> a(2, 3, 4);
+    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                  {7.0f, 6.0f, 5.0f, 4.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}},
+                 {{12.0f, 13.0f, 14.0f, 15.0f},
+                  {19.0f, 18.0f, 17.0f, 16.0f},
+                  {20.0f, 21.0f, 22.0f, 23.0f}}});
+    // Reduce along all dimensions using the sum() operator.
+    Eigen::Tensor<float, 1> b = a.sum();
+    cout << "b" << endl << b << endl << endl;
+    =>
+    b
+    276
+
+
+### <Operation> sum(const Dimensions& new_dims)
+### <Operation> sum()
+
+Reduce a tensor using the sum() operator.
The resulting values
+are the sum of the reduced values.
+
+### <Operation> mean(const Dimensions& new_dims)
+### <Operation> mean()
+
+Reduce a tensor using the mean() operator. The resulting values
+are the mean of the reduced values.
+
+### <Operation> maximum(const Dimensions& new_dims)
+### <Operation> maximum()
+
+Reduce a tensor using the maximum() operator. The resulting values are the
+largest of the reduced values.
+
+### <Operation> minimum(const Dimensions& new_dims)
+### <Operation> minimum()
+
+Reduce a tensor using the minimum() operator. The resulting values
+are the smallest of the reduced values.
+
+### <Operation> prod(const Dimensions& new_dims)
+### <Operation> prod()
+
+Reduce a tensor using the prod() operator. The resulting values
+are the product of the reduced values.
+
+### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
+
+Reduce a tensor using a user-defined reduction operator. See ```SumReducer```
+in TensorFunctors.h for information on how to implement a reduction operator.
+
+
+## Convolutions
+
+TBD: convolve(const KernelDerived& kernel, const Dimensions& dims)
+
+
+## Geometrical Operations
+
+These operations return a Tensor with different dimensions than the original
+Tensor. They can be used to access slices of tensors, view them with different
+dimensions, or pad tensors with additional data.
+
+### <Operation> reshape(const Dimensions& new_dims)
+
+Returns a view of the input tensor that has been reshaped to the specified
+new dimensions. The argument new_dims is an array of Index values. The
+rank of the resulting tensor is equal to the number of elements in new_dims.
+
+The product of all the sizes in the new dimension array must be equal to
+the number of elements in the input tensor.
+
+    // Increase the rank of the input tensor by introducing a new dimension
+    // of size 1.
+    Tensor<float, 2> input(7, 11);
+    array<int, 3> three_dims{{7, 11, 1}};
+    Tensor<float, 3> result = input.reshape(three_dims);
+
+    // Decrease the rank of the input tensor by merging 2 dimensions.
+    array<int, 1> one_dim{{7 * 11}};
+    Tensor<float, 1> result = input.reshape(one_dim);
+
+This operation does not move any data in the input tensor, so the resulting
+contents of a reshaped Tensor depend on the data layout of the original Tensor.
+
+For example this is what happens when you ```reshape()``` a 2D ColMajor tensor
+to one dimension:
+
+    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    0
+    300
+    100
+    400
+    200
+    500
+
+This is what happens when the 2D Tensor is RowMajor:
+
+    Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+    Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    0
+    100
+    200
+    300
+    400
+    500
+
+The reshape operation is an lvalue. In other words, it can be used on the left
+side of the assignment operator.
+
+The previous example can be rewritten as follows:
+
+    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+    Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
+    b.reshape(two_dim) = a;
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    0
+    300
+    100
+    400
+    200
+    500
+
+Note that "b" itself was not reshaped; instead, the assignment is done to the
+reshape view of b.
+
+
+### <Operation> shuffle(const Shuffle& shuffle)
+
+Returns a copy of the input tensor whose dimensions have been
+reordered according to the specified permutation. The argument shuffle
+is an array of Index values. Its size is the rank of the input
+tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th
+dimension of the output tensor equals the size of the shuffle[i]-th
+dimension of the input tensor. For example:
+
+    // Shuffle all dimensions to the left by 1.
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output = input.shuffle({1, 2, 0});
+
+    eigen_assert(output.dimension(0) == 30);
+    eigen_assert(output.dimension(1) == 50);
+    eigen_assert(output.dimension(2) == 20);
+
+Indices into the output tensor are shuffled accordingly to formulate
+indices into the input tensor. For example, one can assert in the above
+code snippet that:
+
+    eigen_assert(output(3, 7, 11) == input(11, 3, 7));
+
+In general, one can assert that
+
+    eigen_assert(output(..., indices[shuffle[i]], ...) ==
+                 input(..., indices[i], ...))
+
+The shuffle operation results in an lvalue, which means that it can be assigned
+to. In other words, it can be used on the left side of the assignment operator.
+
+Let's rewrite the previous example to take advantage of this feature:
+
+    // Shuffle all dimensions to the left by 1.
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output(30, 50, 20);
+    output.shuffle({2, 0, 1}) = input;
+
+
+### <Operation> stride(const Strides& strides)
+
+Returns a view of the input tensor that strides (skips stride-1
+elements) along each of the dimensions. The argument strides is an
+array of Index values. The dimensions of the resulting tensor are
+ceil(input_dimensions[i] / strides[i]).
+
+For example this is what happens when you ```stride()``` a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
+    Eigen::Tensor<int, 2> b = a.stride(strides);
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    0 200
+    900 1100
+
+It is possible to assign a tensor to a stride:
+
+    Tensor<float, 3> input(20, 30, 50);
+    // ... set some values in input.
+    Tensor<float, 3> output(40, 90, 200);
+    output.stride({2, 3, 4}) = input;
+
+
+### <Operation> slice(const StartIndices& startIndices,
+                      const Sizes& sizes)
+
+TBD
+
+
+### <Operation> chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice. It is the subtensor at the given offset in
+the dimension dim. The returned tensor has one fewer dimension than the input
+tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input
+matrix.
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+    Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    cout << "row_3" << endl << row_3 << endl;
+    =>
+    row_3
+    600 700 800
+    cout << "col_2" << endl << col_2 << endl;
+    =>
+    col_2
+    100 400 700 1000
+
+It is possible to assign values to a tensor chip since the chip operation is an
+lvalue. For example:
+
+    Eigen::Tensor<int, 1> a(3);
+    a.setValues({100, 200, 300});
+    Eigen::Tensor<int, 2> b(2, 3);
+    b.setZero();
+    b.chip(0, 0) = a;
+    cout << "a" << endl << a << endl;
+    =>
+    a
+    100
+    200
+    300
+    cout << "b" << endl << b << endl;
+    =>
+    b
+    100 200 300
+    0 0 0
+
+
+### <Operation> reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions. The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions. This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you ```reverse()``` the first dimension
+of a 2D tensor:
+
+    Eigen::Tensor<int, 2> a(4, 3);
+    a.setValues({{0, 100, 200}, {300, 400, 500},
+                 {600, 700, 800}, {900, 1000, 1100}});
+    Eigen::array<bool, 2> reverse({true, false});
+    Eigen::Tensor<int, 2> b = a.reverse(reverse);
+    cout << "a" << endl << a << endl << "b" << endl << b << endl;
+    =>
+    a
+    0 100 200
+    300 400 500
+    600 700 800
+    900 1000 1100
+    b
+    900 1000 1100
+    600 700 800
+    300 400 500
+    0 100 200
+
+
+### <Operation> broadcast(const Broadcast& broadcast)
+
+TODO
+
+### <Operation> concatenate(const OtherDerived& other, Axis axis)
+
+TODO
+
+### <Operation> pad(const PaddingDimensions& padding)
+
+TODO
+
+### <Operation> extract_patches(const PatchDims& patch_dims)
+
+TODO
+
+### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols,
+                                      const Index row_stride, const Index col_stride,
+                                      const PaddingType padding_type)
+
+TODO
+
+
+## Special Operations
+
+### <Operation> cast<T>()
+
+Returns a tensor of type T with the same dimensions as the original tensor.
+The returned tensor contains the values of the original tensor converted to
+type T.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    Eigen::Tensor<int, 2> b = a.cast<int>();
+
+This can be useful for example if you need to do element-wise division of
+Tensors of integers. This is not currently supported by the Tensor library
+but you can easily cast the tensors to floats to do the division:
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setValues({{0, 1, 2}, {3, 4, 5}});
+    Eigen::Tensor<int, 2> b =
+        (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+    b
+    0 0 1
+    1 2 2
+
+
+### <Operation> eval()
+
+TODO
+
+
+## Representation of scalar values
+
+Scalar values are often represented by tensors of size 1 and rank 1. It would be
+more logical and user friendly to use tensors of rank 0 instead. For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 1d tensor. In the
+future these operations might be updated to return 0d tensors instead.
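+
+In the meantime, here is a minimal sketch (under the current rank-1 convention
+just described) of how to read such a "scalar" result back; the
+```maximum()``` reduction is documented in the Reduction Operations section:
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    // Reducing along all dimensions yields a rank-1 tensor of size 1.
+    Eigen::Tensor<float, 1> m = a.maximum();
+    float value = m(0);  // extract the single value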
+ +## Limitations + +* The number of tensor dimensions is currently limited to 250 when using a + compiler that supports cxx11. It is limited to only 5 for older compilers. +* The IndexList class requires a cxx11 compliant compiler. You can use an + array of indices instead if you don't have access to a modern compiler. +* TensorVarDims are only partially supported +* On GPUs only floating point values are properly tested and optimized for. +* Complex and integer values are known to be broken on GPUs. If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 70ca1433f..037219f23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> // Copyright (C) 2013 Christian Seiler <christian@iwakd.de> // // This Source Code Form is subject to the terms of the Mozilla @@ -55,70 +56,46 @@ namespace Eigen { * change dramatically.</dd> * </dl> * - * \ref TopicStorageOrders + * \ref TopicStorageOrders */ -template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0> -class Tensor; -namespace internal { template<typename Scalar_, std::size_t NumIndices_, int Options_> -struct traits<Tensor<Scalar_, NumIndices_, Options_>> +class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> > { - typedef Scalar_ Scalar; - typedef Dense StorageKind; - typedef DenseIndex Index; - enum { - Options = Options_ - }; -}; - -template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> -struct tensor_index_linearization_helper -{ - constexpr static inline Index run(std::array<Index, NumIndices> const& indices, std::array<Index, NumIndices> const& dimensions) - { - return std_array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + - std_array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * - tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); - } -}; - -template<typename Index, std::size_t NumIndices, bool RowMajor> -struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> -{ - constexpr static inline Index run(std::array<Index, NumIndices> const& indices, std::array<Index, NumIndices> const&) - { - return std_array_get<RowMajor ? 
0 : NumIndices - 1>(indices); - } -}; -} // end namespace internal - -template<typename Scalar_, std::size_t NumIndices_, int Options_> -class Tensor -{ - static_assert(NumIndices_ >= 1, "A tensor must have at least one index."); - public: typedef Tensor<Scalar_, NumIndices_, Options_> Self; + typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; typedef typename internal::traits<Self>::StorageKind StorageKind; typedef typename internal::traits<Self>::Index Index; - typedef typename internal::traits<Self>::Scalar Scalar; - typedef typename internal::packet_traits<Scalar>::type PacketScalar; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; typedef typename NumTraits<Scalar>::Real RealScalar; - typedef Self DenseType; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename Base::PacketReturnType PacketReturnType; + + enum { + IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = Options_ & RowMajor ? RowMajor : ColMajor, + CoordAccess = true, + }; - constexpr static int Options = Options_; - constexpr static std::size_t NumIndices = NumIndices_; + static const int Options = Options_; + static const std::size_t NumIndices = NumIndices_; + typedef DSizes<Index, NumIndices_> Dimensions; protected: TensorStorage<Scalar, NumIndices, Dynamic, Options> m_storage; public: - EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE std::array<Index, NumIndices> dimensions() const { return m_storage.dimensions(); } - EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_storage.dimensions()); } - EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED // work, because that uses base().coeffRef() - and we don't yet @@ -126,146 +103,254 @@ class Tensor inline Self& base() { return *this; } inline const Self& base() const { return *this; } - void setZero() - { - // FIXME: until we have implemented packet access and the - // expression engine w.r.t. nullary ops, use this - // as a kludge. Only works with POD types, but for - // any standard usage, this shouldn't be a problem - memset((void *)data(), 0, size() * sizeof(Scalar)); - } - - inline Self& operator=(Self const& other) - { - m_storage = other.m_storage; - return *this; - } - +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return coeff(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline const Scalar& coeff(const std::array<Index, NumIndices>& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } - inline const Scalar& coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_storage.data()[index]; } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return coeffRef(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline Scalar& coeffRef(const std::array<Index, NumIndices>& indices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } - inline Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { eigen_internal_assert(index >= 0 && index < size()); return m_storage.data()[index]; } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return this->operator()(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array<Index, 2>(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array<Index, 3>(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array<Index, 4>(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array<Index, 5>(i0, i1, i2, i3, i4)); } +#endif - inline const Scalar& operator()(const std::array<Index, NumIndices>& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const { eigen_assert(checkIndexRange(indices)); return coeff(indices); } - inline const Scalar& operator()(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return coeff(index); } - inline const Scalar& operator[](Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const { - static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead."); + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); return coeff(index); } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array<Index, 2>(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array<Index, 3>(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array<Index, 4>(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4)); } +#endif - inline Scalar& operator()(const std::array<Index, NumIndices>& indices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) { eigen_assert(checkIndexRange(indices)); return coeffRef(indices); } - inline Scalar& operator()(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { eigen_assert(index >= 0 && index < size()); return coeffRef(index); } - inline Scalar& operator[](Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { - static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead."); + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) return coeffRef(index); } - inline Tensor() + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() : m_storage() { } - inline Tensor(const Self& other) - : m_storage(other.m_storage) - { - } - - inline Tensor(Self&& other) + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) : m_storage(other.m_storage) { } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Tensor(Index firstDimension, IndexTypes... otherDimensions) - : m_storage() + : m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}}) + { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+#else
+    inline explicit Tensor(Index dim1)
+      : m_storage(dim1, array<Index, 1>(dim1))
+    {
+      EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    inline explicit Tensor(Index dim1, Index dim2)
+      : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
    {
-      static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to construct a tensor must be equal to the rank of the tensor.");
-      resize(std::array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+      EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
    }
+    inline explicit Tensor(Index dim1, Index dim2, Index dim3)
+      : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
+    {
+      EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+      : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
+    {
+      EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+    inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+      : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
+    {
+      EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    }
+#endif

-    inline Tensor(std::array<Index, NumIndices> dimensions)
-      : m_storage(internal::array_prod(dimensions), dimensions)
+    inline explicit Tensor(const array<Index, NumIndices>& dimensions)
+      : m_storage(internal::array_prod(dimensions), dimensions)
    {
      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
    }

+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    }
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other.derived());
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
+    {
+      typedef TensorAssignOp<Tensor, const Tensor> Assign;
+      Assign assign(*this, other);
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
    template<typename... IndexTypes> void resize(Index firstDimension, IndexTypes...
otherDimensions) { - static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to resize a tensor must be equal to the rank of the tensor."); - resize(std::array<Index, NumIndices>{{firstDimension, otherDimensions...}}); + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}}); } +#endif - void resize(const std::array<Index, NumIndices>& dimensions) + EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions) { std::size_t i; Index size = Index(1); @@ -282,8 +367,17 @@ class Tensor #endif } + EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) { + array<Index, NumIndices> dims; + for (std::size_t i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + protected: - bool checkIndexRange(const std::array<Index, NumIndices>& indices) const + + bool checkIndexRange(const array<Index, NumIndices>& indices) const { using internal::array_apply_and_reduce; using internal::array_zip_and_reduce; @@ -298,16 +392,16 @@ class Tensor array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions()); } - inline Index linearizedIndex(const std::array<Index, NumIndices>& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const { - return internal::tensor_index_linearization_helper<Index, NumIndices, NumIndices - 1, Options&RowMajor>::run(indices, m_storage.dimensions()); + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } } }; } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 000000000..a4f73b2a1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,164 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. + * + * This class represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression.
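+ *
+ * A TensorAssignOp is not meant to be created directly in user code: it is built
+ * internally, e.g. by Tensor::operator= above. A minimal sketch of the user-level
+ * expression that ends up wrapped in one:
+ * \code
+ * Tensor<float, 2> a(3, 4), b(3, 4);
+ * b.setRandom();
+ * a = b * 2.0f; // wrapped in a TensorAssignOp, run by internal::TensorExecutor
+ * \endcode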
+ */ +namespace internal { +template<typename LhsXprType, typename RhsXprType> +struct traits<TensorAssignOp<LhsXprType, RhsXprType> > +{ + typedef typename LhsXprType::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename traits<LhsXprType>::StorageKind StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions; + static const int Layout = internal::traits<LhsXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename LhsXprType, typename RhsXprType> +struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorAssignOp<LhsXprType, RhsXprType>& type; +}; + +template<typename LhsXprType, typename RhsXprType> +struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type> +{ + typedef TensorAssignOp<LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename LhsXprType, typename RhsXprType> +class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorAssignOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename LhsXprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested; + typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr; + const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr; +}; + + +template<typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> +{ + typedef TensorAssignOp<LeftArgType, RightArgType> XprType; + + enum { + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == 
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + // The dimensions of the lhs and the rhs tensors should be equal to prevent + // overflows and ensure the result is fully initialized. + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_leftImpl.coeff(index); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet<LoadMode>(index); + } + + private: + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; +}; + +} + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 000000000..e08ac6aa1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,573 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangeably in expressions.
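+ *
+ * A short sketch of the expression interface (assuming a and b are float tensors
+ * with matching dimensions):
+ * \code
+ * Tensor<float, 2> c = (a + b).sqrt(); // lazy cwise expression, evaluated on assignment
+ * \endcode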
+ */ + +template<typename Derived> +class TensorBase<Derived, ReadOnlyAccessors> +{ + public: + typedef internal::traits<Derived> DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const<Scalar>::type CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived> + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op<Scalar>(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator<Scalar>()); + } + template <typename RandomGenerator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived> + random() const { + return nullaryExpr(RandomGenerator()); + } + + // Generic unary operation support. + template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived> + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> + pow(Scalar exponent) const { + return 
unaryExpr(internal::scalar_pow_op<Scalar>(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::scalar_add_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((std::numeric_limits<Scalar>::is_signed || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_sub_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> + operator/ (Scalar rhs) const { + // EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template <typename NewType> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> + cast() const { + return unaryExpr(internal::scalar_cast_op<Scalar, NewType>()); + } + + // Generic binary operation support. + template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func); + } + + // Coefficient-wise binary operators. 
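+    // A sketch of how these compose with the nullary operators above, assuming
+    // a and b have matching dimensions:
+    //   c = a * b + a.constant(1.0f); // coefficient-wise multiply-add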
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived> + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived> + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + // Comparisons and tests. 
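+    // These yield boolean-valued expressions, typically consumed by select()
+    // below; e.g. (a > b).select(a, b) picks the larger of each coefficient pair.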
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::less<Scalar>, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::less<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::less_equal<Scalar>, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::less_equal<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::greater<Scalar>, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::greater<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::greater_equal<Scalar>, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::greater_equal<Scalar>()); + } + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::equal_to<Scalar>()); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return binaryExpr(other.derived(), std::not_equal_to<Scalar>()); + } + + // Coefficient-wise ternary operators. + template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived> + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair<Index> DimensionPair; + + template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp<const Dimensions, const Derived, const OtherDerived> + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims); + } + + // Convolutions. + template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived> + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims); + } + + // Reductions. 
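+    // A reduction collapses the listed dimensions: reducing a rank-3 tensor over
+    // one dimension yields a rank-2 expression. A sketch:
+    //   array<Index, 1> dims{{1}};
+    //   Tensor<float, 2> s = t.sum(dims); // t is a rank-3 tensor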
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + sum() const { + array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + mean() const { + array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + prod() const { + array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + maximum() const { + array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>()); + } + + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>()); + } + + const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived> + minimum() const { + 
array<Index, NumDimensions> in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>()); + } + + template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp<Reducer, const Dims, const Derived> + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer); + } + + template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp<const Broadcast, const Derived> + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast); + } + + template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp<Axis, const Derived, const OtherDerived> + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis); + } + + template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp<const PatchDims, const Derived> + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches() const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, PADDING_SAME); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches(const PaddingType padding_type) const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, padding_type); + } + + template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Rows, Cols, const Derived> + extract_image_patches(const Index stride, const PaddingType padding_type) const { + return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, stride, stride, padding_type); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride = 1, const Index col_stride = 1) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + PADDING_SAME); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp<Dynamic, Dynamic, const Derived> + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type) const { + return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride, + padding_type); + } + + // Morphing operators. 
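+    // Morphing operators reinterpret or rearrange the coefficients lazily, without
+    // copying them. A sketch, viewing 12 coefficients as a 2x6 tensor:
+    //   array<Index, 2> shape{{2, 6}};
+    //   auto r = t.reshape(shape);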
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp<const Derived> + swap_layout() const { + return TensorLayoutSwapOp<const Derived>(derived()); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp<const NewDimensions, const Derived> + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp<const StartIndices, const Sizes, const Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); + } + template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<DimId, const Derived> + chip(const Index offset) const { + return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp<Dynamic, const Derived> + chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim); + } + template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp<const ReverseDimensions, const Derived> + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev); + } + template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp<const PaddingDimensions, const Derived> + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp<const Shuffle, const Derived> + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp<const Strides, const Derived> + stride(const Strides& strides) const { + return TensorStridingOp<const Strides, const Derived>(derived(), strides); + } + + // Force the evaluation of the expression. 
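+    // eval() materializes the subexpression into a temporary buffer, which can pay
+    // off when the same subexpression is reused several times, e.g. (sketch):
+    //   auto e = (a * b).eval();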
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp<const Derived> eval() const { + return TensorForcedEvalOp<const Derived>(derived()); + } + + protected: + template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor; + template <typename Scalar, int Options> friend class TensorVarDim; + template <typename OtherDerived, int AccessLevel> friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } +}; + +template<typename Derived> +class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> { + public: + typedef internal::traits<Derived> DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits<Scalar>::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor; + template <typename Scalar, int Options> friend class TensorVarDim; + template <typename OtherDerived, int AccessLevel> friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template <typename RandomGenerator> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random<RandomGenerator>(); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) { + TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice()); + internal::initialize_tensor<Derived, NumDimensions>(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp<Derived> + swap_layout() const { + return TensorLayoutSwapOp<Derived>(derived()); + } + template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp<const NewDimensions, Derived> + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions); + } + template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp<const StartIndices, const Sizes, Derived> + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, 
sizes); + } + template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<DimId, Derived> + chip(const Index offset) const { + return TensorChippingOp<DimId, Derived>(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp<Dynamic, Derived> + chip(const Index offset, const Index dim) const { + return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim); + } + template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp<const Shuffle, Derived> + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle); + } + template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp<const Strides, Derived> + stride(const Strides& strides) const { + return TensorStridingOp<const Strides, Derived>(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template <typename DeviceType> + TensorDevice<Derived, DeviceType> device(const DeviceType& device) { + return TensorDevice<Derived, DeviceType>(device, derived()); + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 000000000..5790e19d6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,341 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. 
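+ *
+ * Each entry of the broadcast factors gives the number of times the corresponding
+ * input dimension is replicated. A minimal sketch:
+ * \code
+ * Tensor<float, 2> t(2, 3);
+ * array<int, 2> bcast{{1, 4}};
+ * Tensor<float, 2> r = t.broadcast(bcast); // r has dimensions 2 x 12
+ * \endcode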
+ * + * + */ +namespace internal { +template<typename Broadcast, typename XprType> +struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Broadcast, typename XprType> +struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> +{ + typedef const TensorBroadcastingOp<Broadcast, XprType>& type; +}; + +template<typename Broadcast, typename XprType> +struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type> +{ + typedef TensorBroadcastingOp<Broadcast, XprType> type; +}; + +} // end namespace internal + + + +template<typename Broadcast, typename XprType> +class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template<typename Broadcast, typename ArgType, typename Device> +struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> +{ + typedef TensorBroadcastingOp<Broadcast, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Broadcast& broadcast = op.broadcast(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * broadcast[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 
1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_inputStrides[NumDims-1] = 1; + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return coeffColMajor(index); + } else { + return coeffRowMajor(index); + } + } + + // TODO: attempt to speed this up. The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq<Broadcast>()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq<InputDimensions>()(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + inputIndex += index; + } else { + if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims-1]); + } + } + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return packetColMajor<LoadMode>(index); + } else { + return packetRowMajor<LoadMode>(index); + } + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at 
compile time. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq<Broadcast>()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + if (internal::index_statically_eq<InputDimensions>()(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { + return m_impl.template packet<Unaligned>(inputIndex); + } else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < packetSize; ++i) { + values[i] = coeffColMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq<InputDimensions>()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims-1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on.
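+    // If the whole packet fits inside the innermost input dimension it can be
+    // loaded directly; otherwise the coefficients straddle a broadcast boundary
+    // and are gathered one at a time through coeffRowMajor().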
+ if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) { + return m_impl.template packet<Unaligned>(inputIndex); + } else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < packetSize; ++i) { + values[i] = coeffRowMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 000000000..dc9586cbc --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,363 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorChippingOp + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template<DenseIndex DimId, typename XprType> +struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex DimId, typename XprType> +struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense> +{ + typedef const TensorChippingOp<DimId, XprType>& type; +}; + +template<DenseIndex DimId, typename XprType> +struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type> +{ + typedef TensorChippingOp<DimId, XprType> type; +}; + +template <DenseIndex DimId> +struct DimensionId +{ + DimensionId(DenseIndex dim) { + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId<Dynamic> +{ + DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + +} // end namespace internal + + + +template<DenseIndex DimId, typename XprType> +class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorChippingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } + + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) + { + typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign; + Assign assign(*this, other); + static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess; + internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign; + Assign assign(*this, other); + static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess; + internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId<DimId> m_dim; +}; + + +// Eval as rvalue +template<DenseIndex DimId, typename ArgType, typename Device> +struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> +{ + typedef TensorChippingOp<DimId, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + { + // We could also support the case where NumInputDims==1 if needed. 
+ EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet<LoadMode>(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + packetSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet<LoadMode>(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path.
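+        // The packet spans two blocks of the chipped input, so the coefficients
+        // are fetched one by one via coeff(), which goes through srcCoeff().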
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); + if (m_dim.actualDim() == NumDims && result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex; + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator<ArgType, Device> m_impl; + const internal::DimensionId<DimId> m_dim; + const Device& m_device; +}; + + +// Eval as lvalue +template<DenseIndex DimId, typename ArgType, typename Device> +struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> + : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base; + typedef TensorChippingOp<DimId, ArgType> XprType; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) || + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division.
+ eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + for (int i = 0; i < packetSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + packetSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket<StoreMode>(inputIndex, x); + } else { + // Cross the stride boundary. Fall back to the slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 000000000..a1dec76d1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,258 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
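// [Editorial note, not part of the original patch] The promoted Scalar above is what
// mixed-operand concatenations evaluate to; with same-typed operands the promotion is
// a no-op. Assuming the TensorBase::concatenate() hook that accompanies this file,
// usage looks like:
//   Eigen::Tensor<float, 2> a(2, 3), b(3, 3);
//   Eigen::Tensor<float, 2> c = a.concatenate(b, 0);  // dimensions (5, 3)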
+ typedef typename promote_storage_type<typename LhsXprType::Scalar, + typename RhsXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<LhsXprType>::NumDimensions; + static const int Layout = traits<LhsXprType>::Layout; + enum { Flags = 0 }; +}; + +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type; +}; + +template<typename Axis, typename LhsXprType, typename RhsXprType> +struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type> +{ + typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + +template<typename Axis, typename LhsXprType, typename RhsXprType> +class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> +{ + public: + typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar; + typedef typename internal::traits<TensorConcatenationOp>::Packet Packet; + typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorConcatenationOp>::Index Index; + typedef typename internal::nested<TensorConcatenationOp>::type Nested; + typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType, + typename RhsXprType::PacketReturnType>::ret PacketReturnType; + typedef typename NumTraits<Scalar>::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC Axis axis() const { return m_axis; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template<typename Axis, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> +{ + typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; + static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; 
+ typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. + eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int i = 1; i < NumDims; ++i) { + m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1]; + m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int i = NumDims - 2; i >= 0; --i) { + m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1]; + m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). 
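// [Editorial walk-through, not part of the original patch] Concatenating a 2x3 lhs
// with a 3x3 rhs along axis 0 (ColMajor) gives a 5x3 output with m_outputStrides =
// {1, 5}. For index 8 the decomposition below yields subs = {3, 1}; since
// subs[0] = 3 >= 2 the coefficient comes from the rhs at subs[0] = 3 - 2 = 1, and
// right_index = 1 + (1 % 3) * 3 = 4, i.e. rhs(1, 1).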
+ array<Index, NumDims> subs; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + left_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_leftStrides; + array<Index, NumDims> m_rightStrides; + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; + const Axis m_axis; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 000000000..f7254a24d --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,992 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. 
+ */ +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, bool inner_dim_contiguous> +class BaseTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + for (int i = array_size<contract_t>::value - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; + for (int i = array_size<contract_t>::value - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair<Index>(linidx[0], linidx[1]); + } + + Index firstAligned(Index size) const { + return size; + } + Index stride() const { + return 1; + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper; + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionSubMapper { + public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template <typename PacketT, int AlignmentType> + EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + 
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template <typename Packet> + bool aligned(Index /*i*/) const { + return false; + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + size_t packet_size = (Tensor::PacketAccess ? packet_traits<Scalar>::size : 1), + bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned> +class TensorContractionInputMapper + : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> { + + public: + typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> Base; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper VectorMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet<Alignment>(index); + } + + const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
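// [Editorial note, not part of the original patch] computeIndexPair(i, j,
// packet_size - 1) returns the source offsets of rows i and i + packet_size - 1;
// they differ by exactly packet_size - 1 precisely when the whole strip is
// contiguous in memory, which is the condition checked below before issuing a
// single vectorized load.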
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet<Alignment>(first); + } + + EIGEN_ALIGN_DEFAULT Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload<Packet>(data); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits<HalfPacket>::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload<HalfPacket>(data); + } +}; + + + + +template<typename Scalar, typename Index, int side, + typename Tensor, + typename nocontract_t, typename contract_t, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> + : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> { + + public: + typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> Base; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; + typedef SubMapper VectorMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_DEFAULT Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload<typename packet_traits<Scalar>::type>(data); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + + +template <size_t n> struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename internal::promote_storage_type<typename LhsXprType::Scalar, + typename RhsXprType::Scalar>::ret Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + + // From NumDims below. + static const int NumDimensions = max_n_1<traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size; + static const int Layout = traits<LhsXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type; +}; + +template<typename Dimensions, typename LhsXprType, typename RhsXprType> +struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type> +{ + typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type; +}; + +template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_> +struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; + + // From NumDims below. 
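// [Editorial note, not part of the original patch] For example, contracting one
// index pair between a rank-3 and a rank-2 tensor leaves 3 + 2 - 2*1 = 3 output
// dimensions. max_n_1 only matters for a full contraction, where the raw count
// would be 0; the result is then represented as a 1d tensor of size 1 (the scalar
// case handled in the evaluator below).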
+ static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size; +}; + +} // end namespace internal + +template<typename Indices, typename LhsXprType, typename RhsXprType> +class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorContractionOp>::Packet Packet; + typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType, + typename RhsXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template<bool cond> struct Cond {}; + +template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond<true>, const T1& first, const T2&) { + return first; +} + +template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond<false>, const T1&, const T2& second) { + return second; +} + + +template<typename Derived> +struct TensorContractionEvaluatorBase +{ + typedef typename internal::traits<Derived>::Indices Indices; + typedef typename internal::traits<Derived>::LeftArgType LeftArgType; + typedef typename internal::traits<Derived>::RightArgType RightArgType; + typedef typename internal::traits<Derived>::Device Device; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
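// [Editorial note, not part of the original patch] The swap is sound because a
// RowMajor result is the transpose of the ColMajor one and (A*B)^T = B^T * A^T:
// evaluating B^T * A^T with the ColMajor machinery yields exactly the RowMajor C.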
+ typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + typedef array<Index, ContractDims> contract_t; + typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t; + typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t; + + typedef DSizes<Index, NumDims> Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == + static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert((internal::array_size<contract_t>::value > 0) && "Must contract on some indices"); + + + DSizes<Index, LDims> eval_left_dims; + DSizes<Index, RDims> eval_right_dims; + array<IndexPair<Index>, ContractDims> eval_op_indices; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. 
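// [Editorial illustration, not part of the original patch] A user pair
// (first, second) = (a, b) becomes (LDims - 1 - b, RDims - 1 - a) below: the
// .first/.second swap mirrors the LHS/RHS swap above, and the "dim - 1 - x"
// reindexes into the reversed dimension order.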
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + } + } + + array<Index, LDims> lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; + } + + array<Index, RDims> rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; + } + + m_i_strides[0] = 1; + m_j_strides[0] = 1; + m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + int nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) { + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) { + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. + m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; + + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); + + if (i+1 < internal::array_size<contract_t>::value) { + m_k_strides[i+1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < eval_op_indices[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // Scalar case. 
We represent the result as a 1d tensor of size 1. + if (LDims + RDims == 2 * ContractDims) { + m_dimensions[0] = 1; + } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + std::swap(m_dimensions[i], m_dimensions[j]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer); + } + else { + static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + const int lhs_packet_size = internal::packet_traits<LhsScalar>::size; + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size 
at least rows * sizeof(Scalar)) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run( + rows, cols, lhs, rhs, + buffer, resIncr, alpha); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<Packet, LoadMode>(m_result + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + // Prevent assignment + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorEvaluator<EvalLeftArgType, Device> m_leftImpl; + TensorEvaluator<EvalRightArgType, Device> m_rightImpl; + const Device& m_device; + Scalar* m_result; +}; + + +// evaluator for default device +template<typename Indices, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> : + public TensorContractionEvaluatorBase< + TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > { + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t; + typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t; + + static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size; + + // Could we use NumDimensions here? + typedef DSizes<Index, NumDims> Dimensions; + + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + return; + } + + evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + const int lhs_packet_size = internal::packet_traits<LhsScalar>::size; + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs; + 
internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs; + + internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType; + + // Sizes of the blocks to load in cache. See the Goto paper for details. + BlockingType blocking(m, n, k, 1, true); + const Index kc = blocking.kc(); + const Index mc = (std::min)(m, blocking.mc()); + const Index nc = (std::min)(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2<m; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,m)-i2; + for (Index k2 = 0; k2 < k; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = (std::min)(k2 + kc, k) - k2; + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = (std::min)(j2 + nc, n) - j2; + pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0); + } + } + } + + this->m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 000000000..588770bb4 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,1384 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com> +// Copyright (C) 2014 Eric Martin <eric@ericmart.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
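[Editorial aside, not part of the patch] The evalGemm loop nest above follows the classic Goto blocking scheme: pack a kc-deep panel of the left operand once, then sweep it against successive right-hand blocks with the gebp micro-kernel. Below is a minimal standalone sketch of the same loop structure in plain C++, with a naive inner kernel standing in for gebp and the packing elided; mc/kc/nc play the roles of blocking.mc()/kc()/nc() above, and column-major storage is assumed.

#include <algorithm>
#include <vector>

// Column-major C(m x n) += A(m x k) * B(k x n), blocked as in evalGemm above:
// i2 walks row panels, k2 walks the contraction depth, j2 walks column blocks.
void blocked_gemm(const std::vector<double>& A, const std::vector<double>& B,
                  std::vector<double>& C, int m, int n, int k,
                  int mc, int kc, int nc) {
  for (int i2 = 0; i2 < m; i2 += mc) {
    const int actual_mc = std::min(i2 + mc, m) - i2;
    for (int k2 = 0; k2 < k; k2 += kc) {
      const int actual_kc = std::min(k2 + kc, k) - k2;   // pack_lhs would copy this panel
      for (int j2 = 0; j2 < n; j2 += nc) {
        const int actual_nc = std::min(j2 + nc, n) - j2; // pack_rhs + gebp per block
        for (int j = 0; j < actual_nc; ++j)
          for (int p = 0; p < actual_kc; ++p)
            for (int i = 0; i < actual_mc; ++i)
              C[(i2 + i) + (j2 + j) * m] +=
                  A[(i2 + i) + (k2 + p) * m] * B[(k2 + p) + (j2 + j) * k];
      }
    }
  }
}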
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool needs_edge_check> +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code.) 
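// [Editorial note, not part of the original patch] The 576 strides above are
// 72 * 8: each padded block holds 72 scalars (LHS: 8 rows of 9; RHS: 64 elements
// plus 8 trailing pads), and eight such blocks are written per prefetch step,
// one per unrolled k value.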
+ + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = Scalar(0); \ + lhs_pf1 = Scalar(0); \ + lhs_pf2 = Scalar(0); \ + lhs_pf3 = Scalar(0); \ + lhs_pf4 = Scalar(0); \ + lhs_pf5 = Scalar(0); \ + lhs_pf6 = Scalar(0); \ + lhs_pf7 = Scalar(0); \ + \ + rhs_pf0 = Scalar(0); \ + rhs_pf1 = Scalar(0); \ + rhs_pf2 = Scalar(0); \ + rhs_pf3 = Scalar(0); \ + rhs_pf4 = Scalar(0); \ + rhs_pf5 = Scalar(0); \ + rhs_pf6 = Scalar(0); \ + rhs_pf7 = Scalar(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if 
(rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = Scalar(0); \ + Scalar res(i, 1) = Scalar(0); \ + Scalar res(i, 2) = Scalar(0); \ + Scalar res(i, 3) = Scalar(0); \ + Scalar res(i, 4) = Scalar(0); \ + Scalar res(i, 5) = Scalar(0); \ + Scalar res(i, 6) = Scalar(0); \ + Scalar res(i, 7) = Scalar(0); \ + + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. 
Despite common sense, + // the code is a bit faster with this here than at the bottom of the loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (i.e. width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation.
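The reduction announced above is a warp-shuffle butterfly. As a standalone sketch (illustration only, assuming the pre-CUDA-9 __shfl_xor intrinsic that this file already relies on), summing one value across an 8-lane group looks like:

__device__ float butterfly_sum8(float v) {
  v += __shfl_xor(v, 1); // exchange with the lane whose id differs in bit 0
  v += __shfl_xor(v, 2); // same for bit 1; groups of 4 now agree
  v += __shfl_xor(v, 4); // same for bit 2; all 8 lanes hold the full sum
  return v;
}

Applying this with masks 1, 2 and 4 to each of the 64 accumulators is exactly what the reduceMatrix(1); reduceMatrix(2); reduceMatrix(4); sequence below does.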
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction; now each thread of index (_, y, z) + // contains in its registers the values that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There are two ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can I trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template<typename Scalar, typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(512) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ volatile Scalar lhs_shmem[72 * 64]; + __shared__ volatile Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + 
reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1<float4>(0); + rhs_pf0 = internal::pset1<float4>(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... 
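The shuffle juggling above is a two-element exchange between lanes 4 apart. As an isolated sketch (illustration only; the helper name is hypothetical):

__device__ float4 exchange_halves(float4 v, int lane_in_8) {
  // Lower lanes give away .y/.w and receive the partner's .x/.z (and vice
  // versa), so each float2 later written to rhs_shmem2 pairs one element
  // from each lane of the exchange.
  float a = (lane_in_8 < 4) ? v.y : v.x;
  float b = (lane_in_8 < 4) ? v.w : v.z;
  a = __shfl_xor(a, 4);
  b = __shfl_xor(b, 4);
  if (lane_in_8 < 4) { v.y = a; v.w = b; } else { v.x = a; v.z = b; }
  return v;
}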
+ + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY, + bool CHECK_RHS_BOUNDARY> +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1<float4>(0); + lhs_pf1 = internal::pset1<float4>(0); + lhs_pf2 = internal::pset1<float4>(0); + lhs_pf3 = internal::pset1<float4>(0); + + rhs_pf0 = internal::pset1<float4>(0); + rhs_pf1 = internal::pset1<float4>(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + 
lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = 
rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + bool check_lhs64 = (base_m + 63) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template<typename Index, typename LhsMapper, + typename RhsMapper, typename OutputMapper> +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, 
rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t; + typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t; + + static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, true, true, Unaligned>(buffer); + } + 
else { + evalTyped<true, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<true, false, true, Unaligned>(buffer); + } + else { + evalTyped<true, false, false, Unaligned>(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, true, true, Unaligned>(buffer); + } + else { + evalTyped<false, true, false, Unaligned>(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped<false, false, true, Unaligned>(buffer); + } + else { + evalTyped<false, false, false, Unaligned>(buffer); + } + } + } + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, 4, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, 4, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + if (internal::is_same<LhsScalar, float>::value && + internal::is_same<RhsScalar, float>::value) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } else { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 000000000..8b87f1045 --- 
/dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,382 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { +namespace internal { + +template<typename LhsScalar, typename LhsMapper, typename Index> +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index> +struct packRhsAndKernelArg { + const std::vector<LhsScalar*>* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + std::vector<Promise>* kernel_promises; + const std::vector<Future>* lhs_futures; + const bool need_to_pack; +}; + +} // end namespace internal + + +template<typename Indices, typename LeftArgType, typename RightArgType> +struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> : + public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; + typedef TensorContractionEvaluatorBase<Self> Base; + + typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + enum { + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
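For reference, the identity that makes this swap sound: a row-major array read through column-major strides is the transpose of the intended matrix, and

  C = A * B   is equivalent to   C^T = B^T * A^T,

so running the column-major code path on the swapped operands produces C^T in memory, which, reinterpreted back as row-major, is exactly the C we wanted.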
+ typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; + static const int RDims = + internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; + static const int ContractDims = internal::array_size<Indices>::value; + + typedef array<Index, LDims> left_dim_mapper_t; + typedef array<Index, RDims> right_dim_mapper_t; + + typedef array<Index, ContractDims> contract_t; + typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t; + typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t; + + static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size; + + typedef DSizes<Index, NumDims> Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + return; + } + + evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); + } + + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = internal::packet_traits<LhsScalar>::size; + const int rhs_packet_size = internal::packet_traits<RhsScalar>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, ColMajor> LhsPacker; + typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> 
RhsPacker; + + // TODO: replace false, false with conjugate values? + typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, + Traits::mr, Traits::nr, false, false> GebpKernel; + + typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg; + typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + LhsPacker pack_lhs; + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + Index mc = m; + Index nc = n; + Index kc = k; + internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); + const int sizeA = mc * kc; + const int sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned, segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = (std::min)(num_threads, m_blocks); + std::vector<LhsScalar *> blockAs; + blockAs.reserve(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. 
con: overhead + std::vector<RhsScalar *> blockBs; + blockBs.reserve(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_futures starts with all null futures + std::vector<Future> lhs_futures(num_threads); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_promises = num_threads * n_blocks; + std::vector<Promise> kernel_promises(num_kernel_promises); + std::vector<Future> kernel_futures(num_kernel_promises); + for (int i = 0; i < kernel_promises.size(); ++i) { + kernel_promises[i].set_value(); + kernel_futures[i] = kernel_promises[i].get_future(); + } + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + for (int i = 0; i < n_blocks; ++i) { + int future_id = (blockAId * n_blocks + i); + wait_until_ready(&kernel_futures[future_id]); + kernel_promises[future_id] = Promise(); + kernel_futures[future_id] = kernel_promises[future_id].get_future(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + lhs_futures[blockAId] = + this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (int i = num_blocks; i < num_threads; ++i) { + int blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + int future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(&kernel_futures[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_promises, // kernel_promises + &lhs_futures, // lhs_futures + need_to_pack, // need_to_pack + }; + + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg); + } + } + } + + // Make sure all the kernels are done. 
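The loop above recycles promise/future pairs as reusable completion slots: every slot starts out satisfied, a consumer waits on the old future before installing a fresh pair, and the producer signals by fulfilling the promise. A self-contained sketch of the pattern (illustration only, assuming Promise and Future are std::promise<void> and std::future<void>, which is what the standard thread-pool device uses):

#include <cstddef>
#include <future>
#include <vector>

int main() {
  std::vector<std::promise<void> > promises(4);
  std::vector<std::future<void> > futures(4);
  for (std::size_t i = 0; i < promises.size(); ++i) {
    promises[i].set_value();                // mark every slot "done" initially
    futures[i] = promises[i].get_future();
  }
  futures[0].wait();                        // returns at once: already satisfied
  promises[0] = std::promise<void>();       // recycle the slot for a new round
  futures[0] = promises[0].get_future();
  // ... hand promises[0] to a worker; it calls set_value() when finished ...
  promises[0].set_value();
  futures[0].wait();                        // consumer blocks until the signal
  return 0;
}

The wait loop that follows is the shutdown case of the same pattern: block on every kernel future before tearing the block buffers down.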
+ for (int i = 0; i < kernel_futures.size(); ++i) { + wait_until_ready(&kernel_futures[i]); + } + + // deallocate all of the memory for both A and B's + for (int i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (int i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + +#undef CEIL_DIV + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. + */ + template <typename packLArg, typename LhsPacker> + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template <typename packRKArg, typename RhsPacker, typename GebpKernel> + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + + wait_until_ready(&(*arg.lhs_futures)[blockAId]); + const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0); + + const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + (*arg.kernel_promises)[set_idx].set_value(); + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 000000000..591fd2464 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,912 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
+ * + * + */ +namespace internal { + + +template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims, + const array<Index, NumKernelDims>& indices) { + + array<Index, NumDims> dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array<Index, NumDims> inputStrides; + array<Index, NumDims> outputStrides; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } else { + inputStrides[0] = 1; + outputStrides[0] = 1; + } + } + + array<Index, NumDims> cudaInputDimensions; + array<Index, NumDims> cudaOutputDimensions; + array<Index, NumDims> tmp = dimensions; + array<Index, NumDims> ordering; + for (int i = 0; i < NumKernelDims; ++i) { + ordering[i] = indices[i]; + tmp[indices[i]] = -1; + cudaInputDimensions[i] = input_dims[ordering[i]]; + cudaOutputDimensions[i] = dimensions[ordering[i]]; + } + int written = NumKernelDims; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + cudaInputDimensions[written] = input_dims[i]; + cudaOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1]; + m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + return i * m_inputStrides[0]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + return i * m_outputStrides[0]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + return i * m_inputStrides[0] + j*m_inputStrides[1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + return i * m_outputStrides[0] + j * m_outputStrides[1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + return i * m_inputStrides[0] + j*m_inputStrides[1] + 
k*m_inputStrides[2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2]; + } + + private: + static const size_t NumDims = internal::array_size<InputDims>::value; + array<Index, NumDims> m_inputStrides; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_cudaInputStrides; + array<Index, NumDims> m_cudaOutputStrides; +}; + + + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type<typename InputXprType::Scalar, + typename KernelXprType::Scalar>::ret Scalar; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind, + typename traits<KernelXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<InputXprType>::Index, + typename traits<KernelXprType>::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = traits<InputXprType>::NumDimensions; + static const int Layout = traits<InputXprType>::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> +{ + typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type; +}; + +template<typename Dimensions, typename InputXprType, typename KernelXprType> +struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> +{ + typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type; +}; + +} // end namespace internal + + + +template<typename Indices, typename InputXprType, typename KernelXprType> +class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType, + typename KernelXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType, + typename KernelXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename InputXprType::Nested>::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<typename KernelXprType::Nested>::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template<typename Indices, typename InputArgType, typename KernelArgType, typename Device> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess, + Layout = TensorEvaluator<InputArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + // Only column major tensors are supported for now. 
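+    // The dimension arithmetic below implements a "valid" (un-padded)
+    // convolution: result_dim = input_dim - kernel_dim + 1, e.g. an input
+    // dimension of 20 convolved with a kernel dimension of 5 yields 16.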
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; + } + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1<PacketReturnType>(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_DEFAULT Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload<PacketReturnType>(data); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + 
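+    // Worked example (column-major): with output dimensions (16, 24, 3) the
+    // output strides are (1, 16, 384), so index 500 = 1*384 + 7*16 + 4
+    // decomposes into coordinates (4, 7, 1). The loop below performs this
+    // decomposition and re-linearizes the coordinates with the input strides
+    // to locate the first input coefficient of the convolution window.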
for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template <typename Packet> + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum); + } + } + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array<Index, NumDims> m_inputStride; + array<Index, NumDims> m_outputStride; + + array<Index, NumKernelDims> m_indexStride; + array<Index, NumKernelDims> m_kernelStride; + TensorEvaluator<InputArgType, Device> m_inputImpl; + TensorEvaluator<KernelArgType, Device> m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +template <int StaticKernelSize> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize<Dynamic> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + + + + +template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize> +__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + + +template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY> +__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + + +template <typename InputEvaluator, typename Index, typename InputDims> +__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { + extern __shared__ float s[]; + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template<typename Indices, typename InputArgType, typename KernelArgType> +struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> +{ + typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; + + static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value; + static const int NumKernelDims = internal::array_size<Indices>::value; + typedef typename XprType::Index Index; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + // Only column major tensors are supported for now. 
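+    // Note on the kernels above: per convolved dimension, a block computing
+    // N outputs must stage N + kernel_size - 1 inputs in shared memory; e.g.
+    // maxX = 32 outputs with kernelSizeX = 5 loads 36 inputs, each of which
+    // is reused by up to 5 adjacent output positions.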
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp<const KernelArgType> EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor<const EvalTo, GpuDevice, TensorEvaluator<KernelArgType, GpuDevice>::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; + + const int maxSharedMem = sharedMemPerBlock(); + const int maxThreadsPerBlock = maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = getNumCudaMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + + int maxX; + dim3 block_size; + if (m_indices[0] == 0) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = (std::min<int>)(inner_dim, numX); + const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = (std::min)(maxThreadsPerBlock, maxX); + block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside 
the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = (std::min<int>)(inner_dim, numP); + maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = (std::min)(warpSize, maxX); + block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, min<int>(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 1> indices(m_indices[0]); + const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int kernel_size_x = m_kernelImpl.dimensions()[0]; + const int kernel_size_y = m_kernelImpl.dimensions()[1]; + + const int numX = dimensions()[m_indices[0]]; + const int numY = dimensions()[m_indices[1]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = (std::min<int>)(inner_dim, numX); + const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = (std::min)(1024, maxX); + block_size.y = (std::min<int>)(1024/block_size.x, maxY); + block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks 
= ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, min<int>(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array<Index, 2> indices(m_indices[0], m_indices[1]); + const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]); + internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int kernel_size_x = m_kernelImpl.dimensions()[0]; + const int kernel_size_y = m_kernelImpl.dimensions()[1]; + const int kernel_size_z = m_kernelImpl.dimensions()[2]; + + const int numX = dimensions()[m_indices[0]]; + const int numY = dimensions()[m_indices[1]]; + const int numZ = dimensions()[m_indices[2]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = (std::min)(32, maxX); + block_size.y = (std::min)(32, maxY); + block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem 
= (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]); + const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]); + internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + assert(false && "not supported yet"); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator<InputArgType, GpuDevice> m_inputImpl; + TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 000000000..649bdb308 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,126 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: thread pools. + * Todo: operator +=, -=, *= and so on. 
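+ *
+ * The thread-pool case is covered by the ThreadPoolDevice specialization
+ * below; a usage sketch (with EIGEN_USE_THREADS defined, assuming a pool
+ * backed by four cores):
+ *
+ *   Eigen::ThreadPoolDevice my_pool(4);
+ *   C.device(my_pool) = A + B;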
+ */ + +template <typename ExpressionType, typename DeviceType> class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; + internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess; + internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + + +#ifdef EIGEN_USE_THREADS +template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> { + public: + TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; + internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess; + internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device); + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; +}; +#endif + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> +{ + public: + TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign; + Assign assign(m_expression, other); + internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device); + return *this; + } + + template<typename OtherDerived> + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum; + Sum 
sum(m_expression, other); + typedef TensorAssignOp<ExpressionType, const Sum> Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device); + return *this; + } + + protected: + const GpuDevice& m_device; + ExpressionType m_expression; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h new file mode 100644 index 000000000..efd207507 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -0,0 +1,190 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H + + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return 1; + } +}; + + +// Multiple cpu cores +// We should really use a thread pool here but first we need to find a portable thread pool library. +#ifdef EIGEN_USE_THREADS + +typedef std::future<void> Future; +typedef std::promise<void> Promise; + +static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) { + f->wait(); +} +static EIGEN_STRONG_INLINE void get_when_ready(Future* f) { + f->get(); +} + + +struct ThreadPoolDevice { + ThreadPoolDevice(size_t num_cores) : num_threads_(num_cores) { } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + template <class Function, class... Args> + EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const { + return std::async(std::launch::async, f, args...); + } + template <class Function, class... Args> + EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... 
args) const { + std::async(std::launch::async, f, args...); + } + + private: + size_t num_threads_; +}; + +#endif + + +// GPU offloading +#ifdef EIGEN_USE_GPU +static cudaDeviceProp m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess); + m_devicePropInitialized = true; + } +} + +static inline int getNumCudaMultiProcessors() { + initializeDeviceProp(); + return m_deviceProperties.multiProcessorCount; +} +static inline int maxCudaThreadsPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerBlock; +} +static inline int maxCudaThreadsPerMultiProcessor() { + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerMultiProcessor; +} +static inline int sharedMemPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.sharedMemPerBlock; +} + +static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + assert(status == cudaSuccess); +} + +struct GpuDevice { + // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. + GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } + + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { +#ifndef __CUDA_ARCH__ + void* result; + assert(cudaMalloc(&result, num_bytes) == cudaSuccess); + assert(result != NULL); + return result; +#else + assert(false && "The default device should be used instead to generate kernel code"); + return NULL; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { +#ifndef __CUDA_ARCH__ + assert(buffer != NULL); + assert(cudaFree(buffer) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + assert(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + assert(cudaMemsetAsync(buffer, c, n, *stream_) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { + cudaStreamSynchronize(*stream_); + } + + private: + // TODO: multigpu. + const cudaStream_t* stream_; +}; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) 
\ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); + +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 000000000..2ad52b2f9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,380 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Can't use std::pair on cuda devices +template <typename Index> struct IndexPair { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } + Index first; + Index second; +}; + +// Boilerplate code +namespace internal { + +template<std::size_t n, typename Dimension> struct dget { + static const std::size_t value = get<n, Dimension>::value; +}; + + +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct fixed_size_tensor_index_linearization_helper +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(array<Index, NumIndices> const& indices, + const Dimensions& dimensions) + { + return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + + dget<RowMajor ? n : (NumIndices - n - 1), Dimensions>::value * + fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + template <typename Dimensions> EIGEN_DEVICE_FUNC + static inline Index run(array<Index, NumIndices> const& indices, + const Dimensions&) + { + return array_get<RowMajor ? 0 : NumIndices - 1>(indices); + } +}; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template <typename std::size_t... 
Indices> +struct Sizes : internal::numeric_list<std::size_t, Indices...> { + typedef internal::numeric_list<std::size_t, Indices...> Base; + static const std::size_t total_size = internal::arg_prod(Indices...); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return Base::count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() { + return internal::arg_prod(Indices...); + } + + Sizes() { } + template <typename DenseIndex> + explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template <typename... DenseIndex> Sizes(DenseIndex...) { } + explicit Sizes(std::initializer_list<std::size_t> /*l*/) { + // todo: add assertion + } +#endif + + template <typename T> Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this)); + } + template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + +template <typename std::size_t... Indices> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<Indices...>&) { + return Sizes<Indices...>::total_size; +} + +#else + +template <std::size_t n> +struct non_zero_size { + typedef internal::type2val<std::size_t, n> type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes { + typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod<Base>::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { + return internal::arg_prod<Base>::value; + } + + Sizes() { } + template <typename DenseIndex> + explicit Sizes(const array<DenseIndex, Base::count>& indices) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template <typename... DenseIndex> Sizes(DenseIndex... 
indices) { }
+  explicit Sizes(std::initializer_list<std::size_t> l) {
+    // todo: add assertion
+  }
+#else
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) {
+  }
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) {
+  }
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+  }
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+  }
+  EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+  }
+#endif
+
+  template <typename T> Sizes& operator = (const T& other) {
+    // todo: check the size of other
+    return *this;
+  }
+
+  template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this));
+  }
+  template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this));
+  }
+};
+
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+  return Sizes<V1, V2, V3, V4, V5>::total_size;
+}
+
+#endif
+
+// Boilerplate
+namespace internal {
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+struct tensor_index_linearization_helper
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
+  {
+    return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+        array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
+        tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+  }
+};
+
+template<typename Index, std::size_t NumIndices, bool RowMajor>
+struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
+  {
+    return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+  }
+};
+} // end namespace internal
+
+
+
+// Dynamic size
+template <typename DenseIndex, std::size_t NumDims>
+struct DSizes : array<DenseIndex, NumDims> {
+  typedef array<DenseIndex, NumDims> Base;
+  static const std::size_t count = NumDims;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+    return NumDims;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
+    return internal::array_prod(*static_cast<const Base*>(this));
+  }
+
+  EIGEN_DEVICE_FUNC DSizes() {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = 0;
+    }
+  }
+  EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+  template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes...
otherDimensions) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + (*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}}; + } +#else + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } +#endif + + EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) { + *static_cast<Base*>(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const { + return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this)); + } +}; + + + + +// Boilerplate +namespace internal { +template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor> +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions) + { + return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) + + array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) * + tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions); + } +}; + +template<typename Index, std::size_t NumIndices, bool RowMajor> +struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor> +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&) + { + return array_get<RowMajor ? 0 : NumIndices - 1>(indices); + } +}; +} // end namespace internal + + +namespace internal { + +template <typename DenseIndex, std::size_t NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > { + static const size_t value = NumDims; +}; +template <typename DenseIndex, std::size_t NumDims> struct array_size<DSizes<DenseIndex, NumDims> > { + static const size_t value = NumDims; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template <typename std::size_t... Indices> struct array_size<const Sizes<Indices...> > { +static const size_t value = Sizes<Indices...>::count; +}; +template <typename std::size_t... Indices> struct array_size<Sizes<Indices...> > { +static const size_t value = Sizes<Indices...>::count; +}; +template <std::size_t n, typename std::size_t... 
Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<Indices...>&) { + return get<n, internal::numeric_list<std::size_t, Indices...> >::value; +} +#else +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > { + static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; +}; +template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > { + static const size_t value = Sizes<V1,V2,V3,V4,V5>::count; +}; +template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) { + return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value; +} + +#endif + + +template <typename Dims1, typename Dims2, size_t n> +struct sizes_match_up_to_dim { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get<n>(dims1) == array_get<n>(dims2)) & + sizes_match_up_to_dim<Dims1, Dims2, n-1>::run(dims1, dims2); + } +}; +template <typename Dims1, typename Dims2> +struct sizes_match_up_to_dim<Dims1, Dims2, 0> { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get<0>(dims1) == array_get<0>(dims2)); + } +}; + +} // end namespace internal + + +template <typename Dims1, typename Dims2> +bool dimensions_match(Dims1& dims1, Dims2& dims2) { + if (internal::array_size<Dims1>::value != internal::array_size<Dims2>::value) { + return false; + } + return internal::sizes_match_up_to_dim<Dims1, Dims2, internal::array_size<Dims1>::value-1>::run(dims1, dims2); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 000000000..883e6cab1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorEvalToOp + * \ingroup CXX11_Tensor_Module + * + * \brief Expression node that evaluates its argument into a user-provided buffer. + * + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorEvalToOp<XprType> > +{
+ typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename XprType> +struct eval<TensorEvalToOp<XprType>, Eigen::Dense> +{ + typedef const TensorEvalToOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> >::type> +{ + typedef TensorEvalToOp<XprType> type; +}; + +} // end namespace internal + + + + +template<typename XprType> +class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorEvalToOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested; + typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + CoeffReturnType* m_buffer; +}; + + + +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> +{ + typedef TensorEvalToOp<ArgType> XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename XprType::Index Index; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? 
Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template<int LoadMode> + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt<Packet, LoadMode>(m_buffer + index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + CoeffReturnType* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 000000000..d084880de --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,427 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + */ + +// Generic evaluator +template<typename Derived, typename Device> +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? 
+ internal::traits<Derived>::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(const_cast<Scalar*>(m.data())), m_dims(m.dimensions()), m_device(device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + return false; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return m_data[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); + return m_data[index]; + } + + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt<Packet, LoadMode>(m_data + index); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const Packet& x) + { + return internal::pstoret<Scalar, Packet, StoreMode>(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { + eigen_assert(m_data); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) { + eigen_assert(m_data); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + protected: + Scalar* m_data; + Dimensions m_dims; + const Device& m_device; +}; + + +// Default evaluator for rvalues +template<typename Derived, typename Device> +struct TensorEvaluator<const Derived, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? 
+ internal::traits<Derived>::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) + : m_data(m.data()), m_dims(m.dimensions()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else + return m_data[index]; +#endif + } + + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt_ro<Packet, LoadMode>(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { + eigen_assert(m_data); + const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else + return m_data[index]; +#endif + } + + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } + + protected: + const Scalar* m_data; + Dimensions m_dims; +}; + + + + +// -------------------- CwiseNullaryOp -------------------- + +template<typename NullaryOp, typename ArgType, typename Device> +struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> +{ + typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename internal::traits<XprType>::Packet PacketReturnType; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const NullaryOp m_functor; + TensorEvaluator<ArgType, Device> m_argImpl; +}; + + + +// -------------------- CwiseUnaryOp -------------------- + +template<typename UnaryOp, typename ArgType, typename Device> +struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> +{ + typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & 
internal::functor_traits<UnaryOp>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename internal::traits<XprType>::Packet PacketReturnType; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const UnaryOp m_functor; + TensorEvaluator<ArgType, Device> m_argImpl; +}; + + +// -------------------- CwiseBinaryOp -------------------- + +template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device> +struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device> +{ + typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType; + + enum { + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess & + internal::functor_traits<BinaryOp>::PacketAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename internal::traits<XprType>::Packet PacketReturnType; + typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. 
+ return m_leftImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const BinaryOp m_functor; + TensorEvaluator<LeftArgType, Device> m_leftImpl; + TensorEvaluator<RightArgType, Device> m_rightImpl; +}; + + +// -------------------- SelectOp -------------------- + +template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device> +struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> +{ + typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* & + TensorEvaluator<IfArgType>::PacketAccess*/, + Layout = TensorEvaluator<IfArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename internal::traits<XprType>::Packet PacketReturnType; + typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + internal::Selector<PacketSize> select; + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet<LoadMode>(index), + m_elseImpl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + TensorEvaluator<IfArgType, Device> m_condImpl; + TensorEvaluator<ThenArgType, Device> m_thenImpl; + TensorEvaluator<ElseArgType, Device> m_elseImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 000000000..05ac9bd2f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,244 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launching the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +template <typename Device, typename Expression> +struct IsVectorizable { + static const bool value = TensorEvaluator<Expression, Device>::PacketAccess; +};
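For reference, a minimal sketch of how client code can drive this executor by hand, mirroring the TensorAssignOp pattern that TensorFixedSize::operator= uses later in this patch. It assumes a C++11 compiler, the module header added by this series, and the cwise operator+ provided by TensorBase; the rank-2 float tensors and the helper name assign_sum are illustrative only, not part of the patch:

// Hand-rolled equivalent of `c = a + b` on the default single-threaded device.
// Assumes the CXX11 Tensor module from this patch is reachable via this header.
#include <unsupported/Eigen/CXX11/Tensor>

void assign_sum(Eigen::Tensor<float, 2>& c,
                const Eigen::Tensor<float, 2>& a,
                const Eigen::Tensor<float, 2>& b) {
  // c must already have the same dimensions as a and b.
  // Wrap lhs and rhs in an assignment expression node...
  typedef Eigen::TensorAssignOp<Eigen::Tensor<float, 2>, const decltype(a + b)> Assign;
  Assign assign(c, a + b);
  // ...and let the executor evaluate it on the default device.
  Eigen::internal::TensorExecutor<const Assign, Eigen::DefaultDevice>::run(assign, Eigen::DefaultDevice());
}

+ +// Default strategy: the expression is evaluated with a single cpu thread.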
+template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = IsVectorizable<Device, Expression>::value> +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator<Expression, Device> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + +template<typename Expression> +class TensorExecutor<Expression, DefaultDevice, true> +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; + const Index VectorizedSize = (size / PacketSize) * PacketSize; + + for (Index i = 0; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template <typename Evaluator, typename Index, bool Vectorizable = Evaluator::PacketAccess> +struct EvalRange { + static void run(Evaluator evaluator, const Index first, const Index last) { + eigen_assert(last > first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template <typename Evaluator, typename Index> +struct EvalRange<Evaluator, Index, true> { + static void run(Evaluator evaluator, const Index first, const Index last) { + eigen_assert(last > first); + + Index i = first; + static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + if (last - first > PacketSize) { + eigen_assert(first % PacketSize == 0); + Index lastPacket = last - (last % PacketSize); + for (; i < lastPacket; i += PacketSize) { + evaluator.evalPacket(i); + } + } + + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template<typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + + static const int PacketSize = Vectorizable ? 
unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; + + int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + std::vector<Future> results; + results.reserve(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize)); + } + + if (numblocks * blocksize < size) { + EvalRange<Evaluator, Index>::run(evaluator, numblocks * blocksize, size); + } + + for (int i = 0; i < numblocks; ++i) { + get_when_ready(&results[i]); + } + + } + evaluator.cleanup(); + } +}; +#endif + + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) +EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Use the scalar path + for (Index i = first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) +EigenMetaKernel_Vectorizable(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Use the vector path + const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const Index vectorized_step_size = step_size * PacketSize; + const Index vectorized_size = (size / PacketSize) * PacketSize; + for (Index i = first_index * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +template <typename Expression> +struct IsVectorizable<GpuDevice, Expression> { + static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && TensorEvaluator<Expression, GpuDevice>::IsAligned; +}; + +template<typename Expression> +class TensorExecutor<Expression, GpuDevice, false> +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); + } +}; + +template<typename Expression> +class TensorExecutor<Expression, GpuDevice, true> +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size =
maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 000000000..b66b3ec2c --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,300 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operator to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to an lhs and an rhs expression. + * + */ +namespace internal { +template<typename NullaryOp, typename XprType> +struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> > + : traits<XprType> +{ + typedef typename XprType::Packet Packet; + typedef traits<XprType> XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +} // end namespace internal + + + +template<typename NullaryOp, typename XprType> +class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + + + +namespace internal { +template<typename UnaryOp, typename XprType> +struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> > + :
traits<XprType> +{ + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar; + typedef traits<XprType> XprTraits; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename UnaryOp, typename XprType> +struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type; +}; + +template<typename UnaryOp, typename XprType> +struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type> +{ + typedef TensorCwiseUnaryOp<UnaryOp, XprType> type; +}; + +} // end namespace internal + + + +template<typename UnaryOp, typename XprType> +class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > +{ + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
+ typedef typename result_of< + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits<LhsXprType> XprTraits; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type< + typename traits<LhsXprType>::StorageKind, + typename traits<RhsXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits<LhsXprType>::Index, + typename traits<RhsXprType>::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference<LhsNested>::type _LhsNested; + typedef typename remove_reference<RhsNested>::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type; +}; + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type> +{ + typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type; +}; + +} // end namespace internal + + + +template<typename BinaryOp, typename LhsXprType, typename RhsXprType> +class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename LhsXprType::Nested>::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename RhsXprType::Nested>::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + + +namespace internal { +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > + : traits<ThenXprType> +{ + typedef typename traits<ThenXprType>::Scalar Scalar; + typedef traits<ThenXprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind, + typename traits<ElseXprType>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<ElseXprType>::Index, + typename traits<ThenXprType>::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense> +{ + typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type; +}; + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type> +{ + typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type; +}; + +} // end namespace internal + + +template<typename IfXprType, typename ThenXprType, typename ElseXprType> +class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorSelectOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType, + typename ElseXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::promote_storage_type<typename ThenXprType::PacketReturnType, + typename ElseXprType::PacketReturnType>::ret PacketReturnType; + typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested; + typedef 
typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index; + + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + const IfXprType& ifExpression() const { return m_condition; } + + const ThenXprType& thenExpression() const { return m_then; } + + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 000000000..94b3f957b --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,253 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed-size version of the tensor class. + * + * The fixed-size equivalent of + * Eigen::Tensor<float, 3> t(3, 5, 7); + * is + * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t; + */ + +template<typename Scalar_, typename Dimensions_, int Options_> +class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> > +{ + public: + typedef TensorFixedSize<Scalar_, Dimensions_, Options_> Self; + typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> > Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<Self>::StorageKind StorageKind; + typedef typename internal::traits<Self>::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_ALIGN), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = Options_ & RowMajor ?
RowMajor : ColMajor, + CoordAccess = true, + }; + + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage<Scalar, NumIndices, Dimensions::total_size, Options, Dimensions> m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> + inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAVE_RVALUE_REFERENCES + inline TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. + typedef TensorAssignOp<Self, const TensorFixedSize> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
+ typedef TensorAssignOp<Self, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 000000000..41a36cb75 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,151 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +/** \class TensorForcedEvalOp + * \ingroup CXX11_Tensor_Module + * + * \brief Expression node that forces the evaluation of its argument into a temporary buffer. + * + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorForcedEvalOp<XprType> > +{
+ typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0, + }; +}; + +template<typename XprType> +struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> +{ + typedef const TensorForcedEvalOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type> +{ + typedef TensorForcedEvalOp<XprType> type; +}; + +} // end namespace internal + + + +template<typename XprType> +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> +{ + typedef TensorForcedEvalOp<ArgType> XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = TensorEvaluator<ArgType, Device>::Layout, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + const Index numValues = m_impl.dimensions().TotalSize(); + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. 
+ if (!internal::is_arithmetic<CoeffReturnType>::value) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } + typedef TensorEvalToOp<const ArgType> EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<ArgType, Device>::PacketAccess>::run(evalToTmp, m_device); + m_impl.cleanup(); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template<int LoadMode> + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt<Packet, LoadMode>(m_buffer + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } + + private: + TensorEvaluator<ArgType, Device> m_impl; + const ArgType m_op; + const Device& m_device; + CoeffReturnType* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 000000000..7bec2b10a --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0> class Tensor; +template<typename Scalar_, typename Dimensions, int Options_ = 0> class TensorFixedSize; +template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap; +template<typename PlainObjectType> class TensorRef; +template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase; + +template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp; +template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp; +template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp; +template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp; +template<typename Op, typename Dims, typename XprType> class TensorReductionOp; +template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp; +template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp; +template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp; +template<typename PatchDim, typename XprType> class TensorPatchOp; +template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp; +template<typename Broadcast, typename XprType> class TensorBroadcastingOp; +template<DenseIndex DimId, typename XprType> class TensorChippingOp; +template<typename NewDimensions, typename XprType> class TensorReshapingOp; +template<typename XprType> class TensorLayoutSwapOp; +template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp; +template<typename 
ReverseDimensions, typename XprType> class TensorReverseOp; +template<typename PaddingDimensions, typename XprType> class TensorPaddingOp; +template<typename Shuffle, typename XprType> class TensorShufflingOp; +template<typename Strides, typename XprType> class TensorStridingOp; +template<typename LeftXprType, typename RightXprType> class TensorAssignOp; + +template<typename XprType> class TensorEvalToOp; +template<typename XprType> class TensorForcedEvalOp; + +template<typename ExpressionType, typename DeviceType> class TensorDevice; +template<typename Derived, typename Device> struct TensorEvaluator; + +namespace internal { +template<typename Expression, typename Device, bool Vectorizable> class TensorExecutor; +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 000000000..38586d067 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,338 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + +// Standard reduction functors +template <typename T> struct SumReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) += t; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return saccum + predux(vaccum); + } +}; + +template <typename T> struct MeanReducer +{ + static const bool PacketAccess = true; + MeanReducer() : scalarCount_(0), packetCount_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + (*accum) += t; + scalarCount_++; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd<Packet>(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(0); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum / scalarCount_; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return 
pdiv(vaccum, pset1<Packet>(packetCount_)); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits<Packet>::size); + } + + protected: + int scalarCount_; + int packetCount_; +}; + +template <typename T> struct MaxReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t > *accum) { *accum = t; } + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmax<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return -(std::numeric_limits<T>::max)(); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(-(std::numeric_limits<T>::max)()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (std::max)(saccum, predux_max(vaccum)); + } +}; + +template <typename T> struct MinReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t < *accum) { *accum = t; } + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmin<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return (std::numeric_limits<T>::max)(); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>((std::numeric_limits<T>::max)()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (std::min)(saccum, predux_min(vaccum)); + } +}; + + +template <typename T> struct ProdReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) *= t; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul<Packet>(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast<T>(1); + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1<Packet>(1); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return saccum * predux_mul(vaccum); + } +}; + +#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || 
!defined(__CUDA_ARCH__) +// We're not compiling a cuda kernel +template <typename T> struct UniformRandomGenerator { + + static const bool PacketAccess = true; + + template<typename Index> + T operator()(Index, Index = 0) const { + return random<T>(); + } + template<typename Index> + typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits<T>::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = random<T>(); + } + return internal::pload<typename internal::packet_traits<T>::type>(values); + } +}; + +#else + +// We're compiling a cuda kernel +template <typename T> struct UniformRandomGenerator; + +template <> struct UniformRandomGenerator<float> { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC UniformRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + + template<typename Index> EIGEN_DEVICE_FUNC + float operator()(Index, Index = 0) const { + return curand_uniform(&m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + float4 packetOp(Index, Index = 0) const { + return curand_uniform4(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> struct UniformRandomGenerator<double> { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC UniformRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + double operator()(Index, Index = 0) const { + return curand_uniform_double(&m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + double2 packetOp(Index, Index = 0) const { + return curand_uniform2_double(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711 +// We're not compiling a cuda kernel +template <typename T> struct NormalRandomGenerator { + + static const bool PacketAccess = true; + + NormalRandomGenerator() : m_distribution(0, 1) {} + NormalRandomGenerator(const NormalRandomGenerator& other) : m_distribution(other.m_distribution) { } + + template<typename Index> + T operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template<typename Index> + typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits<T>::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_distribution(m_generator); + } + return internal::pload<typename internal::packet_traits<T>::type>(values); + } + + mutable std::normal_distribution<T> m_distribution; + mutable std::default_random_engine m_generator; +}; + +#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) + +// We're compiling a cuda kernel +template <typename T> struct NormalRandomGenerator; + +template <> struct NormalRandomGenerator<float> { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC NormalRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + + template<typename Index> EIGEN_DEVICE_FUNC + float operator()(Index, Index = 0) const { + return curand_normal(&m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + float4 packetOp(Index, Index = 0) const { + return curand_normal4(&m_state); + } + + 
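+ // Note: m_state is declared mutable below because operator() and packetOp are const yet must advance the Philox counter state on every draw.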
private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> struct NormalRandomGenerator<double> { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC NormalRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + double operator()(Index, Index = 0) const { + return curand_normal_double(&m_state); + } + template<typename Index> EIGEN_DEVICE_FUNC + double2 packetOp(Index, Index = 0) const { + return curand_normal2_double(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 000000000..a9d0f6c39 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,53 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +namespace internal { +template<> +struct significant_decimals_impl<std::string> + : significant_decimals_default_impl<std::string, true> +{}; +} + + +template <typename T> +std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) { + // Evaluate the expression if needed + TensorForcedEvalOp<const T> eval = expr.eval(); + TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + typedef typename internal::remove_const<typename T::Scalar>::type Scalar; + typedef typename T::Index Index; + typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions; + const Index total_size = internal::array_prod(tensor.dimensions()); + + // Print the tensor as a 1d vector or a 2d matrix. + if (internal::array_size<Dimensions>::value == 1) { + Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size); + os << array; + } else { + const Index first_dim = tensor.dimensions()[0]; + static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout; + Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim); + os << matrix; + } + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 000000000..bf0e7edfb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,382 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has at least 3 dimensions ordered as follows: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing) after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. + */ +namespace internal { +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense> +{ + typedef const TensorImagePatchOp<Rows, Cols, XprType>& type; +}; + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type> +{ + typedef TensorImagePatchOp<Rows, Cols, XprType> type; +}; + +} // end namespace internal + +template<DenseIndex Rows, DenseIndex Cols, typename XprType> +class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + PaddingType padding_type) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_padding_type(padding_type) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + 
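+ // Note: this expression class only stores the patch parameters; all of the indexing logic lives in the TensorEvaluator specialization below.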
protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const PaddingType m_padding_type; +}; + + +// Eval as rvalue +template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device> +struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> +{ + typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = NumDims == 5, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + // Only column major tensors are supported for now. + EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // We only support the same strides for both dimensions and square patches. + eigen_assert(m_row_strides == m_col_strides); + + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = ceil((m_inputRows - op.patch_rows() + 1.f) / static_cast<float>(m_row_strides)); + m_outputCols = ceil((m_inputCols - op.patch_cols() + 1.f) / static_cast<float>(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2; + break; + case PADDING_SAME: + m_outputRows = ceil(m_inputRows / static_cast<float>(m_row_strides)); + m_outputCols = ceil(m_inputCols / static_cast<float>(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2; + break; + default: + eigen_assert(false && "unexpected padding"); + } + + // Dimensions for result of extraction. + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + + // Strides for moving the patch in various dimensions. + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + + // Strides for navigating through the input tensor. + m_rowInputStride = input_dims[0]; + m_colInputStride = input_dims[0] * input_dims[1]; + m_patchInputStride = input_dims[0] * input_dims[1] * input_dims[2]; + + // Fast representations of different variables. 
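+ // Each TensorIntDivisor precomputes a multiplier/shift pair (see TensorIntDiv.h) so that the per-coefficient divisions in coeff() and packet() avoid hardware integer division.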
+ m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + // Number of patches in the height dimension. + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + m_fastDimZero = internal::TensorIntDivisor<Index>(m_dimensions[0]); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastDimZero; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + + // Calculate the col index in the original input tensor. + const Index inputCol = colIndex * m_col_strides + colOffset - m_colPaddingLeft; + if (inputCol < 0 || inputCol >= m_inputCols) { + return Scalar(0); + } + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + + // Calculate row index in the original input tensor. + const Index inputRow = rowIndex * m_row_strides + rowOffset - m_rowPaddingTop; + if (inputRow < 0 || inputRow >= m_inputRows) { + return Scalar(0); + } + + const Index depth = index - (index / m_fastDimZero) * m_dimensions[0]; + + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastDimZero, + (indices[1] - patchIndex * m_patchStride) / m_fastDimZero}; + + const Index patch2DIndex = (NumDims == 4) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + // all zeros + return internal::pset1<PacketReturnType>(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate row indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + // all zeros + return internal::pset1<PacketReturnType>(Scalar(0)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const Index depth = index - (index / m_fastDimZero) * m_dimensions[0]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet<Unaligned>(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // Location of the first element of the patch. + // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches + const Index patchIndex = coords[3]; + + array<Index, NumDims-1> inputCoords; + inputCoords[0] = coords[0]; // depth + inputCoords[1] = patchIndex / m_inputCols + coords[1] - m_rowPaddingTop; + inputCoords[2] = patchIndex - patchIndex / m_inputCols * m_inputCols + coords[2] - m_colPaddingLeft; + inputCoords[3] = coords[4]; // batch + // If the computed coordinates are outside the original image perimeter, return 0. 
+ if (inputCoords[1] < 0 || inputCoords[1] >= m_inputRows || + inputCoords[2] < 0 || inputCoords[2] >= m_inputCols) { + return Scalar(0); + } + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex = + inputCoords[3] * m_patchInputStride + + inputCoords[2] * m_colInputStride + + inputCoords[1] * m_rowInputStride + + inputCoords[0]; + return m_impl.coeff(inputIndex); + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + internal::TensorIntDivisor<Index> m_fastOtherStride; + internal::TensorIntDivisor<Index> m_fastPatchStride; + internal::TensorIntDivisor<Index> m_fastColStride; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor<Index> m_fastOutputRows; + internal::TensorIntDivisor<Index> m_fastDimZero; + + TensorEvaluator<ArgType, Device> m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 000000000..eed0a9f05 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,419 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +#ifdef EIGEN_HAS_CONSTEXPR + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a C++11-compliant compiler. If your compiler + * is older, you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. 
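+ * + * A minimal sketch of the intended usage (shown in isolation; the surrounding tensor expression is omitted): + * Eigen::IndexList<Eigen::type2index<0>, int> dims; + * dims.set(1, 2); // index 0 is fixed to 0 at compile time, index 1 is set at runtime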
+ * + * \sa Tensor + */ + +template <DenseIndex n> +struct type2index { + static const DenseIndex value = n; + constexpr operator DenseIndex() const { return n; } + void set(DenseIndex val) { + eigen_assert(val == n); + } +}; + +namespace internal { +template <typename T> +void update_value(T& val, DenseIndex new_val) { + val = new_val; +} +template <DenseIndex n> +void update_value(type2index<n>& val, DenseIndex new_val) { + val.set(new_val); +} + +template <typename T> +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template <DenseIndex idx> +struct is_compile_time_constant<type2index<idx> > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<const type2index<idx> > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<type2index<idx>& > { + static constexpr bool value = true; +}; +template <DenseIndex idx> +struct is_compile_time_constant<const type2index<idx>& > { + static constexpr bool value = true; +}; + +template <DenseIndex Idx> +struct tuple_coeff { + template <typename... T> + static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) { + return std::get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); + } + template <typename... T> + static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) { + if (i == Idx) { + update_value(std::get<Idx>(t), value); + } else { + tuple_coeff<Idx-1>::set(i, t, value); + } + } + + template <typename... T> + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) { + return ((i == Idx) & is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value) || + tuple_coeff<Idx-1>::value_known_statically(i, t); + } + + template <typename... T> + static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value && + tuple_coeff<Idx-1>::values_up_to_known_statically(t); + } + + template <typename... T> + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) { + return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value && + is_compile_time_constant<typename std::tuple_element<Idx-1, std::tuple<T...> >::type>::value && + std::get<Idx>(t) > std::get<Idx-1>(t) && + tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t); + } +}; + +template <> +struct tuple_coeff<0> { + template <typename... T> + static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return std::get<0>(t) * (i == 0); + } + template <typename... T> + static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) { + eigen_assert (i == 0); + update_value(std::get<0>(t), value); + } + template <typename... T> + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>&) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value & (i == 0); + } + + template <typename... T> + static constexpr bool values_up_to_known_statically(const std::tuple<T...>&) { + return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value; + } + + template <typename... 
T> + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>&) { + return true; + } +}; +} // namespace internal + + +template<typename FirstType, typename... OtherTypes> +struct IndexList : std::tuple<FirstType, OtherTypes...> { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value); + } + + constexpr IndexList(const std::tuple<FirstType, OtherTypes...>& other) : std::tuple<FirstType, OtherTypes...>(other) { } + constexpr IndexList() : std::tuple<FirstType, OtherTypes...>() { } + + constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this); + } + constexpr bool all_values_known_statically() const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this); + } + + constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this); + } +}; + + +template<typename FirstType, typename... OtherTypes> +constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) { + return std::make_tuple(val1, other_vals...); +} + + +namespace internal { + +template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { + size_t result = 1; + for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) { + result *= sizes[i]; + } + return result; +}; + +template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; +template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; + +template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) { + return std::get<n>(a); +} +template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) { + return std::get<n>(a); +} + +template <typename T> +struct index_known_statically { + constexpr bool operator() (DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_known_statically<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i); + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_known_statically<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i); + } +}; + +template <typename T> +struct all_indices_known_statically { + constexpr bool operator() () const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct all_indices_known_statically<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct all_indices_known_statically<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().all_values_known_statically(); + } +}; + +template <typename T> +struct indices_statically_known_to_increase { + constexpr bool operator() () const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct indices_statically_known_to_increase<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct indices_statically_known_to_increase<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() () const { + return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase(); + } +}; + +template <typename Tx> +struct index_statically_eq { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_eq<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] == value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_eq<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] == value); + } +}; + +template <typename T> +struct index_statically_ne { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_ne<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] != value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_ne<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] != value); + } +}; + + +template <typename T> +struct index_statically_gt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_statically_gt<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] > value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_gt<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] > value); + } +}; + +template <typename T> +struct index_statically_lt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_lt<IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] < value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_statically_lt<const IndexList<FirstType, OtherTypes...> > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexList<FirstType, OtherTypes...>()[i] < value); + } +}; + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +// No C++11 support +template <typename T> +struct index_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct all_indices_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template <typename T> +struct indices_statically_known_to_increase { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template <typename T> +struct index_statically_eq { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_ne { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_gt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template <typename T> +struct index_statically_lt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 000000000..4303e3536 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,70 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + +#include <initializer_list> + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template <typename Derived, int N> +struct Initializer { + typedef std::initializer_list< + typename Initializer<Derived, N - 1>::InitList> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (auto v : vals) { + (*indices)[traits<Derived>::NumDimensions - N] = i++; + Initializer<Derived, N - 1>::run(tensor, indices, v); + } + } +}; + +template <typename Derived> +struct Initializer<Derived, 1> { + typedef std::initializer_list<typename traits<Derived>::Scalar> InitList; + + static void run(TensorEvaluator<Derived, DefaultDevice>& tensor, + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (auto v : vals) { + (*indices)[traits<Derived>::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template <typename Derived, int N> +void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor, + const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) { + Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices; + Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 000000000..2714117ab --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,86 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. 
+ * (at http://dx.doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +template <typename T> +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 1 <= divider <= 2^31-1 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = 32; + eigen_assert(divider > 0); + eigen_assert(divider <= (1<<(N-1)) - 1); + + // Fast floor(log2(divider)), computed from the number of leading zeros. +#ifndef __CUDA_ARCH__ + const int leading_zeros = __builtin_clz(divider); +#else + const int leading_zeros = __clz(divider); +#endif + const int log_div = N - (leading_zeros+1); + + multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1; + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator <= 2^32-1 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + const int N = 32; + eigen_assert(numerator >= 0); + eigen_assert(numerator <= (1ull<<N) - 1); + + uint32_t t1 = (multiplier * numerator) >> 32; + uint32_t t = (static_cast<uint32_t>(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + uint64_t multiplier; + int32_t shift1; + int32_t shift2; +}; + + +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) { + return divisor.divide(numerator); +} + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 000000000..c119b30e2 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. 
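+ * The swap itself moves no data: the evaluator forwards linear indices to the underlying expression unchanged, so only the interpretation of the coefficients and the reported dimensions change.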
+ * + * \example: + * Tensor<float, 2, ColMajor> input(2, 4); + * Tensor<float, 2, RowMajor> output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array<int, 2> shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template<typename XprType> +struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = traits<XprType>::NumDimensions; + static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor; +}; + +template<typename XprType> +struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp<XprType>& type; +}; + +template<typename XprType> +struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type> +{ + typedef TensorLayoutSwapOp<XprType> type; +}; + +} // end namespace internal + + + +template<typename XprType> +class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template<typename ArgType, typename Device> +struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> +{ + typedef TensorLayoutSwapOp<ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? 
RowMajor : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template<typename ArgType, typename Device> + struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device> + : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> +{ + typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base; + typedef TensorLayoutSwapOp<ArgType> XprType; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket<StoreMode>(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 000000000..2cb2bc7a6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,291 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. 
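+ * + * A minimal usage sketch (the buffer contents are illustrative); the data is not copied, so the mapped memory must outlive the map: + * float data[6] = {0, 1, 2, 3, 4, 5}; + * Eigen::TensorMap<Eigen::Tensor<float, 2> > t(data, 2, 3); + * float v = t(1, 2); // reads data[1 + 2 * 2] = data[5] in the default column-major layout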
+ * + */ + +template<typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> > +{ + public: + typedef TensorMap<PlainObjectType, Options_> Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + typedef typename internal::traits<PlainObjectType>::Index Index; + typedef typename internal::traits<PlainObjectType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + /* typedef typename internal::conditional< + bool(internal::is_lvalue<PlainObjectType>::value), + Scalar *, + const Scalar *>::type + PointerType;*/ + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const int Options = Options_; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = ((int(Options_)&Aligned)==Aligned), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = PlainObjectType::Layout, + CoordAccess = true, + }; + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT((2 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT((3 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT((4 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT((5 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + template <typename Dimensions> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar* data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes...
otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes...
otherIndices) + { + static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 1; + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) + { + typedef TensorAssignOp<Self, const Self> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Self& operator=(const OtherDerived& other) + { + typedef TensorAssignOp<Self, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice()); + return *this; + } + + private: + Scalar* m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 000000000..a93f48ccb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,600 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0.
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template<typename NewDimensions, typename XprType> +struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<NewDimensions>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename NewDimensions, typename XprType> +struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorReshapingOp<NewDimensions, XprType>& type; +}; + +template<typename NewDimensions, typename XprType> +struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type> +{ + typedef TensorReshapingOp<NewDimensions, XprType> type; +}; + +} // end namespace internal + + + +template<typename NewDimensions, typename XprType> +class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReshapingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) + { + typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +// Eval as rvalue +template<typename NewDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReshapingOp<NewDimensions, 
ArgType>, Device> +{ + typedef TensorReshapingOp<NewDimensions, ArgType> XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet<LoadMode>(index); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } + + const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + + protected: + TensorEvaluator<ArgType, Device> m_impl; + NewDimensions m_dimensions; +}; + + +// Eval as lvalue +template<typename NewDimensions, typename ArgType, typename Device> + struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device> + : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> + +{ + typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base; + typedef TensorReshapingOp<NewDimensions, ArgType> XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket<StoreMode>(index, x); + } +}; + + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. 
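+ * + * A usage sketch (illustrative; it assumes the slice() helper that TensorBase exposes to build this expression): + * \code + * Tensor<float, 2> input(10, 10); + * Eigen::array<int, 2> offsets = {{2, 3}}; + * Eigen::array<int, 2> extents = {{4, 5}}; + * Tensor<float, 2> block = input.slice(offsets, extents); // 4x5, block(0,0) == input(2,3) + * \endcode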
+ * + * + */ +namespace internal { +template<typename StartIndices, typename Sizes, typename XprType> +struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<StartIndices>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename StartIndices, typename Sizes, typename XprType> +struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> +{ + typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type; +}; + +template<typename StartIndices, typename Sizes, typename XprType> +struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type> +{ + typedef TensorSlicingOp<StartIndices, Sizes, XprType> type; +}; + +} // end namespace internal + + + +template<typename StartIndices, typename Sizes, typename XprType> +class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorSlicingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) + { + typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Eval as rvalue +template<typename StartIndices, typename Sizes, typename ArgType, typename Device> +struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> +{ + typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; + static const int NumDims = internal::array_size<Sizes>::value; + + enum { + // Alignment can't be guaranteed at 
compile time since it depends on the + // slice offsets and sizes. + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + + m_outputStrides[0] = 1; + m_fastOutputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + m_outputStrides[NumDims-1] = 1; + m_fastOutputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (internal::is_arithmetic<Scalar>::value && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. 
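+ // "contiguous_values" is the length of the longest run of coefficients that is laid out consecutively in both the input and the slice: the product of the innermost dimensions up to and including the first one whose extent differs from the input's. + // The copy path is only taken when each run is long enough for memcpy to beat coefficient-by-coefficient evaluation; the threshold below is a heuristic.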
+ if (contiguous_values > 2 * m_device.numThreads()) { + Scalar* src = m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + array<Index, NumDims> inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords[i] = coords[i] + this->m_offsets[i]; + } + return m_impl.coeff(inputCoords); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + Scalar* result = m_impl.data(); + if (result) { + Index offset = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] *
m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); + } + return inputIndex; + } + + array<Index, NumDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; +}; + + +// Eval as lvalue +template<typename StartIndices, typename Sizes, typename ArgType, typename Device> +struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> + : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base; + typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType; + static const int NumDims = internal::array_size<Sizes>::value; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * 
this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket<StoreMode>(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords) + { + array<Index, NumDims> inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords[i] = coords[i] + this->m_offsets[i]; + } + return this->m_impl.coeffRef(inputCoords); + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 000000000..2a7dd45c0 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,361 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only 0-padding is supported.
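+ * + * A usage sketch (illustrative; it assumes the pad() helper on TensorBase, each pair giving the number of zeros inserted before and after the corresponding dimension): + * \code + * Tensor<float, 2> input(3, 4); + * Eigen::array<std::pair<int, int>, 2> paddings; + * paddings[0] = std::make_pair(1, 2); + * paddings[1] = std::make_pair(0, 1); + * Tensor<float, 2> padded = input.pad(paddings); // 6x5, zero outside the original block + * \endcode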
+ * + */ +namespace internal { +template<typename PaddingDimensions, typename XprType> +struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename PaddingDimensions, typename XprType> +struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorPaddingOp<PaddingDimensions, XprType>& type; +}; + +template<typename PaddingDimensions, typename XprType> +struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type> +{ + typedef TensorPaddingOp<PaddingDimensions, XprType> type; +}; + +} // end namespace internal + + + +template<typename PaddingDimensions, typename XprType> +class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorPaddingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims) + : m_xpr(expr), m_padding_dims(padding_dims) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return m_padding_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; +}; + + +// Eval as rvalue +template<typename PaddingDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device> +{ + typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<PaddingDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()) + { + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = 
m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { + return Scalar(0); + } + inputIndex += (index - m_padding[0].first); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i+1]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (index < m_padding[NumDims-1].first || + index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return Scalar(0); + } + inputIndex += (index - m_padding[NumDims-1].first); + } + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + Index inputIndex; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + const Index idx = coords[0]; + if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { + return Scalar(0); + } + inputIndex = idx - m_padding[0].first; + for (int i = 1; i < NumDims; ++i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + } + } else { + const Index idx = coords[NumDims-1]; + if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return Scalar(0); + } + inputIndex = idx - m_padding[NumDims-1].first; + for (int i = NumDims - 2; i >= 0; --i) { + const Index idx = coords[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * 
m_inputStrides[i]; + } + } + return m_impl.coeff(inputIndex); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones. + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones. + inputIndex += (index - m_padding[0].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + + for (int i = 0; i < NumDims - 1; ++i) { + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones.
const Index idx = index / m_outputStrides[i+1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + const Index lastPaddedLeft = m_padding[NumDims-1].first; + const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); + const Index lastPaddedRight = m_outputStrides[NumDims-1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1<PacketReturnType>(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the 2 padding zones. + inputIndex += (index - m_padding[NumDims-1].first); + return m_impl.template packet<Unaligned>(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + Dimensions m_dimensions; + array<Index, NumDims+1> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + PaddingDimensions m_padding; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 000000000..8a42ab6b1 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,248 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class.
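+ * + * A usage sketch (illustrative; it assumes the extract_patches() helper on TensorBase). Every patch of the requested size is enumerated along one extra, last dimension: + * \code + * Tensor<float, 2> input(5, 5); + * Eigen::array<ptrdiff_t, 2> patch_dims = {{2, 2}}; + * Tensor<float, 3> patches = input.extract_patches(patch_dims); + * // patches is 2x2x16: (5-2+1)*(5-2+1) == 16 patches of size 2x2 + * \endcode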
+ * + * + */ +namespace internal { +template<typename PatchDim, typename XprType> +struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; +}; + +template<typename PatchDim, typename XprType> +struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense> +{ + typedef const TensorPatchOp<PatchDim, XprType>& type; +}; + +template<typename PatchDim, typename XprType> +struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type> +{ + typedef TensorPatchOp<PatchDim, XprType> type; +}; + +} // end namespace internal + + + +template<typename PatchDim, typename XprType> +class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorPatchOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested; + typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template<typename PatchDim, typename ArgType, typename Device> +struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> +{ + typedef TensorPatchOp<PatchDim, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + // Only column major tensors are supported for now. 
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + Index num_patches = 1; + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[NumDims - 1]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[NumDims - 1]; + + Index inputIndex = 0; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index indices[2] = {index, index + packetSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[NumDims - 1], + indices[1] / m_outputStrides[NumDims - 1]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[NumDims - 1], + indices[1] - patchIndices[1] * m_outputStrides[NumDims - 1]}; + + Index inputIndices[2] = {0, 0}; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == 
packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const + { + // Index of the patch (i.e. the location of its first element) along the last dimension. + Index patchIndex = coords[NumDims - 1]; + + if (TensorEvaluator<ArgType, Device>::CoordAccess) { + array<Index, NumDims-1> inputCoords; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + inputCoords[i] = coords[i] + patchIdx; + } + inputCoords[0] = (patchIndex + coords[0]); + return m_impl.coeff(inputCoords); + } + else { + Index inputIndex = 0; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = coords[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + inputIndex += (patchIndex + coords[0]); + return m_impl.coeff(inputIndex); + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims-1> m_inputStrides; + array<Index, NumDims-1> m_patchStrides; + + TensorEvaluator<ArgType, Device> m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 000000000..de5747905 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,426 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class.
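+ * + * A usage sketch (illustrative; it assumes the sum() helper on TensorBase, which builds a TensorReductionOp with a sum reducer): + * \code + * Tensor<float, 3> input(2, 3, 4); + * Eigen::array<int, 1> dims = {{1}}; + * Tensor<float, 2> summed = input.sum(dims); // 2x4: dimension 1 is reduced away + * \endcode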
+ * + */ + +namespace internal { +template<typename Op, typename Dims, typename XprType> +struct traits<TensorReductionOp<Op, Dims, XprType> > + : traits<XprType> +{ + typedef typename traits<XprType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename traits<XprType>::StorageKind StorageKind; + typedef typename traits<XprType>::Index Index; + typedef typename XprType::Nested Nested; +}; + +template<typename Op, typename Dims, typename XprType> +struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense> +{ + typedef const TensorReductionOp<Op, Dims, XprType>& type; +}; + +template<typename Op, typename Dims, typename XprType> +struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type> +{ + typedef TensorReductionOp<Op, Dims, XprType> type; +}; + + +template <typename ReducedDims, int NumTensorDims, int Layout> +struct are_inner_most_dims { + static const bool value = false; +}; +template <typename ReducedDims, int NumTensorDims, int Layout> +struct preserve_inner_most_dims { + static const bool value = false; +}; + +#ifdef EIGEN_HAS_CONSTEXPR +template <typename ReducedDims, int NumTensorDims> +struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ + static const bool value = indices_statically_known_to_increase<ReducedDims>()() && + index_statically_eq<ReducedDims>()(0, 0) && + index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1); +}; +template <typename ReducedDims, int NumTensorDims> +struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ + static const bool value = indices_statically_known_to_increase<ReducedDims>()() && + index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value) && + index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1); +}; +template <typename ReducedDims, int NumTensorDims> +struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ + static const bool value = indices_statically_known_to_increase<ReducedDims>()() && + index_statically_gt<ReducedDims>()(0, 0); +}; +template <typename ReducedDims, int NumTensorDims> +struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ + static const bool value = indices_statically_known_to_increase<ReducedDims>()() && + index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1); +}; +#endif + + +template <int DimIndex, typename Self, typename Op> +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); + } + } +}; +template <typename Self, typename Op> +struct GenericDimReducer<0, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; + +template <typename Self, typename Op, bool Vectorizable = 
(Self::InputPacketAccess & Op::PacketAccess)> +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template <typename Self, typename Op> +struct InnerMostDimReducer<Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, p); + } +}; + +template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +template <int DimIndex, typename Self, typename Op> +struct InnerMostDimPreserver<DimIndex, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); + } + } +}; + +template <typename Self, typename Op> +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum); + } + } +}; + +} // end namespace internal + + +template <typename Op, typename Dims, typename XprType> +class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReductionOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + typedef typename 
Eigen::internal::nested<TensorReductionOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + + +// Eval as rvalue +template<typename Op, typename Dims, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> +{ + typedef TensorReductionOp<Op, Dims, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + static const int NumReducedDims = internal::array_size<Dims>::value; + static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims; + typedef DSizes<Index, NumOutputDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self; + static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess; + + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()) + { + EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Bitmap indicating if an input dimension is reduced or not. + array<bool, NumInputDims> reduced; + for (int i = 0; i < NumInputDims; ++i) { + reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedDims[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + m_dimensions[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + + // Precompute output strides.
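+ // (Illustrative note: with ColMajor layout, m_outputStrides[i] computed + // below is the product of the first i output dimensions, e.g. output + // dimensions [4, 5, 6] give strides [1, 4, 20]; with RowMajor layout the + // product accumulates from the trailing dimension instead. The example + // values are for illustration only.)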
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + // Precompute input strides. + array<Index, NumInputDims> input_strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } + } else { + input_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + outputIndex = 0; + reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + ++outputIndex; + } + } + + // Special case for full reductions + if (NumInputDims == NumReducedDims) { + m_dimensions[0] = 1; + m_preservedStrides[0] = internal::array_prod(input_dims); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Op reducer(m_reducer); + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), + num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < packetSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 
0 : NumOutputDims - 1; + // TBD: extend this to the n innermost dimensions that we preserve. + if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>(); + internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + template <int, typename, typename> friend struct internal::GenericDimReducer; + template <typename, typename, bool> friend struct internal::InnerMostDimReducer; + template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumOutputDims - 1]; + } + } + // TBD: optimize the case where we preserve the innermost dimensions. + Index startInput = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } + return startInput; + } + + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the output tensor. + array<Index, NumOutputDims> m_outputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + array<Index, NumOutputDims> m_preservedStrides; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array<Index, NumReducedDims> m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array<Index, NumReducedDims> m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator<ArgType, Device> m_impl; + + // Operation to apply for computing the reduction. + Op m_reducer; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 000000000..0a87e67eb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,429 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra.
+// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +namespace Eigen { + +namespace internal { + +template <typename Dimensions, typename Scalar> +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) { } + virtual ~TensorLazyBaseEvaluator() { } + + virtual const Dimensions& dimensions() const = 0; + virtual const Scalar* data() const = 0; + + virtual const Scalar coeff(DenseIndex index) const = 0; + virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assignment. + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + +// Scratch storage for the read-only evaluator's coeffRef() below; the +// returned reference must never actually be written through. +static char dummy[8]; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> { + public: + // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { + m_impl.cleanup(); + } + + virtual const Dimensions& dimensions() const { + return m_dims; + } + virtual const Scalar* data() const { + return m_impl.data(); + } + + virtual const Scalar coeff(DenseIndex index) const { + return m_impl.coeff(index); + } + virtual Scalar& coeffRef(DenseIndex /*index*/) { + eigen_assert(false && "can't reference the coefficient of an rvalue"); + return *reinterpret_cast<Scalar*>(dummy); + } + + protected: + TensorEvaluator<Expr, Device> m_impl; + Dimensions m_dims; +}; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> { + public: + typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluatorWritable() { + } + + virtual Scalar& coeffRef(DenseIndex index) { + return this->m_impl.coeffRef(index); + } +}; + +template <typename Dimensions, typename Expr, typename Device> +class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value), + TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, + TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type { + public: + typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value), + TensorLazyEvaluatorWritable<Dimensions, Expr, Device>, + TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluator() { + } +}; + +} // namespace internal + + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A
reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). + * + */ +template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> > +{ + public: + typedef TensorRef<PlainObjectType> Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested<Self>::type Nested; + typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind; + typedef typename internal::traits<PlainObjectType>::Index Index; + typedef typename internal::traits<PlainObjectType>::Scalar Scalar; + typedef typename internal::packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + Layout = PlainObjectType::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template <typename Expression> + EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { + unrefEvaluator(); + } + + TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator = (const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index index) const + { + return m_evaluator->coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array<Index, NumIndices> indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } + template<typename... IndexTypes> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... 
otherIndices) + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array<Index, NumIndices> indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } +#else + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const + { + array<Index, 2> indices; + indices[0] = i0; + indices[1] = i1; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const + { + array<Index, 3> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const + { + array<Index, 4> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + array<Index, 5> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1) + { + array<Index, 2> indices; + indices[0] = i0; + indices[1] = i1; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2) + { + array<Index, 3> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3) + { + array<Index, 4> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4) + { + array<Index, 5> indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeffRef(indices); + } +#endif + + template <std::size_t NumIndices> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + template <std::size_t NumIndices> EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + { + return m_evaluator->coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return m_evaluator->coeffRef(index); + } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + +
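// Lazily-built evaluator, shared by every copy of this TensorRef and + // reference-counted; unrefEvaluator() above deletes it once the count + // drops to zero. +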
internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator; +}; + + +// evaluator for rvalues +template<typename Derived, typename Device> +struct TensorEvaluator<const TensorRef<Derived>, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + Layout = TensorRef<Derived>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&) + : m_ref(m) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_ref.coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return m_ref.coeffRef(index); + } + + Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef<Derived> m_ref; +}; + + +// evaluator for lvalues +template<typename Derived, typename Device> +struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + typedef TensorEvaluator<const TensorRef<Derived>, Device> Base; + + enum { + IsAligned = false, + PacketAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_ref.coeffRef(index); + } +}; + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 000000000..ad21e966b --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,207 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. 
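+ * + * Reversing flips the coefficient order along each selected dimension. + * Minimal usage sketch (names illustrative, not part of this patch, and + * assuming the reverse() helper exposed on TensorBase): + * + *   Eigen::Tensor<float, 2> t(3, 4); + *   Eigen::array<bool, 2> rev; + *   rev[0] = true;   // reverse dimension 0 + *   rev[1] = false;  // keep dimension 1 as is + *   Eigen::Tensor<float, 2> r = t.reverse(rev);  // r(i, j) == t(2 - i, j)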
+ * + */ +namespace internal { +template<typename ReverseDimensions, typename XprType> +struct traits<TensorReverseOp<ReverseDimensions, + XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename ReverseDimensions, typename XprType> +struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense> +{ + typedef const TensorReverseOp<ReverseDimensions, XprType>& type; +}; + +template<typename ReverseDimensions, typename XprType> +struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1, + typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type> +{ + typedef TensorReverseOp<ReverseDimensions, XprType> type; +}; + +} // end namespace internal + + + + +template<typename ReverseDimensions, typename XprType> +class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions, + XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorReverseOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested; + typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind + StorageKind; + typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, + const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC + const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + + +// Eval as rvalue +template<typename ReverseDimensions, typename ArgType, typename Device> +struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device> +{ + typedef TensorReverseOp<ReverseDimensions, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<ReverseDimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()) + { + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; + } + } else { + m_strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename 
XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + return m_impl.coeff(inputIndex); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[NumDims-1]) { + inputIndex += (m_dimensions[NumDims-1] - index - 1); + } else { + inputIndex += index; + } + return m_impl.coeff(inputIndex); + } + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type + values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array<Index, NumDims> m_strides; + TensorEvaluator<ArgType, Device> m_impl; + ReverseDimensions m_reverse; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 000000000..1012ecd69 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,259 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. 
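+ * + * Shuffling permutes the dimensions of its input: output dimension i is + * input dimension shuffle[i]. Minimal usage sketch (names illustrative, + * not part of this patch, and assuming the shuffle() helper exposed on + * TensorBase): + * + *   Eigen::Tensor<float, 3> t(2, 3, 4); + *   Eigen::array<int, 3> shuffle; + *   shuffle[0] = 2; shuffle[1] = 0; shuffle[2] = 1; + *   Eigen::Tensor<float, 3> s = t.shuffle(shuffle);  // dimensions [4, 2, 3] + *   // s(i, j, k) == t(j, k, i)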
+ * + * + */ +namespace internal { +template<typename Shuffle, typename XprType> +struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Shuffle, typename XprType> +struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense> +{ + typedef const TensorShufflingOp<Shuffle, XprType>& type; +}; + +template<typename Shuffle, typename XprType> +struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type> +{ + typedef TensorShufflingOp<Shuffle, XprType> type; +}; + +} // end namespace internal + + + +template<typename Shuffle, typename XprType> +class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorShufflingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle) + : m_xpr(expr), m_shuffle(shuffle) {} + + EIGEN_DEVICE_FUNC + const Shuffle& shuffle() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other) + { + typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + + +// Eval as rvalue +template<typename Shuffle, typename ArgType, typename Device> +struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> +{ + typedef TensorShufflingOp<Shuffle, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + 
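// (Note: the constructor below permutes the input strides so that stepping + // along output dimension i advances the flat input index by the stride of + // input dimension shuffle[i]; srcCoeff() then re-linearizes each output + // index through these permuted strides.) +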
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + const Shuffle& shuffle = op.shuffle(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = input_dims[shuffle[i]]; + } + + array<Index, NumDims> inputStrides; + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[shuffle[i]]; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; + } + } + + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; +}; + + +// Eval as lvalue +template<typename Shuffle, typename ArgType, typename Device> +struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> + : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base; + + typedef TensorShufflingOp<Shuffle, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = 
internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + internal::pstore<CoeffReturnType, PacketReturnType>(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index a34600ee6..1b227e8c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -30,39 +30,70 @@ namespace Eigen { * * \sa Tensor */ -template<typename T, std::size_t NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage; +template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage; + + +// Pure fixed-size storage +template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename FixedDimensions> +class TensorStorage +{ + private: + EIGEN_ALIGN_DEFAULT T m_data[Size]; + FixedDimensions m_dimensions; + + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStorage() { + EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; + + // pure-dynamic, but without specification of all dimensions explicitly -template<typename T, std::size_t NumIndices_, int Options_> +template<typename T, DenseIndex NumIndices_, int Options_> class TensorStorage<T, NumIndices_, Dynamic, Options_, void> : public TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> { - typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_; + typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_; + public: - TensorStorage() = default; - TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default; - TensorStorage(TensorStorage<T, NumIndices_, Dynamic, Options_, void>&&) = default; + 
TensorStorage() { } + TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>& other) : Base_(other) { } + TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {} - TensorStorage(DenseIndex size, const std::array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {} - TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default; + TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {} + + // TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default; }; // pure dynamic -template<typename T, std::size_t NumIndices_, int Options_> +template<typename T, DenseIndex NumIndices_, int Options_> class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> { T *m_data; - std::array<DenseIndex, NumIndices_> m_dimensions; + DSizes<DenseIndex, NumIndices_> m_dimensions; typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Self_; public: - TensorStorage() : m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {} + TensorStorage() : m_data(0), m_dimensions() {} TensorStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {} - TensorStorage(DenseIndex size, const std::array<DenseIndex, NumIndices_>& dimensions) - : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) - { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } - TensorStorage(const Self_& other) + TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions) + : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) + { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } + TensorStorage(const Self_& other) : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions))) , m_dimensions(other.m_dimensions) { @@ -76,32 +107,19 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu } return *this; } - TensorStorage(Self_&& other) - : m_data(std::move(other.m_data)), m_dimensions(std::move(other.m_dimensions)) - { - other.m_data = nullptr; - } - Self_& operator=(Self_&& other) - { - using std::swap; - swap(m_data, other.m_data); - swap(m_dimensions, other.m_dimensions); - return *this; - } + ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } - std::array<DenseIndex, NumIndices_> dimensions(void) const {return m_dimensions;} - void conservativeResize(DenseIndex size, const std::array<DenseIndex, NumIndices_>& nbDimensions) - { - m_data = internal::conditional_aligned_realloc_new_auto<T,(Options_&DontAlign)==0>(m_data, size, internal::array_prod(m_dimensions)); - m_dimensions = nbDimensions; - } - void resize(DenseIndex size, const std::array<DenseIndex, NumIndices_>& nbDimensions) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const 
{return m_dimensions;} + + EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array<DenseIndex, NumIndices_>& nbDimensions) { - if(size != internal::array_prod(m_dimensions)) + const DenseIndex currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) { - internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); + internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz); if (size) m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size); else @@ -110,16 +128,13 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu } m_dimensions = nbDimensions; } - const T *data() const { return m_data; } - T *data() { return m_data; } -}; -// TODO: implement fixed-size stuff + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 000000000..00cb8e373 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,325 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. 
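+ * + * Striding keeps every strides[i]-th coefficient along dimension i, so the + * output extent of dimension i is ceil(input_dim[i] / strides[i]). Minimal + * usage sketch (names illustrative, not part of this patch, and assuming + * the stride() helper exposed on TensorBase): + * + *   Eigen::Tensor<float, 2> t(6, 8); + *   Eigen::array<int, 2> strides; + *   strides[0] = 2; strides[1] = 4; + *   Eigen::Tensor<float, 2> s = t.stride(strides);  // dimensions [3, 2] + *   // s(i, j) == t(2 * i, 4 * j)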
+ * + * + */ +namespace internal { +template<typename Strides, typename XprType> +struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename packet_traits<Scalar>::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Strides, typename XprType> +struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense> +{ + typedef const TensorStridingOp<Strides, XprType>& type; +}; + +template<typename Strides, typename XprType> +struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type> +{ + typedef TensorStridingOp<Strides, XprType> type; +}; + +} // end namespace internal + + + +template<typename Strides, typename XprType> +class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> > +{ + public: + typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar; + typedef typename Eigen::internal::traits<TensorStridingOp>::Packet Packet; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested; + typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other) + { + typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + + +// Eval as rvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> +{ + typedef TensorStridingOp<Strides, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + // Integer ceil(dim / stride); exact for any Index width, unlike the + // float-based ceilf, which loses precision for large dimensions. + m_dimensions[i] = (m_dimensions[i] + op.strides()[i] - 1) / op.strides()[i]; + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; + } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_inputStrides[i+1] *= op.strides()[i+1]; + } + m_inputStrides[0] *= op.strides()[0]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { +
values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims-1]; + } + return inputIndex; + } + + Dimensions m_dimensions; + array<Index, NumDims> m_outputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; +}; + + +// Eval as lvalue +template<typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> + : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> +{ + typedef TensorStridingOp<Strides, ArgType> XprType; + typedef TensorEvaluator<const XprType, Device> Base; + // typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + // typedef DSizes<Index, NumDims> Dimensions; + + enum { + IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= 
idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket<Unaligned>(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT Scalar values[packetSize]; + internal::pstore<Scalar, PacketReturnType>(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h new file mode 100644 index 000000000..a844a4d68 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -0,0 +1,256 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +namespace Eigen { +namespace internal { + + +template<typename Scalar, int Options> +class compute_tensor_flags +{ + enum { + is_dynamic_size_storage = 1, + + aligned_bit = + ( + ((Options&DontAlign)==0) && ( +#if EIGEN_ALIGN_STATICALLY + (!is_dynamic_size_storage) +#else + 0 +#endif + || +#if EIGEN_ALIGN + is_dynamic_size_storage +#else + 0 +#endif + ) + ) ? AlignedBit : 0, + packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit | aligned_bit}; +}; + + +template<typename Scalar_, std::size_t NumIndices_, int Options_> +struct traits<Tensor<Scalar_, NumIndices_, Options_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef DenseIndex Index; + static const int NumDimensions = NumIndices_; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit, + }; +}; + + +template<typename Scalar_, typename Dimensions, int Options_> +struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef DenseIndex Index; + static const int NumDimensions = array_size<Dimensions>::value; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit, + }; +}; + + +template<typename PlainObjectType, int Options_> +struct traits<TensorMap<PlainObjectType, Options_> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = Options_, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), + }; +}; + +template<typename PlainObjectType> +struct traits<TensorRef<PlainObjectType> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = BaseTraits::Options, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; +}; + + +template<typename _Scalar, std::size_t NumIndices_, int Options> +struct eval<Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options>& type; +}; + +template<typename _Scalar, std::size_t NumIndices_, int Options> +struct eval<const Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options>& type; +}; + +template<typename Scalar_, typename Dimensions, int Options> +struct eval<TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type; +}; + +template<typename Scalar_, typename Dimensions, int Options> +struct eval<const TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense> +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type; +}; + +template<typename PlainObjectType, int Options> +struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template<typename PlainObjectType, int Options> +struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense> +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template<typename PlainObjectType> +struct eval<TensorRef<PlainObjectType>, Eigen::Dense> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +template<typename PlainObjectType> +struct eval<const TensorRef<PlainObjectType>, Eigen::Dense> +{ + typedef const TensorRef<PlainObjectType>& type; +}; + + +template <typename Scalar_, std::size_t NumIndices_, int Options_> +struct nested<Tensor<Scalar_, NumIndices_, Options_> > +{ + typedef const Tensor<Scalar_, NumIndices_, Options_>& type; +}; + +template <typename Scalar_, std::size_t NumIndices_, int Options_> +struct nested<const Tensor<Scalar_, NumIndices_, Options_> > +{ + typedef const Tensor<Scalar_, NumIndices_, Options_>& type; +}; + +template <typename Scalar_, typename Dimensions, int Options> +struct nested<TensorFixedSize<Scalar_, Dimensions, Options> > +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type; +}; + +template <typename Scalar_, typename Dimensions, int Options> +struct nested<const TensorFixedSize<Scalar_, Dimensions, Options> > +{ + typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type; +}; + + +template <typename PlainObjectType, int Options> +struct nested<TensorMap<PlainObjectType, Options> > +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template <typename PlainObjectType, int Options> +struct nested<const TensorMap<PlainObjectType, Options> > +{ + typedef const TensorMap<PlainObjectType, Options>& type; +}; + +template <typename PlainObjectType> +struct nested<TensorRef<PlainObjectType> > +{ + typedef const TensorRef<PlainObjectType>& type; +}; + +template <typename PlainObjectType> +struct nested<const TensorRef<PlainObjectType> > +{ + typedef const TensorRef<PlainObjectType>& 
type;
+};
+
+} // end namespace internal
+
+// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
+// R, B), and convolve it with a set of filters, which can also be presented as
+// a tensor (D, K, K, M), where M is the number of filters, K is the filter
+// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
+// simplicity we assume that we always use square filters (which is usually the
+// case in images), hence the two Ks in the tensor dimension. They also take in
+// a few additional parameters:
+// Stride (S): The convolution stride is the offset between locations where we
+//             apply the filters. A larger stride means that the output will be
+//             spatially smaller.
+// Padding (P): The padding we apply to the input tensor along the R and C
+//              dimensions. This is usually used to make sure that the spatial
+//              dimensions of the output match our intention.
+//
+// Two types of padding are often used:
+//   SAME: The amount of padding is computed so that the output will have size
+//         ceil(R/S) and ceil(C/S).
+//   VALID: no padding is carried out.
+// When we do padding, the padded values at the padded locations are usually
+// zero.
+//
+// The output dimensions for convolution, when given all the parameters above,
+// are as follows:
+// When Padding = SAME: the output size is (B, R', C', M), where
+//   R' = ceil(float(R) / float(S))
+//   C' = ceil(float(C) / float(S))
+// where ceil is the ceiling function. The input tensor is padded with 0 as
+// needed. The numbers of padded rows and columns are computed as:
+//   Pr = ((R' - 1) * S + K - R) / 2
+//   Pc = ((C' - 1) * S + K - C) / 2
+// When the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
+// This is where the name SAME comes from: the output has the same size as the input.
+// When Padding = VALID: the output size is computed as
+//   R' = ceil(float(R - K + 1) / float(S))
+//   C' = ceil(float(C - K + 1) / float(S))
+// and the numbers of padded rows and columns are computed in the same way as in
+// the SAME case.
+// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
+// Pc=0.
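// To make the SAME/VALID arithmetic above concrete, here is a minimal
// standalone C++ sketch of the output-size computation. The helper
// computeConvOutputDims and its result struct are illustrative only (they are
// not part of Eigen); the body simply transcribes the formulas above.

#include <cmath>

struct ConvOutputDims {
  int rows, cols;          // R', C'
  int pad_rows, pad_cols;  // Pr, Pc
};

// R, C: input rows/cols; K: (square) filter size; S: stride.
inline ConvOutputDims computeConvOutputDims(int R, int C, int K, int S,
                                            bool same_padding) {
  ConvOutputDims out;
  if (same_padding) {
    // SAME: output is ceil(R/S) x ceil(C/S); the input is zero-padded as needed.
    out.rows = static_cast<int>(std::ceil(float(R) / float(S)));
    out.cols = static_cast<int>(std::ceil(float(C) / float(S)));
    out.pad_rows = ((out.rows - 1) * S + K - R) / 2;
    out.pad_cols = ((out.cols - 1) * S + K - C) / 2;
  } else {
    // VALID: no padding; only fully overlapping filter positions contribute.
    out.rows = static_cast<int>(std::ceil(float(R - K + 1) / float(S)));
    out.cols = static_cast<int>(std::ceil(float(C - K + 1) / float(S)));
    out.pad_rows = 0;
    out.pad_cols = 0;
  }
  return out;
}

// With R = C = 5, K = 3, S = 1: SAME gives a 5x5 output with Pr = Pc = 1,
// while VALID gives a 3x3 output with no padding.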
+typedef enum { + PADDING_VALID = 1, + PADDING_SAME = 2, +} PaddingType; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport index 4ed545174..6ca1b1217 100644 --- a/unsupported/Eigen/OpenGLSupport +++ b/unsupported/Eigen/OpenGLSupport @@ -178,11 +178,11 @@ template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Affine>& t) template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Projective>& t) { glLoadMatrix(t.matrix()); } template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompact>& t) { glLoadMatrix(Transform<Scalar,3,Affine>(t).matrix()); } -static void glRotate(const Rotation2D<float>& rot) +inline void glRotate(const Rotation2D<float>& rot) { glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f); } -static void glRotate(const Rotation2D<double>& rot) +inline void glRotate(const Rotation2D<double>& rot) { glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0); } @@ -246,18 +246,18 @@ EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,double, 4,4,Doublev) #ifdef GL_VERSION_2_0 -static void glUniform2fv_ei (GLint loc, const float* v) { glUniform2fv(loc,1,v); } -static void glUniform2iv_ei (GLint loc, const int* v) { glUniform2iv(loc,1,v); } +inline void glUniform2fv_ei (GLint loc, const float* v) { glUniform2fv(loc,1,v); } +inline void glUniform2iv_ei (GLint loc, const int* v) { glUniform2iv(loc,1,v); } -static void glUniform3fv_ei (GLint loc, const float* v) { glUniform3fv(loc,1,v); } -static void glUniform3iv_ei (GLint loc, const int* v) { glUniform3iv(loc,1,v); } +inline void glUniform3fv_ei (GLint loc, const float* v) { glUniform3fv(loc,1,v); } +inline void glUniform3iv_ei (GLint loc, const int* v) { glUniform3iv(loc,1,v); } -static void glUniform4fv_ei (GLint loc, const float* v) { glUniform4fv(loc,1,v); } -static void glUniform4iv_ei (GLint loc, const int* v) { glUniform4iv(loc,1,v); } +inline void glUniform4fv_ei (GLint loc, const float* v) { glUniform4fv(loc,1,v); } +inline void glUniform4iv_ei (GLint loc, const int* v) { glUniform4iv(loc,1,v); } -static void glUniformMatrix2fv_ei (GLint loc, const float* v) { glUniformMatrix2fv(loc,1,false,v); } -static void glUniformMatrix3fv_ei (GLint loc, const float* v) { glUniformMatrix3fv(loc,1,false,v); } -static void glUniformMatrix4fv_ei (GLint loc, const float* v) { glUniformMatrix4fv(loc,1,false,v); } +inline void glUniformMatrix2fv_ei (GLint loc, const float* v) { glUniformMatrix2fv(loc,1,false,v); } +inline void glUniformMatrix3fv_ei (GLint loc, const float* v) { glUniformMatrix3fv(loc,1,false,v); } +inline void glUniformMatrix4fv_ei (GLint loc, const float* v) { glUniformMatrix4fv(loc,1,false,v); } EIGEN_GL_FUNC1_DECLARATION (glUniform,GLint,const) @@ -294,9 +294,9 @@ EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,3,Matrix #ifdef GL_VERSION_3_0 -static void glUniform2uiv_ei (GLint loc, const unsigned int* v) { glUniform2uiv(loc,1,v); } -static void glUniform3uiv_ei (GLint loc, const unsigned int* v) { glUniform3uiv(loc,1,v); } -static void glUniform4uiv_ei (GLint loc, const unsigned int* v) { glUniform4uiv(loc,1,v); } +inline void glUniform2uiv_ei (GLint loc, const unsigned int* v) { glUniform2uiv(loc,1,v); } +inline void glUniform3uiv_ei (GLint loc, const unsigned int* v) { glUniform3uiv(loc,1,v); } +inline void glUniform4uiv_ei (GLint loc, const unsigned int* v) { glUniform4uiv(loc,1,v); } EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 2,2uiv_ei) 
EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 3,3uiv_ei) @@ -305,9 +305,9 @@ EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 4,4uiv_ei) #endif #ifdef GL_ARB_gpu_shader_fp64 -static void glUniform2dv_ei (GLint loc, const double* v) { glUniform2dv(loc,1,v); } -static void glUniform3dv_ei (GLint loc, const double* v) { glUniform3dv(loc,1,v); } -static void glUniform4dv_ei (GLint loc, const double* v) { glUniform4dv(loc,1,v); } +inline void glUniform2dv_ei (GLint loc, const double* v) { glUniform2dv(loc,1,v); } +inline void glUniform3dv_ei (GLint loc, const double* v) { glUniform3dv(loc,1,v); } +inline void glUniform4dv_ei (GLint loc, const double* v) { glUniform4dv(loc,1,v); } EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 2,2dv_ei) EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 3,3dv_ei) diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h index 0e1b7d977..52eb65a2f 100644 --- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h @@ -150,7 +150,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> > m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - dgmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner); + dgmres(mp_matrix, b.col(j), xj, Base::m_preconditioner); } m_info = failed ? NumericalIssue : m_error <= Base::m_tolerance ? Success diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h index 60f5f662c..6e847e110 100644 --- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h @@ -250,21 +250,8 @@ struct traits<GMRES<_MatrixType,_Preconditioner> > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * solver.setMaxIterations(1); - * int i = 0; - * do { - * x = solver.solveWithGuess(b,x); - * std::cout << i << " : " << solver.error() << std::endl; - * ++i; - * } while (solver.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. - * + * One can control the start using the solveWithGuess() method. + * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, typename _Preconditioner> @@ -327,7 +314,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::gmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error)) + if(!internal::gmres(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index eccdce24c..2845b9cfd 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -165,8 +165,8 @@ namespace Eigen { * The vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. 
+ * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
+ *               Upper, or Lower|Upper, in which case the full matrix entries will be considered. Default is Lower.
 * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
 *
 * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
@@ -189,20 +189,7 @@ namespace Eigen {
 * \endcode
 *
 * By default the iterations start with x=0 as an initial guess of the solution.
- * One can control the start using the solveWithGuess() method. Here is a step by
- * step execution example starting with a random guess and printing the evolution
- * of the estimated error:
- * * \code
- * x = VectorXd::Random(n);
- * mr.setMaxIterations(1);
- * int i = 0;
- * do {
- *   x = mr.solveWithGuess(b,x);
- *   std::cout << i << " : " << mr.error() << std::endl;
- *   ++i;
- * } while (mr.info()!=Success && i<100);
- * \endcode
- * Note that such a step by step excution is slightly slower.
+ * One can control the start using the solveWithGuess() method.
 *
 * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
 */
@@ -250,6 +237,11 @@ namespace Eigen {
     template<typename Rhs,typename Dest>
     void _solve_with_guess_impl(const Rhs& b, Dest& x) const
     {
+      typedef typename internal::conditional<UpLo==(Lower|Upper),
+                                             Ref<const MatrixType>&,
+                                             SparseSelfAdjointView<const Ref<const MatrixType>, UpLo>
+                                            >::type MatrixWrapperType;
+
       m_iterations = Base::maxIterations();
       m_error = Base::m_tolerance;
@@ -259,7 +251,7 @@ namespace Eigen {
         m_error = Base::m_tolerance;
         typename Dest::ColXpr xj(x,j);
-        internal::minres(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj,
+        internal::minres(MatrixWrapperType(mp_matrix), b.col(j), xj,
                          Base::m_preconditioner, m_iterations, m_error);
       }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index 42b60b9b1..22bfdc4ac 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -53,15 +53,20 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   result(1,0) = Scalar(0);
   result(1,1) = logA11;
-  if (A(0,0) == A(1,1)) {
+  Scalar y = A(1,1) - A(0,0);
+  if (y==Scalar(0))
+  {
     result(0,1) = A(0,1) / A(0,0);
-  } else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) {
-    result(0,1) = A(0,1) * (logA11 - logA00) / (A(1,1) - A(0,0));
-  } else {
+  }
+  else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
+  {
+    result(0,1) = A(0,1) * (logA11 - logA00) / y;
+  }
+  else
+  {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
     int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
-    Scalar y = A(1,1) - A(0,0), x = A(1,1) + A(0,0);
-    result(0,1) = A(0,1) * (Scalar(2) * numext::atanh2(y,x) + Scalar(0,2*M_PI*unwindingNumber)) / y;
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y;
   }
 }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index ee665c18e..1e5a59c55 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -299,7 +299,7 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
   int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
-  ComplexScalar w = numext::atanh2(curr - prev, curr + prev) + ComplexScalar(0, M_PI*unwindingNumber);
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
@@ -311,7 +311,7 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev
   using std::log;
   using std::sinh;
-  RealScalar w = numext::atanh2(curr - prev, curr + prev);
+  RealScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2);
   return 2 * exp(p * (log(curr) + log(prev)) / 2) * sinh(p * w) / (curr - prev);
 }
diff --git a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
index 2bb8bc84a..40ba65b7e 100644
--- a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
+++ b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
@@ -56,7 +56,7 @@ T poly_eval( const Polynomials& poly, const T& x )
     for( DenseIndex i=1; i<poly.size(); ++i ){
       val = val*inv_x + poly[i]; }
-    return std::pow(x,(T)(poly.size()-1)) * val;
+    return numext::pow(x,(T)(poly.size()-1)) * val;
   }
 }
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 97849a25a..7b6751f00 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -50,7 +50,7 @@ if(MPFR_FOUND)
   include_directories(${MPFR_INCLUDES} ./mpreal)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
   set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
-  ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+#  ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
@@ -76,8 +76,9 @@ if(NOT EIGEN_TEST_NO_OPENGL)
   find_package(GLUT)
   find_package(GLEW)
   if(OPENGL_FOUND AND GLUT_FOUND AND GLEW_FOUND)
+    include_directories(${OPENGL_INCLUDE_DIR} ${GLUT_INCLUDE_DIR} ${GLEW_INCLUDE_DIRS})
     ei_add_property(EIGEN_TESTED_BACKENDS "OpenGL, ")
-    set(EIGEN_GL_LIB ${GLUT_LIBRARIES} ${GLEW_LIBRARIES})
+    set(EIGEN_GL_LIB ${GLUT_LIBRARIES} ${GLEW_LIBRARIES} ${OPENGL_LIBRARIES})
     ei_add_test(openglsupport "" "${EIGEN_GL_LIB}" )
   else()
     ei_add_property(EIGEN_MISSING_BACKENDS "OpenGL, ")
@@ -94,12 +95,50 @@ ei_add_test(minres)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
-option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." OFF)
+option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." ON)
 if(EIGEN_TEST_CXX11)
-  # FIXME: add C++11 compiler switch in some portable way
-  # (MSVC doesn't need any for example, so this will
-  # clash there)
+  # It should be safe to always run these tests as there is some fallback code for
+  # older compilers that don't support C++11.
  ei_add_test(cxx11_meta "-std=c++0x")
  ei_add_test(cxx11_tensor_simple "-std=c++0x")
- ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
+# ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
+ ei_add_test(cxx11_tensor_assign "-std=c++0x")
+ ei_add_test(cxx11_tensor_dimension "-std=c++0x")
+ ei_add_test(cxx11_tensor_index_list "-std=c++0x")
+ ei_add_test(cxx11_tensor_comparisons "-std=c++0x")
+ ei_add_test(cxx11_tensor_contraction "-std=c++0x")
+ ei_add_test(cxx11_tensor_convolution "-std=c++0x")
+ ei_add_test(cxx11_tensor_expr "-std=c++0x")
+ ei_add_test(cxx11_tensor_forced_eval "-std=c++0x")
+ ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
+ ei_add_test(cxx11_tensor_const "-std=c++0x")
+ ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
+ ei_add_test(cxx11_tensor_of_complex "-std=c++0x")
+ ei_add_test(cxx11_tensor_of_strings "-std=c++0x")
+ ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
+ ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
+ ei_add_test(cxx11_tensor_map "-std=c++0x")
+ ei_add_test(cxx11_tensor_broadcasting "-std=c++0x")
+ ei_add_test(cxx11_tensor_chipping "-std=c++0x")
+ ei_add_test(cxx11_tensor_concatenation "-std=c++0x")
+ ei_add_test(cxx11_tensor_morphing "-std=c++0x")
+ ei_add_test(cxx11_tensor_padding "-std=c++0x")
+ ei_add_test(cxx11_tensor_patch "-std=c++0x")
+ ei_add_test(cxx11_tensor_image_patch "-std=c++0x")
+ ei_add_test(cxx11_tensor_reduction "-std=c++0x")
+ ei_add_test(cxx11_tensor_shuffling "-std=c++0x")
+ ei_add_test(cxx11_tensor_striding "-std=c++0x")
+ ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
+ ei_add_test(cxx11_tensor_ref "-std=c++0x")
+ ei_add_test(cxx11_tensor_random "-std=c++0x")
+ ei_add_test(cxx11_tensor_casts "-std=c++0x")
+ ei_add_test(cxx11_tensor_reverse "-std=c++0x")
+ ei_add_test(cxx11_tensor_layout_swap "-std=c++0x")
+ ei_add_test(cxx11_tensor_io "-std=c++0x")
+
+  # These tests need nvcc
+# ei_add_test(cxx11_tensor_device "-std=c++0x")
+# ei_add_test(cxx11_tensor_cuda "-std=c++0x")
+# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x")
+
 endif()
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
new file mode 100644
index 000000000..d16aaf847
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
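// The tests below exercise tensor assignment in both directions: copying a
// Tensor into a TensorMap that wraps user-owned raw storage, and copying a
// TensorMap back into a (possibly auto-resized) Tensor, in both ColMajor and
// RowMajor layouts, along with compound assignment and initializer-list
// setValues().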
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor<int, 1> vec1(6); + Tensor<int, 1, RowMajor> vec2(6); + vec1(0) = 4; vec2(0) = 0; + vec1(1) = 8; vec2(1) = 1; + vec1(2) = 15; vec2(2) = 2; + vec1(3) = 16; vec2(3) = 3; + vec1(4) = 23; vec2(4) = 4; + vec1(5) = 42; vec2(5) = 5; + + int col_major[6]; + int row_major[6]; + memset(col_major, 0, 6*sizeof(int)); + memset(row_major, 0, 6*sizeof(int)); + TensorMap<Tensor<int, 1>> vec3(col_major, 6); + TensorMap<Tensor<int, 1, RowMajor>> vec4(row_major, 6); + + vec3 = vec1; + vec4 = vec2; + + VERIFY_IS_EQUAL(vec3(0), 4); + VERIFY_IS_EQUAL(vec3(1), 8); + VERIFY_IS_EQUAL(vec3(2), 15); + VERIFY_IS_EQUAL(vec3(3), 16); + VERIFY_IS_EQUAL(vec3(4), 23); + VERIFY_IS_EQUAL(vec3(5), 42); + + VERIFY_IS_EQUAL(vec4(0), 0); + VERIFY_IS_EQUAL(vec4(1), 1); + VERIFY_IS_EQUAL(vec4(2), 2); + VERIFY_IS_EQUAL(vec4(3), 3); + VERIFY_IS_EQUAL(vec4(4), 4); + VERIFY_IS_EQUAL(vec4(5), 5); + + vec1.setZero(); + vec2.setZero(); + vec1 = vec3; + vec2 = vec4; + + VERIFY_IS_EQUAL(vec1(0), 4); + VERIFY_IS_EQUAL(vec1(1), 8); + VERIFY_IS_EQUAL(vec1(2), 15); + VERIFY_IS_EQUAL(vec1(3), 16); + VERIFY_IS_EQUAL(vec1(4), 23); + VERIFY_IS_EQUAL(vec1(5), 42); + + VERIFY_IS_EQUAL(vec2(0), 0); + VERIFY_IS_EQUAL(vec2(1), 1); + VERIFY_IS_EQUAL(vec2(2), 2); + VERIFY_IS_EQUAL(vec2(3), 3); + VERIFY_IS_EQUAL(vec2(4), 4); + VERIFY_IS_EQUAL(vec2(5), 5); +} + +static void test_2d() +{ + Tensor<int, 2> mat1(2,3); + Tensor<int, 2, RowMajor> mat2(2,3); + + mat1(0,0) = 0; + mat1(0,1) = 1; + mat1(0,2) = 2; + mat1(1,0) = 3; + mat1(1,1) = 4; + mat1(1,2) = 5; + + mat2(0,0) = 0; + mat2(0,1) = 1; + mat2(0,2) = 2; + mat2(1,0) = 3; + mat2(1,1) = 4; + mat2(1,2) = 5; + + int col_major[6]; + int row_major[6]; + memset(col_major, 0, 6*sizeof(int)); + memset(row_major, 0, 6*sizeof(int)); + TensorMap<Tensor<int, 2>> mat3(row_major, 2, 3); + TensorMap<Tensor<int, 2, RowMajor>> mat4(col_major, 2, 3); + + mat3 = mat1; + mat4 = mat2; + + VERIFY_IS_EQUAL(mat3(0,0), 0); + VERIFY_IS_EQUAL(mat3(0,1), 1); + VERIFY_IS_EQUAL(mat3(0,2), 2); + VERIFY_IS_EQUAL(mat3(1,0), 3); + VERIFY_IS_EQUAL(mat3(1,1), 4); + VERIFY_IS_EQUAL(mat3(1,2), 5); + + VERIFY_IS_EQUAL(mat4(0,0), 0); + VERIFY_IS_EQUAL(mat4(0,1), 1); + VERIFY_IS_EQUAL(mat4(0,2), 2); + VERIFY_IS_EQUAL(mat4(1,0), 3); + VERIFY_IS_EQUAL(mat4(1,1), 4); + VERIFY_IS_EQUAL(mat4(1,2), 5); + + mat1.setZero(); + mat2.setZero(); + mat1 = mat3; + mat2 = mat4; + + VERIFY_IS_EQUAL(mat1(0,0), 0); + VERIFY_IS_EQUAL(mat1(0,1), 1); + VERIFY_IS_EQUAL(mat1(0,2), 2); + VERIFY_IS_EQUAL(mat1(1,0), 3); + VERIFY_IS_EQUAL(mat1(1,1), 4); + VERIFY_IS_EQUAL(mat1(1,2), 5); + + VERIFY_IS_EQUAL(mat2(0,0), 0); + VERIFY_IS_EQUAL(mat2(0,1), 1); + VERIFY_IS_EQUAL(mat2(0,2), 2); + VERIFY_IS_EQUAL(mat2(1,0), 3); + VERIFY_IS_EQUAL(mat2(1,1), 4); + VERIFY_IS_EQUAL(mat2(1,2), 5); +} + +static void test_3d() +{ + Tensor<int, 3> mat1(2,3,7); + Tensor<int, 3, RowMajor> mat2(2,3,7); + + int val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val++; + } + } + } + + int col_major[2*3*7]; + int row_major[2*3*7]; + memset(col_major, 0, 2*3*7*sizeof(int)); + memset(row_major, 0, 2*3*7*sizeof(int)); + TensorMap<Tensor<int, 3>> mat3(col_major, 2, 3, 7); + TensorMap<Tensor<int, 3, RowMajor>> mat4(row_major, 2, 3, 7); + + mat3 = mat1; + mat4 = mat2; + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; 
++k) { + VERIFY_IS_EQUAL(mat3(i,j,k), val); + VERIFY_IS_EQUAL(mat4(i,j,k), val); + val++; + } + } + } + + mat1.setZero(); + mat2.setZero(); + mat1 = mat3; + mat2 = mat4; + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat1(i,j,k), val); + VERIFY_IS_EQUAL(mat2(i,j,k), val); + val++; + } + } + } +} + +static void test_same_type() +{ + Tensor<int, 1> orig_tensor(5); + Tensor<int, 1> dest_tensor(5); + orig_tensor.setRandom(); + dest_tensor.setRandom(); + int* orig_data = orig_tensor.data(); + int* dest_data = dest_tensor.data(); + dest_tensor = orig_tensor; + VERIFY_IS_EQUAL(orig_tensor.data(), orig_data); + VERIFY_IS_EQUAL(dest_tensor.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest_tensor(i), orig_tensor(i)); + } + + TensorFixedSize<int, Sizes<5> > orig_array; + TensorFixedSize<int, Sizes<5> > dest_array; + orig_array.setRandom(); + dest_array.setRandom(); + orig_data = orig_array.data(); + dest_data = dest_array.data(); + dest_array = orig_array; + VERIFY_IS_EQUAL(orig_array.data(), orig_data); + VERIFY_IS_EQUAL(dest_array.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest_array(i), orig_array(i)); + } + + int orig[5] = {1, 2, 3, 4, 5}; + int dest[5] = {6, 7, 8, 9, 10}; + TensorMap<Tensor<int, 1> > orig_map(orig, 5); + TensorMap<Tensor<int, 1> > dest_map(dest, 5); + orig_data = orig_map.data(); + dest_data = dest_map.data(); + dest_map = orig_map; + VERIFY_IS_EQUAL(orig_map.data(), orig_data); + VERIFY_IS_EQUAL(dest_map.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest[i], i+1); + } +} + +static void test_auto_resize() +{ + Tensor<int, 1> tensor1; + Tensor<int, 1> tensor2(3); + Tensor<int, 1> tensor3(5); + Tensor<int, 1> tensor4(7); + + Tensor<int, 1> new_tensor(5); + new_tensor.setRandom(); + + tensor1 = tensor2 = tensor3 = tensor4 = new_tensor; + + VERIFY_IS_EQUAL(tensor1.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor2.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor3.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor4.dimension(0), new_tensor.dimension(0)); + for (int i = 0; i < new_tensor.dimension(0); ++i) { + VERIFY_IS_EQUAL(tensor1(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor2(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor3(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor4(i), new_tensor(i)); + } +} + + +static void test_compound_assign() +{ + Tensor<int, 1> start_tensor(10); + Tensor<int, 1> offset_tensor(10); + start_tensor.setRandom(); + offset_tensor.setRandom(); + + Tensor<int, 1> tensor = start_tensor; + tensor += offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i)); + } + + tensor = start_tensor; + tensor -= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i)); + } + + tensor = start_tensor; + tensor *= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i)); + } + + tensor = start_tensor; + tensor /= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i)); + } +} + +static void test_std_initializers_tensor() { +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + Tensor<int, 1> a(3); + a.setValues({0, 1, 2}); + VERIFY_IS_EQUAL(a(0), 0); + VERIFY_IS_EQUAL(a(1), 1); + VERIFY_IS_EQUAL(a(2), 2); + + // It fills the top-left slice. 
+ a.setValues({10, 20}); + VERIFY_IS_EQUAL(a(0), 10); + VERIFY_IS_EQUAL(a(1), 20); + VERIFY_IS_EQUAL(a(2), 2); + + // Chaining. + Tensor<int, 1> a2(3); + a2 = a.setValues({100, 200, 300}); + VERIFY_IS_EQUAL(a(0), 100); + VERIFY_IS_EQUAL(a(1), 200); + VERIFY_IS_EQUAL(a(2), 300); + VERIFY_IS_EQUAL(a2(0), 100); + VERIFY_IS_EQUAL(a2(1), 200); + VERIFY_IS_EQUAL(a2(2), 300); + + Tensor<int, 2> b(2, 3); + b.setValues({{0, 1, 2}, {3, 4, 5}}); + VERIFY_IS_EQUAL(b(0, 0), 0); + VERIFY_IS_EQUAL(b(0, 1), 1); + VERIFY_IS_EQUAL(b(0, 2), 2); + VERIFY_IS_EQUAL(b(1, 0), 3); + VERIFY_IS_EQUAL(b(1, 1), 4); + VERIFY_IS_EQUAL(b(1, 2), 5); + + // It fills the top-left slice. + b.setValues({{10, 20}, {30}}); + VERIFY_IS_EQUAL(b(0, 0), 10); + VERIFY_IS_EQUAL(b(0, 1), 20); + VERIFY_IS_EQUAL(b(0, 2), 2); + VERIFY_IS_EQUAL(b(1, 0), 30); + VERIFY_IS_EQUAL(b(1, 1), 4); + VERIFY_IS_EQUAL(b(1, 2), 5); + + Eigen::Tensor<int, 3> c(3, 2, 4); + c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}}, + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{20, 21, 22, 23}, {24, 25, 26, 27}}}); + VERIFY_IS_EQUAL(c(0, 0, 0), 0); + VERIFY_IS_EQUAL(c(0, 0, 1), 1); + VERIFY_IS_EQUAL(c(0, 0, 2), 2); + VERIFY_IS_EQUAL(c(0, 0, 3), 3); + VERIFY_IS_EQUAL(c(0, 1, 0), 4); + VERIFY_IS_EQUAL(c(0, 1, 1), 5); + VERIFY_IS_EQUAL(c(0, 1, 2), 6); + VERIFY_IS_EQUAL(c(0, 1, 3), 7); + VERIFY_IS_EQUAL(c(1, 0, 0), 10); + VERIFY_IS_EQUAL(c(1, 0, 1), 11); + VERIFY_IS_EQUAL(c(1, 0, 2), 12); + VERIFY_IS_EQUAL(c(1, 0, 3), 13); + VERIFY_IS_EQUAL(c(1, 1, 0), 14); + VERIFY_IS_EQUAL(c(1, 1, 1), 15); + VERIFY_IS_EQUAL(c(1, 1, 2), 16); + VERIFY_IS_EQUAL(c(1, 1, 3), 17); + VERIFY_IS_EQUAL(c(2, 0, 0), 20); + VERIFY_IS_EQUAL(c(2, 0, 1), 21); + VERIFY_IS_EQUAL(c(2, 0, 2), 22); + VERIFY_IS_EQUAL(c(2, 0, 3), 23); + VERIFY_IS_EQUAL(c(2, 1, 0), 24); + VERIFY_IS_EQUAL(c(2, 1, 1), 25); + VERIFY_IS_EQUAL(c(2, 1, 2), 26); + VERIFY_IS_EQUAL(c(2, 1, 3), 27); +#endif // EIGEN_HAS_VARIADIC_TEMPLATES +} + +void test_cxx11_tensor_assign() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_same_type()); + CALL_SUBTEST(test_auto_resize()); + CALL_SUBTEST(test_compound_assign()); + CALL_SUBTEST(test_std_initializers_tensor()); +} diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp new file mode 100644 index 000000000..2ddf47234 --- /dev/null +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -0,0 +1,194 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
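// Broadcasting replicates a tensor along each dimension: broadcasting a
// tensor of dimensions (d0, d1, d2) by factors (b0, b1, b2) produces an
// expression of dimensions (d0*b0, d1*b1, d2*b2) whose coefficient (i, j, k)
// equals the input coefficient (i % d0, j % d1, k % d2). The checks below
// verify exactly this index mapping, for both layouts and for compile-time
// (IndexList) as well as runtime broadcast factors.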
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <int DataLayout> +static void test_simple_broadcasting() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> broadcasts; + broadcasts[0] = 1; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 1; + + Tensor<float, 4, DataLayout> no_broadcast; + no_broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2); + VERIFY_IS_EQUAL(no_broadcast.dimension(1), 3); + VERIFY_IS_EQUAL(no_broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(no_broadcast.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_broadcast(i,j,k,l)); + } + } + } + } + + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 1; + broadcasts[3] = 4; + Tensor<float, 4, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 4); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 28); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 28; ++l) { + VERIFY_IS_EQUAL(tensor(i%2,j%3,k%5,l%7), broadcast(i,j,k,l)); + } + } + } + } +} + + +template <int DataLayout> +static void test_vectorized_broadcasting() +{ + Tensor<float, 3, DataLayout> tensor(8,3,5); + tensor.setRandom(); + array<ptrdiff_t, 3> broadcasts; + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 4; + + Tensor<float, 3, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 16); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k)); + } + } + } + + tensor.resize(11,3,5); + tensor.setRandom(); + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 22); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 22; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k)); + } + } + } +} + + +template <int DataLayout> +static void test_static_broadcasting() +{ + Tensor<float, 3, DataLayout> tensor(8,3,5); + tensor.setRandom(); + +#ifdef EIGEN_HAS_CONSTEXPR + Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; +#else + Eigen::array<int, 3> broadcasts; + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 4; +#endif + + Tensor<float, 3, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 16); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k)); + } + } + } + + tensor.resize(11,3,5); + tensor.setRandom(); + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 22); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 22; ++i) { + for (int j = 0; j < 9; ++j) { + for (int 
k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k)); + } + } + } +} + + +template <int DataLayout> +static void test_fixed_size_broadcasting() +{ + // Need to add a [] operator to the Size class for this to work +#if 0 + Tensor<float, 1, DataLayout> t1(10); + t1.setRandom(); + TensorFixedSize<float, Sizes<1>, DataLayout> t2; + t2 = t2.constant(20.0f); + + Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}}); + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + } + + TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}}); + Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}}); + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); + } +#endif +} + + +void test_cxx11_tensor_broadcasting() +{ + CALL_SUBTEST(test_simple_broadcasting<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting<RowMajor>()); + CALL_SUBTEST(test_vectorized_broadcasting<ColMajor>()); + CALL_SUBTEST(test_vectorized_broadcasting<RowMajor>()); + CALL_SUBTEST(test_static_broadcasting<ColMajor>()); + CALL_SUBTEST(test_static_broadcasting<RowMajor>()); + CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>()); + CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp new file mode 100644 index 000000000..4f7ff7067 --- /dev/null +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -0,0 +1,41 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::array; + +static void test_simple_cast() +{ + Tensor<float, 2> ftensor(20,30); + ftensor.setRandom(); + Tensor<char, 2> chartensor(20,30); + chartensor.setRandom(); + Tensor<std::complex<float>, 2> cplextensor(20,30); + cplextensor.setRandom(); + + chartensor = ftensor.cast<char>(); + cplextensor = ftensor.cast<std::complex<float>>(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j))); + VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float>>(ftensor(i,j))); + } + } +} + + +void test_cxx11_tensor_casts() +{ + CALL_SUBTEST(test_simple_cast()); +} diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp new file mode 100644 index 000000000..d83417872 --- /dev/null +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -0,0 +1,397 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
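// Chipping fixes one dimension of a tensor at a given offset, yielding an
// expression of rank N-1. For example, chip<2>(2) on a (2,3,5,7,11) tensor
// selects the slice with the third index fixed to 2, i.e. a (2,3,7,11) view.
// Both the compile-time form chip<Dim>(offset) and the runtime form
// chip(offset, dim) are exercised below, as rvalues, inside expressions, and
// as lvalues that are assigned into.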
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<int DataLayout> +static void test_simple_chip() +{ + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor<float, 4, DataLayout> chip1; + chip1 = tensor.template chip<0>(1); + + VERIFY_IS_EQUAL(chip1.dimension(0), 3); + VERIFY_IS_EQUAL(chip1.dimension(1), 5); + VERIFY_IS_EQUAL(chip1.dimension(2), 7); + VERIFY_IS_EQUAL(chip1.dimension(3), 11); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip2 = tensor.template chip<1>(1); + VERIFY_IS_EQUAL(chip2.dimension(0), 2); + VERIFY_IS_EQUAL(chip2.dimension(1), 5); + VERIFY_IS_EQUAL(chip2.dimension(2), 7); + VERIFY_IS_EQUAL(chip2.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip3 = tensor.template chip<2>(2); + VERIFY_IS_EQUAL(chip3.dimension(0), 2); + VERIFY_IS_EQUAL(chip3.dimension(1), 3); + VERIFY_IS_EQUAL(chip3.dimension(2), 7); + VERIFY_IS_EQUAL(chip3.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip4(tensor.template chip<3>(5)); + VERIFY_IS_EQUAL(chip4.dimension(0), 2); + VERIFY_IS_EQUAL(chip4.dimension(1), 3); + VERIFY_IS_EQUAL(chip4.dimension(2), 5); + VERIFY_IS_EQUAL(chip4.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip5(tensor.template chip<4>(7)); + VERIFY_IS_EQUAL(chip5.dimension(0), 2); + VERIFY_IS_EQUAL(chip5.dimension(1), 3); + VERIFY_IS_EQUAL(chip5.dimension(2), 5); + VERIFY_IS_EQUAL(chip5.dimension(3), 7); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7)); + } + } + } + } +} + +template<int DataLayout> +static void test_dynamic_chip() +{ + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor<float, 4, DataLayout> chip1; + chip1 = tensor.chip(1, 0); + VERIFY_IS_EQUAL(chip1.dimension(0), 3); + VERIFY_IS_EQUAL(chip1.dimension(1), 5); + VERIFY_IS_EQUAL(chip1.dimension(2), 7); + VERIFY_IS_EQUAL(chip1.dimension(3), 11); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip2 = tensor.chip(1, 1); + VERIFY_IS_EQUAL(chip2.dimension(0), 2); + VERIFY_IS_EQUAL(chip2.dimension(1), 5); + VERIFY_IS_EQUAL(chip2.dimension(2), 7); + VERIFY_IS_EQUAL(chip2.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip3 = tensor.chip(2, 2); + VERIFY_IS_EQUAL(chip3.dimension(0), 2); + VERIFY_IS_EQUAL(chip3.dimension(1), 
3); + VERIFY_IS_EQUAL(chip3.dimension(2), 7); + VERIFY_IS_EQUAL(chip3.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip4(tensor.chip(5, 3)); + VERIFY_IS_EQUAL(chip4.dimension(0), 2); + VERIFY_IS_EQUAL(chip4.dimension(1), 3); + VERIFY_IS_EQUAL(chip4.dimension(2), 5); + VERIFY_IS_EQUAL(chip4.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); + } + } + } + } + + Tensor<float, 4, DataLayout> chip5(tensor.chip(7, 4)); + VERIFY_IS_EQUAL(chip5.dimension(0), 2); + VERIFY_IS_EQUAL(chip5.dimension(1), 3); + VERIFY_IS_EQUAL(chip5.dimension(2), 5); + VERIFY_IS_EQUAL(chip5.dimension(3), 7); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7)); + } + } + } + } +} + +template<int DataLayout> +static void test_chip_in_expr() { + Tensor<float, 5, DataLayout> input1(2,3,5,7,11); + input1.setRandom(); + Tensor<float, 4, DataLayout> input2(3,5,7,11); + input2.setRandom(); + + Tensor<float, 4, DataLayout> result = input1.template chip<0>(0) + input2; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + float expected = input1(0,i,j,k,l) + input2(i,j,k,l); + VERIFY_IS_EQUAL(result(i,j,k,l), expected); + } + } + } + } + + Tensor<float, 3, DataLayout> input3(3,7,11); + input3.setRandom(); + Tensor<float, 3, DataLayout> result2 = input1.template chip<0>(0).template chip<1>(2) + input3; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 11; ++k) { + float expected = input1(0,i,2,j,k) + input3(i,j,k); + VERIFY_IS_EQUAL(result2(i,j,k), expected); + } + } + } +} + +template<int DataLayout> +static void test_chip_as_lvalue() +{ + Tensor<float, 5, DataLayout> input1(2,3,5,7,11); + input1.setRandom(); + + Tensor<float, 4, DataLayout> input2(3,5,7,11); + input2.setRandom(); + Tensor<float, 5, DataLayout> tensor = input1; + tensor.template chip<0>(1) = input2; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (i != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); + } + } + } + } + } + } + + Tensor<float, 4, DataLayout> input3(2,5,7,11); + input3.setRandom(); + tensor = input1; + tensor.template chip<1>(1) = input3; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (j != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m)); + } + } + } + } + } + } + + Tensor<float, 4, DataLayout> input4(2,3,7,11); + input4.setRandom(); + tensor = input1; + tensor.template chip<2>(3) = input4; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (k != 3) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), 
input4(i,j,l,m)); + } + } + } + } + } + } + + Tensor<float, 4, DataLayout> input5(2,3,5,11); + input5.setRandom(); + tensor = input1; + tensor.template chip<3>(4) = input5; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (l != 4) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m)); + } + } + } + } + } + } + + Tensor<float, 4, DataLayout> input6(2,3,5,7); + input6.setRandom(); + tensor = input1; + tensor.template chip<4>(5) = input6; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (m != 5) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l)); + } + } + } + } + } + } + + Tensor<float, 5, DataLayout> input7(2,3,5,7,11); + input7.setRandom(); + tensor = input1; + tensor.chip(0, 0) = input7.chip(0, 0); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (i != 0) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m)); + } + } + } + } + } + } +} + + +template<int DataLayout> +static void test_chip_raw_data() +{ + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + tensor.setRandom(); + + typedef TensorEvaluator<decltype(tensor.template chip<4>(3)), DefaultDevice> Evaluator4; + auto chip = Evaluator4(tensor.template chip<4>(3), DefaultDevice()); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + int chip_index; + if (DataLayout == ColMajor) { + chip_index = i + 2 * (j + 3 * (k + 5 * l)); + } else { + chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i))); + } + VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3)); + } + } + } + } + + typedef TensorEvaluator<decltype(tensor.template chip<0>(0)), DefaultDevice> Evaluator0; + auto chip0 = Evaluator0(tensor.template chip<0>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1; + auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2; + auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0)); + + typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3; + auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0)); +} + +void test_cxx11_tensor_chipping() +{ + CALL_SUBTEST(test_simple_chip<ColMajor>()); + CALL_SUBTEST(test_simple_chip<RowMajor>()); + CALL_SUBTEST(test_dynamic_chip<ColMajor>()); + CALL_SUBTEST(test_dynamic_chip<RowMajor>()); + CALL_SUBTEST(test_chip_in_expr<ColMajor>()); + CALL_SUBTEST(test_chip_in_expr<RowMajor>()); + CALL_SUBTEST(test_chip_as_lvalue<ColMajor>()); + CALL_SUBTEST(test_chip_as_lvalue<RowMajor>()); + CALL_SUBTEST(test_chip_raw_data<ColMajor>()); + CALL_SUBTEST(test_chip_raw_data<RowMajor>()); +} diff --git 
a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
new file mode 100644
index 000000000..186f56ac3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_orderings()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<bool, 3> lt(2,3,7);
+  Tensor<bool, 3> le(2,3,7);
+  Tensor<bool, 3> gt(2,3,7);
+  Tensor<bool, 3> ge(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+
+  lt = mat1 < mat2;
+  le = mat1 <= mat2;
+  gt = mat1 > mat2;
+  ge = mat1 >= mat2;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(lt(i,j,k), mat1(i,j,k) < mat2(i,j,k));
+        VERIFY_IS_EQUAL(le(i,j,k), mat1(i,j,k) <= mat2(i,j,k));
+        VERIFY_IS_EQUAL(gt(i,j,k), mat1(i,j,k) > mat2(i,j,k));
+        VERIFY_IS_EQUAL(ge(i,j,k), mat1(i,j,k) >= mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_equality()
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        // Make roughly half of the entries equal so that both outcomes of the
+        // element-wise comparisons are exercised.
+        if (internal::random<bool>()) {
+          mat2(i,j,k) = mat1(i,j,k);
+        }
+      }
+    }
+  }
+
+  Tensor<bool, 3> eq(2,3,7);
+  Tensor<bool, 3> ne(2,3,7);
+  eq = (mat1 == mat2);
+  ne = (mat1 != mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(eq(i,j,k), mat1(i,j,k) == mat2(i,j,k));
+        VERIFY_IS_EQUAL(ne(i,j,k), mat1(i,j,k) != mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_comparisons()
+{
+  CALL_SUBTEST(test_orderings());
+  CALL_SUBTEST(test_equality());
+}
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
new file mode 100644
index 000000000..9fdf33c16
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -0,0 +1,116 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_dimension_failures()
+{
+  Tensor<int, 3, DataLayout> left(2, 3, 1);
+  Tensor<int, 3, DataLayout> right(3, 3, 1);
+  left.setRandom();
+  right.setRandom();
+
+  // Okay; other dimensions are equal.
+  Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
+
+  // Dimension mismatches.
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1));
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 2));
+
+  // Axis >= NumDims or < 0.
+ VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 3)); + VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1)); +} + +template<int DataLayout> +static void test_static_dimension_failure() +{ + Tensor<int, 2, DataLayout> left(2, 3); + Tensor<int, 3, DataLayout> right(2, 3, 1); + +#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE + // Technically compatible, but we static assert that the inputs have same + // NumDims. + Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0); +#endif + + // This can be worked around in this case. + Tensor<int, 3, DataLayout> concatenation = left + .reshape(Tensor<int, 3>::Dimensions{{2, 3, 1}}) + .concatenate(right, 0); + Tensor<int, 2, DataLayout> alternative = left + .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{2, 3}}), 0); +} + +template<int DataLayout> +static void test_simple_concatenation() +{ + Tensor<int, 3, DataLayout> left(2, 3, 1); + Tensor<int, 3, DataLayout> right(2, 3, 1); + left.setRandom(); + right.setRandom(); + + Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0); + VERIFY_IS_EQUAL(concatenation.dimension(0), 4); + VERIFY_IS_EQUAL(concatenation.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation.dimension(2), 1); + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + } + for (int i = 2; i < 4; ++i) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i - 2, j, 0)); + } + } + + concatenation = left.concatenate(right, 1); + VERIFY_IS_EQUAL(concatenation.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation.dimension(1), 6); + VERIFY_IS_EQUAL(concatenation.dimension(2), 1); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + } + for (int j = 3; j < 6; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i, j - 3, 0)); + } + } + + concatenation = left.concatenate(right, 2); + VERIFY_IS_EQUAL(concatenation.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation.dimension(2), 2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + VERIFY_IS_EQUAL(concatenation(i, j, 1), right(i, j, 0)); + } + } +} + + +// TODO(phli): Add test once we have a real vectorized implementation. +// static void test_vectorized_concatenation() {} + + +void test_cxx11_tensor_concatenation() +{ + CALL_SUBTEST(test_dimension_failures<ColMajor>()); + CALL_SUBTEST(test_dimension_failures<RowMajor>()); + CALL_SUBTEST(test_static_dimension_failure<ColMajor>()); + CALL_SUBTEST(test_static_dimension_failure<RowMajor>()); + CALL_SUBTEST(test_simple_concatenation<ColMajor>()); + CALL_SUBTEST(test_simple_concatenation<RowMajor>()); + // CALL_SUBTEST(test_vectorized_concatenation()); +} diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp new file mode 100644 index 000000000..0ffb02afd --- /dev/null +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -0,0 +1,39 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
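+
+// A TensorMap instantiated over const data is read-only; assigning from it
+// must yield the same values as the tensor it maps.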
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> +using Eigen::Tensor; + + + + +static void test_simple_assign() +{ + Tensor<int, 3> random(2,3,7); + random.setRandom(); + + TensorMap<Tensor<const int, 3> > constant(random.data(), 2, 3, 7); + Tensor<int, 3> result(2,3,7); + result = constant; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k)); + } + } + } +} + +void test_cxx11_tensor_const() +{ + CALL_SUBTEST(test_simple_assign()); +} diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cpp new file mode 100644 index 000000000..9599607c6 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contract_cuda.cpp @@ -0,0 +1,121 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +static void test_cuda_contraction(int m_size, int k_size, int n_size) +{ + std::cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl; + // the output has m_size * n_size elements, which for the sizes used by the + // caller can far exceed 30 * 1024, the number of threads resident on a + // 15 SM GK110 GPU + Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size)); + Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size)); + Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size)); + Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size)); + Eigen::array<DimPair, 1> dims(DimPair(1, 0)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size)); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size)); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size)); + + + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + t_result = t_left.contract(t_right, dims); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); +
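// Compare the device result element-wise against the host reference
+ // computed above; anything off by 1e-4 or more is reported as a mismatch. +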
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] + << " vs " << t_result_gpu.data()[i] << std::endl; + assert(false); + } + } + + cudaFree((void*)d_t_left); + cudaFree((void*)d_t_right); + cudaFree((void*)d_t_result); +} + + +void test_cxx11_tensor_cuda() +{ + std::cout<<"Calling contraction tests"<<std::endl; + CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128)); + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128)); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128)); + } + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k)); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k)); + } + for (int k = 32; k < 256; k++) { + CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128)); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128)); + } + + int m_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025 }; + int n_sizes[] = {31, 39, 63, 64, 65, + 127, 129, 255, 257, 511, + 512, 513, 1023, 1024, 1025 }; + + int k_sizes[] = { 31, 39, 63, 64, 65, + 95, 96, 127, 129, 255, + 257, 511, 512, 513, 1023, + 1024, 1025}; + + for (int i = 0; i < 15; i++) + for (int j = 0; j < 15; j++) + for (int k = 0; k < 17; k++) { + CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); + CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k])); + } +} diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp new file mode 100644 index 000000000..2bcae90b8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -0,0 +1,480 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
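+
+// A contraction is specified as DimPair(l, r) entries: dimension l of the
+// left tensor is summed against dimension r of the right one. With rank-2
+// tensors and DimPair(1, 0) this reduces to an ordinary matrix product (see
+// test_expr() below).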
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::DefaultDevice; +using Eigen::Tensor; + +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +static void test_evals() +{ + Tensor<float, 2, DataLayout> mat1(2, 3); + Tensor<float, 2, DataLayout> mat2(2, 3); + Tensor<float, 2, DataLayout> mat3(3, 2); + + mat1.setRandom(); + mat2.setRandom(); + mat3.setRandom(); + + Tensor<float, 2, DataLayout> mat4(3,3); + mat4.setZero(); + Eigen::array<DimPair, 1> dims3({{DimPair(0, 0)}}); + typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator; + Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice()); + eval.evalTo(mat4.data()); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 3); + VERIFY_IS_EQUAL(eval.dimensions()[1], 3); + + VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2)); + VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2)); + VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2)); + + Tensor<float, 2, DataLayout> mat5(2,2); + mat5.setZero(); + Eigen::array<DimPair, 1> dims4({{DimPair(1, 1)}}); + typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2; + Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice()); + eval2.evalTo(mat5.data()); + EIGEN_STATIC_ASSERT(Evaluator2::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval2.dimensions()[1], 2); + + VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2)); + VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2)); + VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2)); + VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2)); + + Tensor<float, 2, DataLayout> mat6(2,2); + mat6.setZero(); + Eigen::array<DimPair, 1> dims6({{DimPair(1, 0)}}); + typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3; + Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice()); + eval3.evalTo(mat6.data()); + EIGEN_STATIC_ASSERT(Evaluator3::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval3.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval3.dimensions()[1], 2); + + VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0)); + VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1)); + VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0)); + VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1)); +} + +template<int DataLayout> +static void test_scalar() +{ + Tensor<float, 1, DataLayout> vec1({6}); + Tensor<float, 1, DataLayout> vec2({6}); + + vec1.setRandom(); + vec2.setRandom(); + + Tensor<float, 1, DataLayout> scalar(1); + scalar.setZero(); + Eigen::array<DimPair, 1> dims({{DimPair(0, 
0)}}); + typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator; + Evaluator eval(vec1.contract(vec2, dims), DefaultDevice()); + eval.evalTo(scalar.data()); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + + float expected = 0.0f; + for (int i = 0; i < 6; ++i) { + expected += vec1(i) * vec2(i); + } + VERIFY_IS_APPROX(scalar(0), expected); +} + +template<int DataLayout> +static void test_multidims() +{ + Tensor<float, 3, DataLayout> mat1(2, 2, 2); + Tensor<float, 4, DataLayout> mat2(2, 2, 2, 2); + + mat1.setRandom(); + mat2.setRandom(); + + Tensor<float, 3, DataLayout> mat3(2, 2, 2); + mat3.setZero(); + Eigen::array<DimPair, 2> dims({{DimPair(1, 2), DimPair(2, 3)}}); + typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator; + Evaluator eval(mat1.contract(mat2, dims), DefaultDevice()); + eval.evalTo(mat3.data()); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval.dimensions()[1], 2); + VERIFY_IS_EQUAL(eval.dimensions()[2], 2); + + VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) + + mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1)); + VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) + + mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1)); + VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) + + mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1)); + VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) + + mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1)); + VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) + + mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1)); + VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) + + mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1)); + VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) + + mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1)); + VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) + + mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1)); +} + +template<int DataLayout> +static void test_holes() { + Tensor<float, 4, DataLayout> t1(2, 5, 7, 3); + Tensor<float, 5, DataLayout> t2(2, 7, 11, 13, 3); + t1.setRandom(); + t2.setRandom(); + + Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(3, 4)}}); + Tensor<float, 5, DataLayout> result = t1.contract(t2, dims); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + VERIFY_IS_EQUAL(result.dimension(2), 7); + VERIFY_IS_EQUAL(result.dimension(3), 11); + VERIFY_IS_EQUAL(result.dimension(4), 13); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 5; ++l) { + for (int m = 0; m < 5; ++m) { + VERIFY_IS_APPROX(result(i, j, k, l, m), + t1(0, i, j, 0) * t2(0, k, l, m, 0) + + t1(1, i, j, 0) * t2(1, k, l, m, 0) + + t1(0, i, j, 1) * t2(0, k, l, m, 1) + + t1(1, i, j, 1) * t2(1, k, l, m, 1) + + t1(0, i, j, 2) * t2(0, k, l, m, 2) + + t1(1, i, j, 2) * t2(1, k, l, m, 2)); + } + } + } + } + } +} + +template<int DataLayout> +static void test_full_redux() +{ + Tensor<float, 2, DataLayout> t1(2, 2); + Tensor<float, 3, DataLayout> t2(2, 2, 2); + t1.setRandom(); + t2.setRandom(); + + Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 
1)}}); + Tensor<float, 1, DataLayout> result = t1.contract(t2, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0) + + t1(0, 1) * t2(0, 1, 0) + t1(1, 1) * t2(1, 1, 0)); + VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(0, 0, 1) + t1(1, 0) * t2(1, 0, 1) + + t1(0, 1) * t2(0, 1, 1) + t1(1, 1) * t2(1, 1, 1)); + + dims[0] = DimPair(1, 0); + dims[1] = DimPair(2, 1); + result = t2.contract(t1, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(0, 1, 0) + + t1(0, 1) * t2(0, 0, 1) + t1(1, 1) * t2(0, 1, 1)); + VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(1, 0, 0) + t1(1, 0) * t2(1, 1, 0) + + t1(0, 1) * t2(1, 0, 1) + t1(1, 1) * t2(1, 1, 1)); +} + +template<int DataLayout> +static void test_contraction_of_contraction() +{ + Tensor<float, 2, DataLayout> t1(2, 2); + Tensor<float, 2, DataLayout> t2(2, 2); + Tensor<float, 2, DataLayout> t3(2, 2); + Tensor<float, 2, DataLayout> t4(2, 2); + t1.setRandom(); + t2.setRandom(); + t3.setRandom(); + t4.setRandom(); + + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + auto contract1 = t1.contract(t2, dims); + auto diff = t3 - contract1; + auto contract2 = t1.contract(t4, dims); + Tensor<float, 2, DataLayout> result = contract2.contract(diff, dims); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 2); + + Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> + m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2), + m4(t4.data(), 2, 2); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> + expected = (m1 * m4) * (m3 - m1 * m2); + + VERIFY_IS_APPROX(result(0, 0), expected(0, 0)); + VERIFY_IS_APPROX(result(0, 1), expected(0, 1)); + VERIFY_IS_APPROX(result(1, 0), expected(1, 0)); + VERIFY_IS_APPROX(result(1, 1), expected(1, 1)); +} + +template<int DataLayout> +static void test_expr() +{ + Tensor<float, 2, DataLayout> mat1(2, 3); + Tensor<float, 2, DataLayout> mat2(3, 2); + mat1.setRandom(); + mat2.setRandom(); + + Tensor<float, 2, DataLayout> mat3(2,2); + + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1)); + VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); +} + +template<int DataLayout> +static void test_out_of_order_contraction() +{ + Tensor<float, 3, DataLayout> mat1(2, 2, 2); + Tensor<float, 3, DataLayout> mat2(2, 2, 2); + + mat1.setRandom(); + mat2.setRandom(); + + Tensor<float, 2, DataLayout> mat3(2, 2); + + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(0, 2)}}); + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0, 0), + mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) + + mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(1, 0), + mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) + + mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(0, 1), + mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) + + mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1)); + VERIFY_IS_APPROX(mat3(1, 1), + mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); + + Eigen::array<DimPair, 2> dims2({{DimPair(0, 2), 
DimPair(2, 0)}}); + mat3 = mat1.contract(mat2, dims2); + + VERIFY_IS_APPROX(mat3(0, 0), + mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) + + mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(1, 0), + mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) + + mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(0, 1), + mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) + + mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1)); + VERIFY_IS_APPROX(mat3(1, 1), + mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); + +} + +template<int DataLayout> +static void test_consistency() +{ + // this does something like testing (A*B)^T = (B^T * A^T) + + Tensor<float, 3, DataLayout> mat1(4, 3, 5); + Tensor<float, 5, DataLayout> mat2(3, 2, 1, 5, 4); + mat1.setRandom(); + mat2.setRandom(); + + Tensor<float, 4, DataLayout> mat3(5, 2, 1, 5); + Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5); + + // contract on dimensions of size 4 and 3 + Eigen::array<DimPair, 2> dims1({{DimPair(0, 4), DimPair(1, 0)}}); + Eigen::array<DimPair, 2> dims2({{DimPair(4, 0), DimPair(0, 1)}}); + + mat3 = mat1.contract(mat2, dims1); + mat4 = mat2.contract(mat1, dims2); + + // check that these are equal except for ordering of dimensions + if (DataLayout == ColMajor) { + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 10; j++) { + VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]); + } + } + } else { + // Row major + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 10; j++) { + VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]); + } + } + } +} + +template<int DataLayout> +static void test_large_contraction() +{ + Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31); + Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + // Add a little offset so that the results won't be close to zero. 
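+ // (This contraction collapses dims (2,3) of t_left against dims (0,1) of
+ // t_right, summing over 8 * 31 = 248 shared coefficients: exactly the
+ // (30*50 = 1500) x 248 times 248 x (7*20*10 = 1400) product mapped below.)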
+ t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + +template<int DataLayout> +static void test_matrix_vector() +{ + Tensor<float, 2, DataLayout> t_left(30, 50); + Tensor<float, 1, DataLayout> t_right(50); + Tensor<float, 1, DataLayout> t_result(30); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 30, 50); + MapXf m_right(t_right.data(), 50, 1); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(30, 1); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}}; + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1)); + } +} + + +template<int DataLayout> +static void test_tensor_vector() +{ + Tensor<float, 3, DataLayout> t_left(7, 13, 17); + Tensor<float, 2, DataLayout> t_right(1, 7); + + t_left.setRandom(); + t_right.setRandom(); + + typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair; + Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}}; + Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 7, 13*17); + MapXf m_right(t_right.data(), 1, 7); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose(); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1)); + } +} + + +template<int DataLayout> +static void test_small_blocking_factors() +{ + Tensor<float, 4, DataLayout> t_left(30, 5, 3, 31); + Tensor<float, 5, DataLayout> t_right(3, 31, 7, 20, 1); + t_left.setRandom(); + t_right.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + // Force the cache sizes, which results in smaller blocking factors. 
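+ // setCpuCacheSizes(l1, l2, l3) overrides the detected per-level cache sizes
+ // (in bytes); values this small force tiny GEMM blocking factors.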
+ Eigen::setCpuCacheSizes(896, 1920, 2944); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + Tensor<float, 5, DataLayout> t_result; + t_result = t_left.contract(t_right, dims); + + // compute result using a simple eigen matrix product + Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_left(t_left.data(), 150, 93); + Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_right(t_right.data(), 93, 140); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + +void test_cxx11_tensor_contraction() +{ + CALL_SUBTEST(test_evals<ColMajor>()); + CALL_SUBTEST(test_evals<RowMajor>()); + CALL_SUBTEST(test_scalar<ColMajor>()); + CALL_SUBTEST(test_scalar<RowMajor>()); + CALL_SUBTEST(test_multidims<ColMajor>()); + CALL_SUBTEST(test_multidims<RowMajor>()); + CALL_SUBTEST(test_holes<ColMajor>()); + CALL_SUBTEST(test_holes<RowMajor>()); + CALL_SUBTEST(test_full_redux<ColMajor>()); + CALL_SUBTEST(test_full_redux<RowMajor>()); + CALL_SUBTEST(test_contraction_of_contraction<ColMajor>()); + CALL_SUBTEST(test_contraction_of_contraction<RowMajor>()); + CALL_SUBTEST(test_expr<ColMajor>()); + CALL_SUBTEST(test_expr<RowMajor>()); + CALL_SUBTEST(test_out_of_order_contraction<ColMajor>()); + CALL_SUBTEST(test_out_of_order_contraction<RowMajor>()); + CALL_SUBTEST(test_consistency<ColMajor>()); + CALL_SUBTEST(test_consistency<RowMajor>()); + CALL_SUBTEST(test_large_contraction<ColMajor>()); + CALL_SUBTEST(test_large_contraction<RowMajor>()); + CALL_SUBTEST(test_matrix_vector<ColMajor>()); + CALL_SUBTEST(test_matrix_vector<RowMajor>()); + CALL_SUBTEST(test_tensor_vector<ColMajor>()); + CALL_SUBTEST(test_tensor_vector<RowMajor>()); + CALL_SUBTEST(test_small_blocking_factors<ColMajor>()); + CALL_SUBTEST(test_small_blocking_factors<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp new file mode 100644 index 000000000..4672db463 --- /dev/null +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -0,0 +1,141 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
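+
+// convolve(kernel, dims) slides the kernel along the listed dimensions and
+// accumulates products without flipping the kernel (a correlation, in signal
+// processing terms); each convolved dimension shrinks to
+// input_size - kernel_size + 1.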
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::DefaultDevice; + +static void test_evals() +{ + Tensor<float, 2> input(3, 3); + Tensor<float, 1> kernel(2); + + input.setRandom(); + kernel.setRandom(); + + Tensor<float, 2> result(2,3); + result.setZero(); + Eigen::array<Tensor<float, 2>::Index, 1> dims3({0}); + + typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator; + Evaluator eval(input.convolve(kernel, dims3), DefaultDevice()); + eval.evalTo(result.data()); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval.dimensions()[1], 3); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 + VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 + VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 +} + + +static void test_expr() +{ + Tensor<float, 2> input(3, 3); + Tensor<float, 2> kernel(2, 2); + input.setRandom(); + kernel.setRandom(); + + Tensor<float, 2> result(2,2); + Eigen::array<ptrdiff_t, 2> dims({0, 1}); + result = input.convolve(kernel, dims); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); +} + + +static void test_modes() { + Tensor<float, 1> input(3); + Tensor<float, 1> kernel(3); + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + const Eigen::array<ptrdiff_t, 1> dims{{0}}; + Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(0, 0); + Tensor<float, 1> valid(1); + valid = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(1, 1); + Tensor<float, 1> same(3); + same = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
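+ // Padding by (2, 2) grows the length-3 input to 7, and the length-3 kernel
+ // then yields 7 - 3 + 1 = 5 outputs: numpy's "full" length N + K - 1.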
+ padding[0] = std::make_pair(2, 2); + Tensor<float, 1> full(5); + full = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); +} + + +static void test_strides() { + Tensor<float, 1> input(13); + Tensor<float, 1> kernel(3); + input.setRandom(); + kernel.setRandom(); + + const Eigen::array<ptrdiff_t, 1> dims{{0}}; + const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}}; + const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}}; + + Tensor<float, 1> result; + result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + + + + +void test_cxx11_tensor_convolution() +{ + CALL_SUBTEST(test_evals()); + CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_modes()); + CALL_SUBTEST(test_strides()); +} diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp new file mode 100644 index 000000000..8c1ca1bf8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -0,0 +1,514 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// TODO(mdevin): Free the cuda memory. 
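+// (Every subtest below cudaMalloc()s device buffers that are never freed.)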
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +void test_cuda_elementwise_small() { + Tensor<float, 1> in1(Eigen::array<int, 1>(2)); + Tensor<float, 1> in2(Eigen::array<int, 1>(2)); + Tensor<float, 1> out(Eigen::array<int, 1>(2)); + in1.setRandom(); + in2.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( + d_in1, Eigen::array<int, 1>(2)); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2( + d_in2, Eigen::array<int, 1>(2)); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out( + d_out, Eigen::array<int, 1>(2)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX( + out(Eigen::array<int, 1>(i)), + in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i))); + } +} + +void test_cuda_elementwise() +{ + Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); + in1.setRandom(); + in2.setRandom(); + in3.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t in3_bytes = in3.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_in3; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_in3), in3_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) 
{ + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k))); + } + } + } +} + + +void test_cuda_reduction() +{ + Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113)); + Tensor<float, 2> out(Eigen::array<int, 2>(72,97)); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113)); + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97)); + + array<int, 2> reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + float expected = 0; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 113; ++l) { + expected = + std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l))); + } + } + VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected); + } + } +} + +template<int DataLayout> +static void test_cuda_contraction() +{ + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31)); + Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1)); + Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > + gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31)); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > + gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1)); + Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1)); + + typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf; + MapXf m_left(t_left.data(), 300, 93); + MapXf m_right(t_right.data(), 93, 140); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140); + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 2> dims; 
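+ // Contract dims (2,3) of the left tensor against dims (0,1) of the right
+ // one; this matches the 300 x 93 times 93 x 140 host-side product computed
+ // through the maps above.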
+ dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); + + m_result = m_left * m_right; + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + + cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + assert(false); + } + } +} + +static void test_cuda_convolution_1d() +{ + Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); + Tensor<float, 1> kernel(Eigen::array<int, 1>(4)); + Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); + Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4)); + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137)); + + Eigen::array<int, 1> dims(1); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 34; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(Eigen::array<int, 4>(i,j,k,l)); + const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) + + input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) + + input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) + + input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + + +static void test_cuda_convolution_2d() +{ + Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137)); + Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4)); + Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == 
cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137)); + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4)); + Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137)); + + Eigen::array<int, 2> dims(1,2); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(Eigen::array<int, 4>(i,j,k,l)); + const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) + + input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) + + input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) + + input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) + + input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) + + input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) + + input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) + + input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) + + input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) + + input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) + + input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) + + input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + + +static void test_cuda_convolution_3d() +{ + Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17)); + Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2)); + Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2)); + Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17)); + + Eigen::array<int, 3> dims(1,2,3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + 
for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 17; ++m) { + const float result = out(Eigen::array<int, 5>(i,j,k,l,m)); + const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) + + input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) + + input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) + + input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) + + input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) + + input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) + + input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) + + input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) + + input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) + + input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) + + input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) + + input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) + + input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) + + input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) + + input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) + + input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) + + input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) + + input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) + + input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) + + input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) + + input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) + + input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) + + input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) + + input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + } +} + +static float* CudaCopyFloat(float* data, int size) { + const int nbytes = size * sizeof(float); + float* result = NULL; + if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) { + return NULL; + } else { + if (data != NULL) { + cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice); + } + return result; + } +} + +static void test_cuda_constant_broadcast() +{ + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Tensor<float, 1> t1(10); + for (int i = 0; i < 10; ++i) { + t1(i) = 10.0f * i; + } + float* t1_cuda = CudaCopyFloat(t1.data(), t1.size()); + Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10); + + Tensor<float, 1> t2(1); + t2 = t2.constant(20.0f); + float* t2_cuda = CudaCopyFloat(t2.data(), t2.size()); + Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1); + + float* t3_cuda = CudaCopyFloat(NULL, 10); + Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10); + + t3_gpu.device(gpu_device) = + t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10)); + + Eigen::Tensor<float, 1> t3(10); + cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < 10; ++i) { + 
VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + } +} + + +void test_cuda_cast() +{ + Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); + in.setRandom(); + + std::size_t in_bytes = in.size() * sizeof(double); + std::size_t out_bytes = out.size() * sizeof(float); + + double* d_in; + float* d_out; + cudaMalloc((void**)(&d_in), in_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); + + gpu_out.device(gpu_device) = gpu_in.template cast<float>(); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k)))); + } + } + } +} + + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST(test_cuda_elementwise_small()); + CALL_SUBTEST(test_cuda_elementwise()); + CALL_SUBTEST(test_cuda_reduction()); + CALL_SUBTEST(test_cuda_contraction<ColMajor>()); + CALL_SUBTEST(test_cuda_contraction<RowMajor>()); + CALL_SUBTEST(test_cuda_convolution_1d()); + CALL_SUBTEST(test_cuda_convolution_2d()); + CALL_SUBTEST(test_cuda_convolution_3d()); + CALL_SUBTEST(test_cuda_constant_broadcast()); + CALL_SUBTEST(test_cuda_cast()); +} diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp new file mode 100644 index 000000000..f2d7e4ce6 --- /dev/null +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -0,0 +1,391 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
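+
+// The expressions below are evaluated twice, once through a CPUContext and
+// once through a GPUContext; both devices must produce the same results.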
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_device +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +// Context for evaluation on cpu +struct CPUContext { + CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { + kernel_1d_(0) = 3.14f; + kernel_1d_(1) = 2.7f; + + kernel_2d_(0,0) = 3.14f; + kernel_2d_(1,0) = 2.7f; + kernel_2d_(0,1) = 0.2f; + kernel_2d_(1,1) = 7.0f; + + kernel_3d_(0,0,0) = 3.14f; + kernel_3d_(0,1,0) = 2.7f; + kernel_3d_(0,0,1) = 0.2f; + kernel_3d_(0,1,1) = 7.0f; + kernel_3d_(1,0,0) = -1.0f; + kernel_3d_(1,1,0) = -0.3f; + kernel_3d_(1,0,1) = -0.7f; + kernel_3d_(1,1,1) = -0.5f; + } + + const Eigen::DefaultDevice& device() const { return cpu_device_; } + + const Eigen::Tensor<float, 3>& in1() const { return in1_; } + const Eigen::Tensor<float, 3>& in2() const { return in2_; } + Eigen::Tensor<float, 3>& out() { return out_; } + const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; } + const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; } + const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; } + + private: + const Eigen::Tensor<float, 3>& in1_; + const Eigen::Tensor<float, 3>& in2_; + Eigen::Tensor<float, 3>& out_; + + Eigen::Tensor<float, 1> kernel_1d_; + Eigen::Tensor<float, 2> kernel_2d_; + Eigen::Tensor<float, 3> kernel_3d_; + + Eigen::DefaultDevice cpu_device_; +}; + + +// Context for evaluation on GPU +struct GPUContext { + GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { + assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); + float kernel_1d_val[] = {3.14f, 2.7f}; + assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); + float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; + assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); + float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; + assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaStreamCreate(&stream_) == cudaSuccess); + } + ~GPUContext() { + assert(cudaFree(kernel_1d_) == cudaSuccess); + assert(cudaFree(kernel_2d_) == cudaSuccess); + assert(cudaFree(kernel_3d_) == cudaSuccess); + assert(cudaStreamDestroy(stream_) == cudaSuccess); + } + + const Eigen::GpuDevice& device() const { return gpu_device_; } + + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; } + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; } + Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; } + Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); } + Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); } + 
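// Like the 1d and 2d accessors above, kernel3d() wraps the raw device
+ // buffer allocated in the constructor in a TensorMap of matching rank. +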
Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); } + + private: + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_; + const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_; + Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_; + + float* kernel_1d_; + float* kernel_2d_; + float* kernel_3d_; + + cudaStream_t stream_; + Eigen::GpuDevice gpu_device_; +}; + + +// The actual expression to evaluate +template <typename Context> +static void test_contextual_eval(Context* context) +{ + context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); +} + +template <typename Context> +static void test_forced_contextual_eval(Context* context) +{ + context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); +} + +template <typename Context> +static void test_compound_assignment(Context* context) +{ + context->out().device(context->device()) = context->in1().constant(2.718f); + context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; +} + + +template <typename Context> +static void test_contraction(Context* context) +{ + Eigen::array<std::pair<int, int>, 2> dims; + dims[0] = std::make_pair(1, 1); + dims[1] = std::make_pair(2, 2); + + Eigen::array<int, 2> shape(40, 50*70); + + Eigen::DSizes<int, 2> indices(0,0); + Eigen::DSizes<int, 2> sizes(40,40); + + context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); +} + + +template <typename Context> +static void test_1d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(40,49,70); + + Eigen::array<int, 1> dims(1); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); +} + +template <typename Context> +static void test_2d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(40,49,69); + + Eigen::array<int, 2> dims(1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); +} + +template <typename Context> +static void test_3d_convolution(Context* context) +{ + Eigen::DSizes<int, 3> indices(0,0,0); + Eigen::DSizes<int, 3> sizes(39,49,69); + + Eigen::array<int, 3> dims(0,1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); +} + + +static void test_cpu() { + Eigen::Tensor<float, 3> in1(40,50,70); + Eigen::Tensor<float, 3> in2(40,50,70); + Eigen::Tensor<float, 3> out(40,50,70); + + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); + + CPUContext context(in1, in2, out); + test_contextual_eval(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_forced_contextual_eval(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); + } + } + } + + test_compound_assignment(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } 
+ } + + test_contraction(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(i,j,0); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(i, k, l) * in2(j, k, l); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + + (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + + (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } +} + +static void test_gpu() { + Eigen::Tensor<float, 3> in1(40,50,70); + Eigen::Tensor<float, 3> in2(40,50,70); + Eigen::Tensor<float, 3> out(40,50,70); + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70); + + GPUContext context(gpu_in1, gpu_in2, gpu_out); + test_contextual_eval(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_forced_contextual_eval(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); + } + } + } + + test_compound_assignment(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); + } + } + } + + test_contraction(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) 
{ + for (int j = 0; j < 40; ++j) { + const float result = out(i,j,0); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(i, k, l) * in2(j, k, l); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f + + in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); + VERIFY_IS_APPROX(expected, result); + } + } + } +} + + +void test_cxx11_tensor_device() +{ + CALL_SUBTEST(test_cpu()); + CALL_SUBTEST(test_gpu()); +} diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp new file mode 100644 index 000000000..0cc4e86f7 --- /dev/null +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
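+
+// A side-by-side sketch of the two dimension flavors exercised in this file
+// (illustration only; both expose the same query interface):
+//
+//   Eigen::DSizes<int, 3> dyn(2, 3, 7);   // extents chosen at run time
+//   Eigen::Sizes<2, 3, 7> stat;           // extents fixed at compile time
+//   // dyn.TotalSize() == stat.TotalSize() == 42
+//   // Eigen::dimensions_match(dyn, stat) holds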
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + + +static void test_dynamic_size() +{ + Eigen::DSizes<int, 3> dimensions(2,3,7); + + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); + VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); + VERIFY_IS_EQUAL((int)dimensions[0], 2); + VERIFY_IS_EQUAL((int)dimensions[1], 3); + VERIFY_IS_EQUAL((int)dimensions[2], 7); +} + +static void test_fixed_size() +{ + Eigen::Sizes<2,3,7> dimensions; + + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); + VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); +} + + +static void test_match() +{ + Eigen::DSizes<int, 3> dyn(2,3,7); + Eigen::Sizes<2,3,7> stat; + VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true); +} + + +void test_cxx11_tensor_dimension() +{ + CALL_SUBTEST(test_dynamic_size()); + CALL_SUBTEST(test_fixed_size()); + CALL_SUBTEST(test_match()); +} diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp new file mode 100644 index 000000000..695565e9b --- /dev/null +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -0,0 +1,314 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor<float, 1> vec1({6}); + Tensor<float, 1, RowMajor> vec2({6}); + + vec1(0) = 4.0; vec2(0) = 0.0; + vec1(1) = 8.0; vec2(1) = 1.0; + vec1(2) = 15.0; vec2(2) = 2.0; + vec1(3) = 16.0; vec2(3) = 3.0; + vec1(4) = 23.0; vec2(4) = 4.0; + vec1(5) = 42.0; vec2(5) = 5.0; + + float data3[6]; + TensorMap<Tensor<float, 1>> vec3(data3, 6); + vec3 = vec1.sqrt(); + float data4[6]; + TensorMap<Tensor<float, 1, RowMajor>> vec4(data4, 6); + vec4 = vec2.square(); + float data5[6]; + TensorMap<Tensor<float, 1, RowMajor>> vec5(data5, 6); + vec5 = vec2.cube(); + + VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); + VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); + VERIFY_IS_APPROX(vec3(2), sqrtf(15.0)); + VERIFY_IS_APPROX(vec3(3), sqrtf(16.0)); + VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); + VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); + + VERIFY_IS_APPROX(vec4(0), 0.0f); + VERIFY_IS_APPROX(vec4(1), 1.0f); + VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f); + VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f); + VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f); + VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f); + + VERIFY_IS_APPROX(vec5(0), 0.0f); + VERIFY_IS_APPROX(vec5(1), 1.0f); + VERIFY_IS_APPROX(vec5(2), 2.0f * 2.0f * 2.0f); + VERIFY_IS_APPROX(vec5(3), 3.0f * 3.0f * 3.0f); + VERIFY_IS_APPROX(vec5(4), 4.0f * 4.0f * 4.0f); + VERIFY_IS_APPROX(vec5(5), 5.0f * 5.0f * 5.0f); + + vec3 = vec1 + vec2; + VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); + VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); + VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f); + VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f); + VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f); + VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f); +} + +static void test_2d() +{ + float data1[6]; + TensorMap<Tensor<float, 2>> mat1(data1, 2, 3); + float data2[6]; + TensorMap<Tensor<float, 2, RowMajor>> mat2(data2, 2, 3); + + mat1(0,0) = 0.0; + mat1(0,1) = 1.0; + mat1(0,2) = 2.0; + mat1(1,0) = 3.0; + mat1(1,1) = 4.0; + mat1(1,2) = 5.0; + + mat2(0,0) = -0.0; + mat2(0,1) = -1.0; + mat2(0,2) = -2.0; + mat2(1,0) = -3.0; + mat2(1,1) = -4.0; + mat2(1,2) = -5.0; + + Tensor<float, 2> mat3(2,3); + Tensor<float, 2, RowMajor> mat4(2,3); + mat3 = mat1.abs(); + mat4 = mat2.abs(); + + VERIFY_IS_APPROX(mat3(0,0), 0.0f); + VERIFY_IS_APPROX(mat3(0,1), 1.0f); + VERIFY_IS_APPROX(mat3(0,2), 2.0f); + VERIFY_IS_APPROX(mat3(1,0), 3.0f); + VERIFY_IS_APPROX(mat3(1,1), 4.0f); + VERIFY_IS_APPROX(mat3(1,2), 5.0f); + + VERIFY_IS_APPROX(mat4(0,0), 0.0f); + VERIFY_IS_APPROX(mat4(0,1), 1.0f); + VERIFY_IS_APPROX(mat4(0,2), 2.0f); + VERIFY_IS_APPROX(mat4(1,0), 3.0f); + VERIFY_IS_APPROX(mat4(1,1), 4.0f); + VERIFY_IS_APPROX(mat4(1,2), 5.0f); +} + +static void test_3d() +{ + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3, RowMajor> mat2(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val += 1.0; + } + } + } + + Tensor<float, 3> mat3(2,3,7); + mat3 = mat1 + mat1; + Tensor<float, 3, RowMajor> mat4(2,3,7); + mat4 = mat2 * 3.14f; + Tensor<float, 3> mat5(2,3,7); + mat5 = mat1.inverse().log(); + Tensor<float, 3, RowMajor> mat6(2,3,7); + mat6 = mat2.pow(0.5f) * 3.14f; + Tensor<float, 3> mat7(2,3,7); + mat7 = mat1.cwiseMax(mat5 * 2.0f).exp(); + Tensor<float, 3, RowMajor> mat8(2,3,7); + mat8 = (-mat2).exp() * 3.14f; + Tensor<float, 3, RowMajor> mat9(2,3,7); + mat9 = mat2 + 3.14f; + Tensor<float, 3, RowMajor> mat10(2,3,7); + mat10 = mat2 - 3.14f; + Tensor<float, 3, RowMajor> 
mat11(2,3,7); + mat11 = mat2 / 3.14f; + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), val + val); + VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f); + VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val)); + VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f); + VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f))); + VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f); + VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f); + VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f); + VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f); + val += 1.0; + } + } + } +} + +static void test_constants() +{ + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<float, 3> mat3(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + mat2 = mat1.constant(3.14f); + mat3 = mat1.cwiseMax(7.3f).exp(); + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat2(i,j,k), 3.14f); + VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f))); + val += 1.0; + } + } + } +} + +static void test_boolean() +{ + Tensor<int, 1> vec(6); + std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data()); + + // Test ||. + Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4); + VERIFY_IS_EQUAL(bool1[0], true); + VERIFY_IS_EQUAL(bool1[1], false); + VERIFY_IS_EQUAL(bool1[2], false); + VERIFY_IS_EQUAL(bool1[3], false); + VERIFY_IS_EQUAL(bool1[4], false); + VERIFY_IS_EQUAL(bool1[5], true); + + // Test &&, including cast of operand vec. + Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4); + VERIFY_IS_EQUAL(bool2[0], false); + VERIFY_IS_EQUAL(bool2[1], true); + VERIFY_IS_EQUAL(bool2[2], true); + VERIFY_IS_EQUAL(bool2[3], true); + VERIFY_IS_EQUAL(bool2[4], false); + VERIFY_IS_EQUAL(bool2[5], false); + + // Compilation tests: + // Test Tensor<bool> against results of cast or comparison; verifies that + // CoeffReturnType is set to match Op return type of bool for Unary and Binary + // Ops. + Tensor<bool, 1> bool3 = vec.cast<bool>() && bool2; + bool3 = vec < vec.constant(4) && bool2; +} + +static void test_functors() +{ + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<float, 3> mat3(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + mat2 = mat1.inverse().unaryExpr(&asinf); + mat3 = mat1.unaryExpr(&tanhf); + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k))); + VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k))); + val += 1.0; + } + } + } +} + +static void test_type_casting() +{ + Tensor<bool, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<double, 3> mat3(2,3,7); + mat1.setRandom(); + mat2.setRandom(); + + mat3 = mat1.template cast<double>(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 
1.0 : 0.0); + } + } + } + + mat3 = mat2.template cast<double>(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), static_cast<double>(mat2(i,j,k))); + } + } + } +} + +static void test_select() +{ + Tensor<float, 3> selector(2,3,7); + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<float, 3> result(2,3,7); + + selector.setRandom(); + mat1.setRandom(); + mat2.setRandom(); + result = (selector > selector.constant(0.5f)).select(mat1, mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? mat1(i,j,k) : mat2(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_expr() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_constants()); + CALL_SUBTEST(test_boolean()); + CALL_SUBTEST(test_functors()); + CALL_SUBTEST(test_type_casting()); + CALL_SUBTEST(test_select()); +} diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp new file mode 100644 index 000000000..8a27f5ad8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + + +static void test_1d() +{ + TensorFixedSize<float, Sizes<6> > vec1; + TensorFixedSize<float, Sizes<6>, RowMajor> vec2; + + VERIFY_IS_EQUAL((vec1.size()), 6); + // VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6); + // VERIFY_IS_EQUAL((vec1.dimension(0)), 6); + + vec1(0) = 4.0; vec2(0) = 0.0; + vec1(1) = 8.0; vec2(1) = 1.0; + vec1(2) = 15.0; vec2(2) = 2.0; + vec1(3) = 16.0; vec2(3) = 3.0; + vec1(4) = 23.0; vec2(4) = 4.0; + vec1(5) = 42.0; vec2(5) = 5.0; + + float data3[6]; + TensorMap<TensorFixedSize<float, Sizes<6> > > vec3(data3, 6); + vec3 = vec1.sqrt(); + float data4[6]; + TensorMap<TensorFixedSize<float, Sizes<6>, RowMajor> > vec4(data4, 6); + vec4 = vec2.sqrt(); + + VERIFY_IS_EQUAL((vec3.size()), 6); + VERIFY_IS_EQUAL(vec3.rank(), 1); + // VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6); + // VERIFY_IS_EQUAL((vec3.dimension(0)), 6); + + VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); + VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); + VERIFY_IS_APPROX(vec3(2), sqrtf(15.0)); + VERIFY_IS_APPROX(vec3(3), sqrtf(16.0)); + VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); + VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); + + VERIFY_IS_APPROX(vec4(0), sqrtf(0.0)); + VERIFY_IS_APPROX(vec4(1), sqrtf(1.0)); + VERIFY_IS_APPROX(vec4(2), sqrtf(2.0)); + VERIFY_IS_APPROX(vec4(3), sqrtf(3.0)); + VERIFY_IS_APPROX(vec4(4), sqrtf(4.0)); + VERIFY_IS_APPROX(vec4(5), sqrtf(5.0)); + + vec3 = vec1 + vec2; + VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); + VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); + VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f); + VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f); + VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f); + VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f); +} + +static void test_2d() +{ + float data1[6]; + TensorMap<TensorFixedSize<float, Sizes<2, 3> >> mat1(data1,2,3); + float data2[6]; + TensorMap<TensorFixedSize<float, Sizes<2, 3>, 
RowMajor>> mat2(data2,2,3); + + VERIFY_IS_EQUAL((mat1.size()), 2*3); + VERIFY_IS_EQUAL(mat1.rank(), 2); + // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); + + mat1(0,0) = 0.0; + mat1(0,1) = 1.0; + mat1(0,2) = 2.0; + mat1(1,0) = 3.0; + mat1(1,1) = 4.0; + mat1(1,2) = 5.0; + + mat2(0,0) = -0.0; + mat2(0,1) = -1.0; + mat2(0,2) = -2.0; + mat2(1,0) = -3.0; + mat2(1,1) = -4.0; + mat2(1,2) = -5.0; + + TensorFixedSize<float, Sizes<2, 3>> mat3; + TensorFixedSize<float, Sizes<2, 3>, RowMajor> mat4; + mat3 = mat1.abs(); + mat4 = mat2.abs(); + + VERIFY_IS_EQUAL((mat3.size()), 2*3); + // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat3.dimension(1)), 3); + + VERIFY_IS_APPROX(mat3(0,0), 0.0f); + VERIFY_IS_APPROX(mat3(0,1), 1.0f); + VERIFY_IS_APPROX(mat3(0,2), 2.0f); + VERIFY_IS_APPROX(mat3(1,0), 3.0f); + VERIFY_IS_APPROX(mat3(1,1), 4.0f); + VERIFY_IS_APPROX(mat3(1,2), 5.0f); + + VERIFY_IS_APPROX(mat4(0,0), 0.0f); + VERIFY_IS_APPROX(mat4(0,1), 1.0f); + VERIFY_IS_APPROX(mat4(0,2), 2.0f); + VERIFY_IS_APPROX(mat4(1,0), 3.0f); + VERIFY_IS_APPROX(mat4(1,1), 4.0f); + VERIFY_IS_APPROX(mat4(1,2), 5.0f); +} + +static void test_3d() +{ + TensorFixedSize<float, Sizes<2, 3, 7> > mat1; + TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat2; + + VERIFY_IS_EQUAL((mat1.size()), 2*3*7); + VERIFY_IS_EQUAL(mat1.rank(), 3); + // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); + // VERIFY_IS_EQUAL((mat1.dimension(2)), 7); + + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val += 1.0; + } + } + } + + TensorFixedSize<float, Sizes<2, 3, 7> > mat3; + mat3 = mat1.sqrt(); + TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat4; + mat4 = mat2.sqrt(); + + VERIFY_IS_EQUAL((mat3.size()), 2*3*7); + // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat3.dimension(1)), 3); + // VERIFY_IS_EQUAL((mat3.dimension(2)), 7); + + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val)); + VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val)); + val += 1.0; + } + } + } +} + + +static void test_array() +{ + TensorFixedSize<float, Sizes<2, 3, 7> > mat1; + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + + TensorFixedSize<float, Sizes<2, 3, 7> > mat3; + mat3 = mat1.pow(3.5f); + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f)); + val += 1.0; + } + } + } +} + +void test_cxx11_tensor_fixed_size() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_array()); +} diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp new file mode 100644 index 000000000..ad9de867d --- /dev/null +++ b/unsupported/test/cxx11_tensor_forced_eval.cpp @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
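+
+// The idiom under test, sketched with the names used in test_const below:
+// .eval() materializes a subexpression into a temporary, which is what lets
+// the result of a reduction be reshaped and broadcast inside a larger
+// expression instead of participating as an unevaluated node.
+//
+//   Eigen::array<int, 1> depth_dim{{0}};
+//   auto col_max = input_tensor.maximum(depth_dim).eval();   // computed once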
+ +#include "main.h" + +#include <Eigen/Core> +#include <Eigen/CXX11/Tensor> + +using Eigen::MatrixXf; +using Eigen::Tensor; + +static void test_simple() +{ + MatrixXf m1(3,3); + MatrixXf m2(3,3); + m1.setRandom(); + m2.setRandom(); + + TensorMap<Tensor<float, 2>> mat1(m1.data(), 3,3); + TensorMap<Tensor<float, 2>> mat2(m2.data(), 3,3); + + Tensor<float, 2> mat3(3,3); + mat3 = mat1; + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + + mat3 = mat3.contract(mat2, dims).eval(); + + VERIFY_IS_APPROX(mat3(0, 0), (m1*m2).eval()(0,0)); + VERIFY_IS_APPROX(mat3(0, 1), (m1*m2).eval()(0,1)); + VERIFY_IS_APPROX(mat3(0, 2), (m1*m2).eval()(0,2)); + VERIFY_IS_APPROX(mat3(1, 0), (m1*m2).eval()(1,0)); + VERIFY_IS_APPROX(mat3(1, 1), (m1*m2).eval()(1,1)); + VERIFY_IS_APPROX(mat3(1, 2), (m1*m2).eval()(1,2)); + VERIFY_IS_APPROX(mat3(2, 0), (m1*m2).eval()(2,0)); + VERIFY_IS_APPROX(mat3(2, 1), (m1*m2).eval()(2,1)); + VERIFY_IS_APPROX(mat3(2, 2), (m1*m2).eval()(2,2)); +} + + +static void test_const() +{ + MatrixXf input(3,3); + input.setRandom(); + MatrixXf output = input; + output.rowwise() -= input.colwise().maxCoeff(); + + Eigen::array<int, 1> depth_dim; + depth_dim[0] = 0; + Tensor<float, 2>::Dimensions dims2d; + dims2d[0] = 1; + dims2d[1] = 3; + Eigen::array<int, 2> bcast; + bcast[0] = 3; + bcast[1] = 1; + const TensorMap<Tensor<const float, 2>> input_tensor(input.data(), 3, 3); + Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(output(i, j), output_tensor(i, j)); + } + } +} + + +void test_cxx11_tensor_forced_eval() +{ + CALL_SUBTEST(test_simple()); + CALL_SUBTEST(test_const()); +} diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp new file mode 100644 index 000000000..26854f5a4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -0,0 +1,476 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +static void test_simple_patch() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + + Tensor<float, 5> single_pixel_patch; + single_pixel_patch = tensor.extract_image_patches<1, 1>(); + + VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7); + + for (int i = 0; i < tensor.size(); ++i) { + if (tensor.data()[i] != single_pixel_patch.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); + } + + Tensor<float, 5> entire_image_patch; + entire_image_patch = tensor.extract_image_patches<3, 5>(); + + VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(4), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 3; ++r) { + for (int c = 0; c < 5; ++c) { + for (int d = 0; d < 2; ++d) { + for (int b = 0; b < 7; ++b) { + float expected = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected = tensor(d, r-1+i, c-2+j, b); + } + if (entire_image_patch(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + Tensor<float, 5> twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 7); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. + int row_padding = 0; + int col_padding = 0; + int stride = 1; + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 2; ++r) { + for (int c = 0; c < 2; ++c) { + for (int d = 0; d < 2; ++d) { + for (int b = 0; b < 7; ++b) { + float expected = 0.0f; + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) { + expected = tensor(d, row_offset, col_offset, b); + } + if (twod_patch(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies VALID padding (no padding) with incrementing values. +static void test_patch_padding_valid() +{ + int input_depth = 3; + int input_rows = 3; + int input_cols = 3; + int input_batches = 1; + int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. 
+ Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // No padding is carried out. + int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r + i - row_padding; + int col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies VALID padding (no padding) with the same value. +static void test_patch_padding_valid_same_value() +{ + int input_depth = 1; + int input_rows = 5; + int input_cols = 5; + int input_batches = 2; + int ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. + Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches); + tensor = tensor.constant(11.0f); + Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // No padding is carried out. 
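+  // (Every input value is 11.0f here, so this case checks that extraction is
+  // value-independent; (5-3)/2+1 = 2 positions per spatial dimension gives
+  // the 4 patches asserted above.)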
+ int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r + i - row_padding; + int col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies SAME padding. +static void test_patch_padding_same() +{ + int input_depth = 3; + int input_rows = 4; + int input_cols = 2; + int input_batches = 1; + int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. + Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // Based on the calculation described in TensorTraits.h, padding happens to be + // 0. 
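+  // (SAME padding pads only as much as needed for ceil(input/stride) output
+  // positions; with input_rows=4, input_cols=2, ksize=2 and stride=2 the
+  // strided windows already tile the input exactly, hence zero padding.)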
+ int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +static void test_patch_no_extra_dim() +{ + Tensor<float, 3> tensor(2,3,5); + tensor.setRandom(); + + Tensor<float, 4> single_pixel_patch; + single_pixel_patch = tensor.extract_image_patches<1, 1>(); + + VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); + + for (int i = 0; i < tensor.size(); ++i) { + if (tensor.data()[i] != single_pixel_patch.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); + } + + Tensor<float, 4> entire_image_patch; + entire_image_patch = tensor.extract_image_patches<3, 5>(); + + VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 3; ++r) { + for (int c = 0; c < 5; ++c) { + for (int d = 0; d < 2; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected = tensor(d, r-1+i, c-2+j); + } + if (entire_image_patch(d, r, c, patchId) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected); + } + } + } + } + } + + Tensor<float, 4> twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ int row_padding = 0; + int col_padding = 0; + int stride = 1; + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 2; ++r) { + for (int c = 0; c < 2; ++c) { + for (int d = 0; d < 2; ++d) { + float expected = 0.0f; + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) { + expected = tensor(d, row_offset, col_offset); + } + if (twod_patch(d, r, c, patchId) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected); + } + } + } + } + } +} + + +static void test_imagenet_patches() +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + Tensor<float, 4> l_in(3, 128, 128, 128); + l_in.setRandom(); + Tensor<float, 5> l_out = l_in.extract_image_patches(11, 11); + VERIFY_IS_EQUAL(l_out.dimension(0), 3); + VERIFY_IS_EQUAL(l_out.dimension(1), 11); + VERIFY_IS_EQUAL(l_out.dimension(2), 11); + VERIFY_IS_EQUAL(l_out.dimension(3), 128*128); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 128; ++i) { + for (int j = 0; j < 128; ++j) { + int patchId = i+128*j; + for (int c = 0; c < 11; ++c) { + for (int r = 0; r < 11; ++r) { + for (int d = 0; d < 3; ++d) { + float expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in(d, r-5+i, c-5+j, b); + } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(64, 64, 64, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(9, 9); + VERIFY_IS_EQUAL(l_out.dimension(0), 64); + VERIFY_IS_EQUAL(l_out.dimension(1), 9); + VERIFY_IS_EQUAL(l_out.dimension(2), 9); + VERIFY_IS_EQUAL(l_out.dimension(3), 64*64); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 64; ++j) { + int patchId = i+64*j; + for (int c = 0; c < 9; ++c) { + for (int r = 0; r < 9; ++r) { + for (int d = 0; d < 64; ++d) { + float expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in(d, r-4+i, c-4+j, b); + } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(128, 16, 16, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(7, 7); + VERIFY_IS_EQUAL(l_out.dimension(0), 128); + VERIFY_IS_EQUAL(l_out.dimension(1), 7); + VERIFY_IS_EQUAL(l_out.dimension(2), 7); + VERIFY_IS_EQUAL(l_out.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 16; ++j) { + int patchId = i+16*j; + for (int c = 0; c < 7; ++c) { + for (int r = 0; r < 7; ++r) { + for (int d = 0; d < 128; ++d) { + float expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in(d, r-3+i, c-3+j, b); + } + if 
(l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(384, 13, 13, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(3, 3); + VERIFY_IS_EQUAL(l_out.dimension(0), 384); + VERIFY_IS_EQUAL(l_out.dimension(1), 3); + VERIFY_IS_EQUAL(l_out.dimension(2), 3); + VERIFY_IS_EQUAL(l_out.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 13; ++i) { + for (int j = 0; j < 13; ++j) { + int patchId = i+13*j; + for (int c = 0; c < 3; ++c) { + for (int r = 0; r < 3; ++r) { + for (int d = 0; d < 384; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in(d, r-1+i, c-1+j, b); + } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +void test_cxx11_tensor_image_patch() +{ + CALL_SUBTEST(test_simple_patch()); + CALL_SUBTEST(test_patch_no_extra_dim()); + CALL_SUBTEST(test_patch_padding_valid()); + CALL_SUBTEST(test_patch_padding_valid_same_value()); + CALL_SUBTEST(test_patch_padding_same()); + CALL_SUBTEST(test_imagenet_patches()); +} diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp new file mode 100644 index 000000000..c4d4f244f --- /dev/null +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -0,0 +1,268 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
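+
+// The machinery under test, sketched: an IndexList can mix compile-time axes
+// (type2index<N>, fixed in the type) with run-time axes (plain int, assigned
+// through set()), and reductions accept either form. This needs constexpr
+// support, hence the guard below.
+//
+//   Eigen::IndexList<Eigen::type2index<0>, int> axes;
+//   axes.set(1, 2);                        // run-time slot: also reduce axis 2
+//   Eigen::Tensor<float, 2> r = t.sum(axes);   // t being some rank-4 tensor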
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +#ifdef EIGEN_HAS_CONSTEXPR + +static void test_static_index_list() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + + constexpr auto reduction_axis = make_index_list(0, 1, 2); + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2); + + EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_axis) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + + Tensor<float, 1> result = tensor.sum(reduction_axis); + for (int i = 0; i < result.size(); ++i) { + float expected = 0.0f; + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 5; ++l) { + expected += tensor(j,k,l,i); + } + } + } + VERIFY_IS_APPROX(result(i), expected); + } +} + + +static void test_type2index_list() +{ + Tensor<float, 5> tensor(2,3,5,7,11); + tensor.setRandom(); + tensor += tensor.constant(10.0f); + + typedef Eigen::IndexList<Eigen::type2index<0>> Dims0; + typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>> Dims1; + typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>> Dims2; + typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>> Dims3; + typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4; + +#if 0 + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims0>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims1>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims2>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims3>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims4>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + 
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const Dims0 reduction_axis0; + Tensor<float, 4> result0 = tensor.sum(reduction_axis0); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + float expected = 0.0f; + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + VERIFY_IS_APPROX(result0(j,k,l,m), expected); + } + } + } + } + + const Dims1 reduction_axis1; + Tensor<float, 3> result1 = tensor.sum(reduction_axis1); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + float expected = 0.0f; + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + VERIFY_IS_APPROX(result1(k,l,m), expected); + } + } + } + + const Dims2 reduction_axis2; + Tensor<float, 2> result2 = tensor.sum(reduction_axis2); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + float expected = 0.0f; + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + VERIFY_IS_APPROX(result2(l,m), expected); + } + } + + const Dims3 reduction_axis3; + Tensor<float, 1> result3 = tensor.sum(reduction_axis3); + for (int m = 0; m < 11; ++m) { + float expected = 0.0f; + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + } + VERIFY_IS_APPROX(result3(m), expected); + } + + const Dims4 reduction_axis4; + Tensor<float, 1> result4 = tensor.sum(reduction_axis4); + float expected = 0.0f; + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + } + } + VERIFY_IS_APPROX(result4(0), expected); +} + + +static void test_dynamic_index_list() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + + int dim1 = 2; + int dim2 = 1; + int dim3 = 0; + + auto reduction_axis = make_index_list(dim1, dim2, dim3); + + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0); + + Tensor<float, 1> result = tensor.sum(reduction_axis); + for (int i = 0; i < result.size(); ++i) { + float expected = 0.0f; + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 5; ++l) { + expected += tensor(j,k,l,i); + } + } + } + VERIFY_IS_APPROX(result(i), expected); + } +} + +static void test_mixed_index_list() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + + int dim2 = 1; + int dim4 = 3; + + auto reduction_axis = make_index_list(0, dim2, 2, dim4); + + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); + VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0); + 
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2); + VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3); + + typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices; + ReductionIndices reduction_indices; + reduction_indices.set(1, 1); + reduction_indices.set(3, 3); + EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>()(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>()(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#if 0 + EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionIndices>()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionIndices>()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + + typedef IndexList<type2index<0>, type2index<1>, type2index<2>, type2index<3>> ReductionList; + ReductionList reduction_list; + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#if 0 + EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionList>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionList>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + + Tensor<float, 1> result1 = tensor.sum(reduction_axis); + Tensor<float, 1> result2 = tensor.sum(reduction_indices); + Tensor<float, 1> result3 = tensor.sum(reduction_list); + + float expected = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + expected += tensor(i,j,k,l); + } + } + } + } + VERIFY_IS_APPROX(result1(0), expected); + VERIFY_IS_APPROX(result2(0), expected); + VERIFY_IS_APPROX(result3(0), expected); +} + +#endif + +void test_cxx11_tensor_index_list() +{ +#ifdef EIGEN_HAS_CONSTEXPR + CALL_SUBTEST(test_static_index_list()); + CALL_SUBTEST(test_type2index_list()); + CALL_SUBTEST(test_dynamic_index_list()); + CALL_SUBTEST(test_mixed_index_list()); +#endif +} diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp new file mode 100644 index 000000000..a510dc695 --- /dev/null +++ b/unsupported/test/cxx11_tensor_intdiv.cpp @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + + +static void test_signed_32bit() +{ + for (int32_t i = 1; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor<int32_t> div(i); + + for (int32_t j = 0; j < 25000; ++j) { + const int32_t fast_div = j / div; + const int32_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_unsigned_32bit() +{ + for (uint32_t i = 1; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor<uint32_t> div(i); + + for (uint32_t j = 0; j < 25000; ++j) { + const uint32_t fast_div = j / div; + const uint32_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_signed_64bit() +{ + for (int64_t i = 2; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor<int64_t> div(i); + + for (int64_t j = 0; j < 25000; ++j) { + const int64_t fast_div = j / div; + const int64_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +static void test_unsigned_64bit() +{ + for (uint64_t i = 2; i < 25000; ++i) { + const Eigen::internal::TensorIntDivisor<uint64_t> div(i); + + for (uint64_t j = 0; j < 25000; ++j) { + const uint64_t fast_div = j / div; + const uint64_t slow_div = j / i; + VERIFY_IS_EQUAL(fast_div, slow_div); + } + } +} + + +void test_cxx11_tensor_intdiv() +{ + CALL_SUBTEST(test_signed_32bit()); + CALL_SUBTEST(test_unsigned_32bit()); + CALL_SUBTEST(test_signed_64bit()); + CALL_SUBTEST(test_unsigned_64bit()); +} diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp new file mode 100644 index 000000000..8bbcf7089 --- /dev/null +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -0,0 +1,114 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
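+
+// What the checks below pin down (sketch): tensors print through the usual
+// stream operator, rank 1 as one coefficient per line and rank 2 as a
+// row-per-line grid, matching dense Matrix formatting.
+//
+//   Eigen::Tensor<int, 1> t(3);
+//   t(0) = 0; t(1) = 1; t(2) = 2;
+//   std::cout << t;   // prints 0\n1\n2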
+ +#include "main.h" +#include <sstream> +#include <string> +#include <Eigen/CXX11/Tensor> + + +template<int DataLayout> +static void test_output_1d() +{ + Tensor<int, 1, DataLayout> tensor(5); + for (int i = 0; i < 5; ++i) { + tensor(i) = i; + } + + std::stringstream os; + os << tensor; + + std::string expected("0\n1\n2\n3\n4"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> +static void test_output_2d() +{ + Tensor<int, 2, DataLayout> tensor(5, 3); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 3; ++j) { + tensor(i, j) = i*j; + } + } + + std::stringstream os; + os << tensor; + + std::string expected("0 0 0\n0 1 2\n0 2 4\n0 3 6\n0 4 8"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> +static void test_output_expr() +{ + Tensor<int, 1, DataLayout> tensor1(5); + Tensor<int, 1, DataLayout> tensor2(5); + for (int i = 0; i < 5; ++i) { + tensor1(i) = i; + tensor2(i) = 7; + } + + std::stringstream os; + os << tensor1 + tensor2; + + std::string expected(" 7\n 8\n 9\n10\n11"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> +static void test_output_string() +{ + Tensor<std::string, 2, DataLayout> tensor(5, 3); + tensor.setConstant(std::string("foo")); + + std::cout << tensor << std::endl; + + std::stringstream os; + os << tensor; + + std::string expected("foo foo foo\nfoo foo foo\nfoo foo foo\nfoo foo foo\nfoo foo foo"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> +static void test_output_const() +{ + Tensor<int, 1, DataLayout> tensor(5); + for (int i = 0; i < 5; ++i) { + tensor(i) = i; + } + + TensorMap<Tensor<const int, 1, DataLayout> > tensor_map(tensor.data(), 5); + + std::stringstream os; + os << tensor_map; + + std::string expected("0\n1\n2\n3\n4"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +void test_cxx11_tensor_io() +{ + CALL_SUBTEST(test_output_1d<ColMajor>()); + CALL_SUBTEST(test_output_1d<RowMajor>()); + CALL_SUBTEST(test_output_2d<ColMajor>()); + CALL_SUBTEST(test_output_2d<RowMajor>()); + CALL_SUBTEST(test_output_expr<ColMajor>()); + CALL_SUBTEST(test_output_expr<RowMajor>()); + CALL_SUBTEST(test_output_string<ColMajor>()); + CALL_SUBTEST(test_output_string<RowMajor>()); + CALL_SUBTEST(test_output_const<ColMajor>()); + CALL_SUBTEST(test_output_const<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp new file mode 100644 index 000000000..ae297a9da --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap.cpp @@ -0,0 +1,61 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +static void test_simple_swap() +{ + Tensor<float, 3, ColMajor> tensor(2,3,7); + tensor.setRandom(); + + Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i)); + } + } + } +} + + +static void test_swap_as_lvalue() +{ + Tensor<float, 3, ColMajor> tensor(2,3,7); + tensor.setRandom(); + + Tensor<float, 3, RowMajor> tensor2(7,3,2); + tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i)); + } + } + } +} + + +void test_cxx11_tensor_layout_swap() +{ + CALL_SUBTEST(test_simple_swap()); + CALL_SUBTEST(test_swap_as_lvalue()); +} diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp new file mode 100644 index 000000000..071f5b406 --- /dev/null +++ b/unsupported/test/cxx11_tensor_lvalue.cpp @@ -0,0 +1,42 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + + +static void test_compound_assignment() +{ + Tensor<float, 3> mat1(2,3,7); + Tensor<float, 3> mat2(2,3,7); + Tensor<float, 3> mat3(2,3,7); + + mat1.setRandom(); + mat2.setRandom(); + mat3 = mat1; + mat3 += mat2; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) + mat2(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_lvalue() +{ + CALL_SUBTEST(test_compound_assignment()); +} diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp new file mode 100644 index 000000000..9cf2eb150 --- /dev/null +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor<int, 1> vec1(6); + Tensor<int, 1, RowMajor> vec2(6); + + TensorMap<Tensor<const int, 1>> vec3(vec1.data(), 6); + TensorMap<Tensor<const int, 1, RowMajor>> vec4(vec2.data(), 6); + + vec1(0) = 4; vec2(0) = 0; + vec1(1) = 8; vec2(1) = 1; + vec1(2) = 15; vec2(2) = 2; + vec1(3) = 16; vec2(3) = 3; + vec1(4) = 23; vec2(4) = 4; + vec1(5) = 42; vec2(5) = 5; + + VERIFY_IS_EQUAL(vec1.rank(), 1); + VERIFY_IS_EQUAL(vec1.size(), 6); + VERIFY_IS_EQUAL(vec1.dimension(0), 6); + + VERIFY_IS_EQUAL(vec3(0), 4); + VERIFY_IS_EQUAL(vec3(1), 8); + VERIFY_IS_EQUAL(vec3(2), 15); + VERIFY_IS_EQUAL(vec3(3), 16); + VERIFY_IS_EQUAL(vec3(4), 23); + VERIFY_IS_EQUAL(vec3(5), 42); + + VERIFY_IS_EQUAL(vec4(0), 0); + VERIFY_IS_EQUAL(vec4(1), 1); + VERIFY_IS_EQUAL(vec4(2), 2); + VERIFY_IS_EQUAL(vec4(3), 3); + VERIFY_IS_EQUAL(vec4(4), 4); + VERIFY_IS_EQUAL(vec4(5), 5); +} + +static void test_2d() +{ + Tensor<int, 2> mat1(2,3); + Tensor<int, 2, RowMajor> mat2(2,3); + + mat1(0,0) = 0; + mat1(0,1) = 1; + mat1(0,2) = 2; + mat1(1,0) = 3; + mat1(1,1) = 4; + mat1(1,2) = 5; + + mat2(0,0) = 0; + mat2(0,1) = 1; + mat2(0,2) = 2; + mat2(1,0) = 3; + mat2(1,1) = 4; + mat2(1,2) = 5; + + TensorMap<Tensor<const int, 2>> mat3(mat1.data(), 2, 3); + TensorMap<Tensor<const int, 2, RowMajor>> mat4(mat2.data(), 2, 3); + + VERIFY_IS_EQUAL(mat3.rank(), 2); + VERIFY_IS_EQUAL(mat3.size(), 6); + VERIFY_IS_EQUAL(mat3.dimension(0), 2); + VERIFY_IS_EQUAL(mat3.dimension(1), 3); + + VERIFY_IS_EQUAL(mat4.rank(), 2); + VERIFY_IS_EQUAL(mat4.size(), 6); + VERIFY_IS_EQUAL(mat4.dimension(0), 2); + VERIFY_IS_EQUAL(mat4.dimension(1), 3); + + VERIFY_IS_EQUAL(mat3(0,0), 0); + VERIFY_IS_EQUAL(mat3(0,1), 1); + VERIFY_IS_EQUAL(mat3(0,2), 2); + VERIFY_IS_EQUAL(mat3(1,0), 3); + VERIFY_IS_EQUAL(mat3(1,1), 4); + VERIFY_IS_EQUAL(mat3(1,2), 5); + + VERIFY_IS_EQUAL(mat4(0,0), 0); + VERIFY_IS_EQUAL(mat4(0,1), 1); + VERIFY_IS_EQUAL(mat4(0,2), 2); + VERIFY_IS_EQUAL(mat4(1,0), 3); + VERIFY_IS_EQUAL(mat4(1,1), 4); + VERIFY_IS_EQUAL(mat4(1,2), 5); +} + +static void test_3d() +{ + Tensor<int, 3> mat1(2,3,7); + Tensor<int, 3, RowMajor> mat2(2,3,7); + + int val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val++; + } + } + } + + TensorMap<Tensor<const int, 3>> mat3(mat1.data(), 2, 3, 7); + TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), array<DenseIndex, 3>{{2, 3, 7}}); + + VERIFY_IS_EQUAL(mat3.rank(), 3); + VERIFY_IS_EQUAL(mat3.size(), 2*3*7); + VERIFY_IS_EQUAL(mat3.dimension(0), 2); + VERIFY_IS_EQUAL(mat3.dimension(1), 3); + VERIFY_IS_EQUAL(mat3.dimension(2), 7); + + VERIFY_IS_EQUAL(mat4.rank(), 3); + VERIFY_IS_EQUAL(mat4.size(), 2*3*7); + VERIFY_IS_EQUAL(mat4.dimension(0), 2); + VERIFY_IS_EQUAL(mat4.dimension(1), 3); + VERIFY_IS_EQUAL(mat4.dimension(2), 7); + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat3(i,j,k), val); + VERIFY_IS_EQUAL(mat4(i,j,k), val); + val++; + } + } + } +} + + +void test_cxx11_tensor_map() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp new file mode 100644 index 000000000..7fd7a283a --- /dev/null +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -0,0 +1,342 @@ +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +static void test_simple_reshape() +{ + Tensor<float, 5> tensor1(2,3,1,7,1); + tensor1.setRandom(); + + Tensor<float, 3> tensor2(2,3,7); + Tensor<float, 2> tensor3(6,7); + Tensor<float, 2> tensor4(2,21); + + Tensor<float, 3>::Dimensions dim1{{2,3,7}}; + tensor2 = tensor1.reshape(dim1); + Tensor<float, 2>::Dimensions dim2{{6,7}}; + tensor3 = tensor1.reshape(dim2); + Tensor<float, 2>::Dimensions dim3{{2,21}}; + tensor4 = tensor1.reshape(dim1).reshape(dim3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); + } + } + } +} + + +static void test_reshape_in_expr() { + MatrixXf m1(2,3*5*7*11); + MatrixXf m2(3*5*7*11,13); + m1.setRandom(); + m2.setRandom(); + MatrixXf m3 = m1 * m2; + + TensorMap<Tensor<float, 5>> tensor1(m1.data(), 2,3,5,7,11); + TensorMap<Tensor<float, 5>> tensor2(m2.data(), 3,5,7,11,13); + Tensor<float, 2>::Dimensions newDims1{{2,3*5*7*11}}; + Tensor<float, 2>::Dimensions newDims2{{3*5*7*11,13}}; + typedef Tensor<float, 1>::DimensionPair DimPair; + array<DimPair, 1> contract_along{{DimPair(1, 0)}}; + Tensor<float, 2> tensor3(2,13); + tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); + + Map<MatrixXf> res(tensor3.data(), 2, 13); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 13; ++j) { + VERIFY_IS_APPROX(res(i,j), m3(i,j)); + } + } +} + + +static void test_reshape_as_lvalue() +{ + Tensor<float, 3> tensor(2,3,7); + tensor.setRandom(); + + Tensor<float, 2> tensor2d(6,7); + Tensor<float, 3>::Dimensions dim{{2,3,7}}; + tensor2d.reshape(dim) = tensor; + + float scratch[2*3*1*7*1]; + TensorMap<Tensor<float, 5>> tensor5d(scratch, 2,3,1,7,1); + tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); + VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); + } + } + } +} + +template<int DataLayout> +static void test_simple_slice() +{ + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor<float, 5, DataLayout> slice1(1,1,1,1,1); + Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5); + Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1); + slice1 = tensor.slice(indices, sizes); + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + + Tensor<float, 5, DataLayout> slice2(1,1,2,2,3); + Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5); + Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3); + slice2 = tensor.slice(indices2, sizes2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } +} + +// TODO(andydavis) Add RowMajor support when TensorContract supports RowMajor. 
+static void test_slice_in_expr() { + MatrixXf m1(7,7); + MatrixXf m2(3,3); + m1.setRandom(); + m2.setRandom(); + + MatrixXf m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1); + + TensorMap<Tensor<float, 2>> tensor1(m1.data(), 7, 7); + TensorMap<Tensor<float, 2>> tensor2(m2.data(), 3, 3); + Tensor<float, 2> tensor3(3,1); + typedef Tensor<float, 1>::DimensionPair DimPair; + array<DimPair, 1> contract_along{{DimPair(1, 0)}}; + + Eigen::DSizes<ptrdiff_t, 2> indices1(1,2); + Eigen::DSizes<ptrdiff_t, 2> sizes1(3,3); + Eigen::DSizes<ptrdiff_t, 2> indices2(0,2); + Eigen::DSizes<ptrdiff_t, 2> sizes2(3,1); + tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along); + + Map<MatrixXf> res(tensor3.data(), 3, 1); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 1; ++j) { + VERIFY_IS_APPROX(res(i,j), m3(i,j)); + } + } + + // Take an arbitrary slice of an arbitrarily sized tensor. + TensorMap<Tensor<const float, 2>> tensor4(m1.data(), 7, 7); + Tensor<float, 1> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35)); + for (int i = 0; i < 35; ++i) { + VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i])); + } +} + +template<int DataLayout> +static void test_slice_as_lvalue() +{ + Tensor<float, 3, DataLayout> tensor1(2,2,7); + tensor1.setRandom(); + Tensor<float, 3, DataLayout> tensor2(2,2,7); + tensor2.setRandom(); + Tensor<float, 3, DataLayout> tensor3(4,3,5); + tensor3.setRandom(); + Tensor<float, 3, DataLayout> tensor4(4,3,2); + tensor4.setRandom(); + Tensor<float, 3, DataLayout> tensor5(10,13,12); + tensor5.setRandom(); + + Tensor<float, 3, DataLayout> result(4,5,7); + Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7); + Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0); + result.slice(first_slice, sizes12) = tensor1; + Eigen::DSizes<ptrdiff_t, 3> second_slice(2,0,0); + result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2; + + Eigen::DSizes<ptrdiff_t, 3> sizes3(4,3,5); + Eigen::DSizes<ptrdiff_t, 3> third_slice(0,2,0); + result.slice(third_slice, sizes3) = tensor3; + + Eigen::DSizes<ptrdiff_t, 3> sizes4(4,3,2); + Eigen::DSizes<ptrdiff_t, 3> fourth_slice(0,2,5); + result.slice(fourth_slice, sizes4) = tensor4; + + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 7; ++k) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(result(i,j,k), tensor1(i,j,k)); + VERIFY_IS_EQUAL(result(i+2,j,k), tensor2(i,j,k)); + } + } + } + for (int i = 0; i < 4; ++i) { + for (int j = 2; j < 5; ++j) { + for (int k = 0; k < 5; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor3(i,j-2,k)); + } + for (int k = 5; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor4(i,j-2,k-5)); + } + } + } + + Eigen::DSizes<ptrdiff_t, 3> sizes5(4,5,7); + Eigen::DSizes<ptrdiff_t, 3> fifth_slice(0,0,0); + result.slice(fifth_slice, sizes5) = tensor5.slice(fifth_slice, sizes5); + for (int i = 0; i < 4; ++i) { + for (int j = 2; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor5(i,j,k)); + } + } + } +} + +template<int DataLayout> +static void test_slice_raw_data() +{ + Tensor<float, 4, DataLayout> tensor(3,5,7,11); + tensor.setRandom(); + + Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4); + Eigen::DSizes<ptrdiff_t, 4> extents(1,1,1,1); + typedef TensorEvaluator<decltype(tensor.slice(offsets, extents)), DefaultDevice> SliceEvaluator; + auto slice1 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1ul); + 
VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4)); + + if (DataLayout == ColMajor) { + extents = Eigen::DSizes<ptrdiff_t, 4>(2,1,1,1); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4)); + } else { + extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,1,2); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5)); + } + + extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1); + auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0)); + + if (DataLayout == ColMajor) { + offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4); + extents = Eigen::DSizes<ptrdiff_t, 4>(3,2,1,1); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4)); + } + } + } else { + offsets = Eigen::DSizes<ptrdiff_t, 4>(1,2,3,0); + extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,2,11); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22ul); + for (int l = 0; l < 11; ++l) { + for (int k = 0; k < 2; ++k) { + VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l)); + } + } + } + + if (DataLayout == ColMajor) { + offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,4); + extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,2); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 2; ++l) { + int slice_index = i + 3 * (j + 5 * (k + 7 * l)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4)); + } + } + } + } + } else { + offsets = Eigen::DSizes<ptrdiff_t, 4>(1,0,0,0); + extents = Eigen::DSizes<ptrdiff_t, 4>(2,5,7,11); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770ul); + for (int l = 0; l < 11; ++l) { + for (int k = 0; k < 7; ++k) { + for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 2; ++i) { + int slice_index = l + 11 * (k + 7 * (j + 5 * i)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i+1,j,k,l)); + } + } + } + } + + } + + offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,0); + extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,11); + auto slice6 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3ul*5*7*11); + VERIFY_IS_EQUAL(slice6.data(), tensor.data()); +} + + +static void test_composition() +{ + Eigen::Tensor<float, 2> matrix(7, 11); + matrix.setRandom(); + + const DSizes<ptrdiff_t, 3> newDims{{1, 1, 11}}; + Eigen::Tensor<float, 3> tensor = + matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims); + + VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11ul); + VERIFY_IS_EQUAL(tensor.dimension(0), 1); + VERIFY_IS_EQUAL(tensor.dimension(1), 1); + VERIFY_IS_EQUAL(tensor.dimension(2), 
11); + for (int i = 0; i < 11; ++i) { + VERIFY_IS_EQUAL(tensor(0,0,i), matrix(2,i)); + } +} + + +void test_cxx11_tensor_morphing() +{ + CALL_SUBTEST(test_simple_reshape()); + CALL_SUBTEST(test_reshape_in_expr()); + CALL_SUBTEST(test_reshape_as_lvalue()); + + CALL_SUBTEST(test_simple_slice<ColMajor>()); + CALL_SUBTEST(test_simple_slice<RowMajor>()); + CALL_SUBTEST(test_slice_in_expr()); + CALL_SUBTEST(test_slice_as_lvalue<ColMajor>()); + CALL_SUBTEST(test_slice_as_lvalue<RowMajor>()); + CALL_SUBTEST(test_slice_raw_data<ColMajor>()); + CALL_SUBTEST(test_slice_raw_data<RowMajor>()); + + CALL_SUBTEST(test_composition()); +} diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp new file mode 100644 index 000000000..24b2bcb58 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::TensorMap; + + + +static void test_additions() +{ + Tensor<std::complex<float>, 1> data1(3); + Tensor<std::complex<float>, 1> data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = std::complex<float>(i, -i); + data2(i) = std::complex<float>(i, 7 * i); + } + + Tensor<std::complex<float>, 1> sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + VERIFY_IS_EQUAL(sum(i), std::complex<float>(2*i, 6*i)); + } +} + + +static void test_abs() +{ + Tensor<std::complex<float>, 1> data1(3); + Tensor<std::complex<double>, 1> data2(3); + data1.setRandom(); + data2.setRandom(); + + Tensor<float, 1> abs1 = data1.abs(); + Tensor<double, 1> abs2 = data2.abs(); + for (int i = 0; i < 3; ++i) { + VERIFY_IS_APPROX(abs1(i), std::abs(data1(i))); + VERIFY_IS_APPROX(abs2(i), std::abs(data2(i))); + } +} + + +static void test_contractions() +{ + Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31); + Tensor<std::complex<float>, 5> t_right(8, 31, 7, 20, 10); + Tensor<std::complex<float>, 5> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map<Matrix<std::complex<float>, Dynamic, Dynamic>> MapXcf; + MapXcf m_left(t_left.data(), 1500, 248); + MapXcf m_right(t_right.data(), 248, 1400); + Matrix<std::complex<float>, Dynamic, Dynamic> m_result(1500, 1400); + + // This contraction should be equivalent to a regular matrix multiplication + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + +void test_cxx11_tensor_of_complex() +{ + CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_abs()); + CALL_SUBTEST(test_contractions()); +} diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp new file mode 100644 index 000000000..f179a0c21 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_const_values.cpp @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_assign() +{ + float data1[6]; + TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3); + float data2[6]; + const TensorMap<Tensor<float, 2>> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + + Tensor<float, 2> rslt1; + rslt1 = mat1; + Tensor<float, 2> rslt2; + rslt2 = mat2; + + Tensor<float, 2> rslt3 = mat1; + Tensor<float, 2> rslt4 = mat2; + + Tensor<float, 2> rslt5(mat1); + Tensor<float, 2> rslt6(mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(rslt1(i,j), static_cast<float>(i + 2*j)); + VERIFY_IS_APPROX(rslt2(i,j), static_cast<float>(-i - 2*j)); + VERIFY_IS_APPROX(rslt3(i,j), static_cast<float>(i + 2*j)); + VERIFY_IS_APPROX(rslt4(i,j), static_cast<float>(-i - 2*j)); + VERIFY_IS_APPROX(rslt5(i,j), static_cast<float>(i + 2*j)); + VERIFY_IS_APPROX(rslt6(i,j), static_cast<float>(-i - 2*j)); + } + } +} + + +static void test_plus() +{ + float data1[6]; + TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3); + float data2[6]; + TensorMap<Tensor<float, 2>> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + + Tensor<float, 2> sum1; + sum1 = mat1 + mat2; + Tensor<float, 2> sum2; + sum2 = mat2 + mat1; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(sum1(i,j), 0.0f); + VERIFY_IS_APPROX(sum2(i,j), 0.0f); + } + } +} + + +static void test_plus_equal() +{ + float data1[6]; + TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3); + float data2[6]; + TensorMap<Tensor<float, 2>> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + mat2 += mat1; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(mat2(i,j), 0.0f); + } + } +} + + +void test_cxx11_tensor_of_const_values() +{ + CALL_SUBTEST(test_assign()); + CALL_SUBTEST(test_plus()); + CALL_SUBTEST(test_plus_equal()); +} diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp new file mode 100644 index 000000000..8d05d154e --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_strings.cpp @@ -0,0 +1,152 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
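+
+// Tensor is not limited to arithmetic scalars: the tests below
+// instantiate Tensor<std::string, ...>, for which operator+ performs
+// per-coefficient string concatenation. A minimal sketch of the
+// behavior test_additions verifies:
+//
+//   Tensor<std::string, 1> a(3), b(3);
+//   a.setConstant(std::string("abc"));
+//   b.setConstant(std::string("0"));
+//   Tensor<std::string, 1> c = a + b;  // every entry is "abc0"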
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::TensorMap; + +static void test_assign() +{ + std::string data1[6]; + TensorMap<Tensor<std::string, 2>> mat1(data1, 2, 3); + std::string data2[6]; + const TensorMap<Tensor<const std::string, 2>> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + std::ostringstream s1; + s1 << "abc" << i*3; + data1[i] = s1.str(); + std::ostringstream s2; + s2 << "def" << i*5; + data2[i] = s2.str(); + } + + Tensor<std::string, 2> rslt1; + rslt1 = mat1; + Tensor<std::string, 2> rslt2; + rslt2 = mat2; + + Tensor<std::string, 2> rslt3 = mat1; + Tensor<std::string, 2> rslt4 = mat2; + + Tensor<std::string, 2> rslt5(mat1); + Tensor<std::string, 2> rslt6(mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(rslt1(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt2(i,j), data2[i+2*j]); + VERIFY_IS_EQUAL(rslt3(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt4(i,j), data2[i+2*j]); + VERIFY_IS_EQUAL(rslt5(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt6(i,j), data2[i+2*j]); + } + } +} + + +static void test_concat() +{ + Tensor<std::string, 2> t1(2, 3); + Tensor<std::string, 2> t2(2, 3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + std::ostringstream s1; + s1 << "abc" << i + j*2; + t1(i, j) = s1.str(); + std::ostringstream s2; + s2 << "def" << i*5 + j*32; + t2(i, j) = s2.str(); + } + } + + Tensor<std::string, 2> result = t1.concatenate(t2, 1); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 6); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(result(i, j), t1(i, j)); + VERIFY_IS_EQUAL(result(i, j+3), t2(i, j)); + } + } +} + + +static void test_slices() +{ + Tensor<std::string, 2> data(2, 6); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + std::ostringstream s1; + s1 << "abc" << i + j*2; + data(i, j) = s1.str(); + } + } + + const Eigen::DSizes<ptrdiff_t, 2> half_size{{2, 3}}; + const Eigen::DSizes<ptrdiff_t, 2> first_half{{0, 0}}; + const Eigen::DSizes<ptrdiff_t, 2> second_half{{0, 3}}; + + Tensor<std::string, 2> t1 = data.slice(first_half, half_size); + Tensor<std::string, 2> t2 = data.slice(second_half, half_size); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(data(i, j), t1(i, j)); + VERIFY_IS_EQUAL(data(i, j+3), t2(i, j)); + } + } +} + + +static void test_additions() +{ + Tensor<std::string, 1> data1(3); + Tensor<std::string, 1> data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = "abc"; + std::ostringstream s1; + s1 << i; + data2(i) = s1.str(); + } + + Tensor<std::string, 1> sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + std::ostringstream concat; + concat << "abc" << i; + std::string expected = concat.str(); + VERIFY_IS_EQUAL(sum(i), expected); + } +} + + +static void test_initialization() +{ + Tensor<std::string, 2> a(2, 3); + a.setConstant(std::string("foo")); + for (int i = 0; i < 2*3; ++i) { + VERIFY_IS_EQUAL(a(i), std::string("foo")); + } +} + + +void test_cxx11_tensor_of_strings() +{ + // Beware: none of this is likely to ever work on a GPU. 
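+  // (std::string manages dynamically allocated host memory, which has
+  // no counterpart in GPU device code.)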
+ CALL_SUBTEST(test_assign()); + CALL_SUBTEST(test_concat()); + CALL_SUBTEST(test_slices()); + CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_initialization()); +} diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp new file mode 100644 index 000000000..ffa19896e --- /dev/null +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -0,0 +1,93 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<int DataLayout> +static void test_simple_padding() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + Tensor<float, 4, DataLayout> padded; + padded = tensor.pad(paddings); + + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); + } + } + } + } + } +} + +template<int DataLayout> +static void test_padded_expr() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + Eigen::DSizes<ptrdiff_t, 2> reshape_dims; + reshape_dims[0] = 12; + reshape_dims[1] = 84; + + Tensor<float, 2, DataLayout> result; + result = tensor.pad(paddings).reshape(reshape_dims); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + const float result_value = DataLayout == ColMajor ? + result(i+2*j,k+12*l) : result(j+6*i,l+7*k); + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(result_value, 0.0f); + } + } + } + } + } +} + +void test_cxx11_tensor_padding() +{ + CALL_SUBTEST(test_simple_padding<ColMajor>()); + CALL_SUBTEST(test_simple_padding<RowMajor>()); + CALL_SUBTEST(test_padded_expr<ColMajor>()); + CALL_SUBTEST(test_padded_expr<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp new file mode 100644 index 000000000..0ee7b46d4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch.cpp @@ -0,0 +1,120 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
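+
+// extract_patches(patch_dims) gathers every patch of the requested
+// size that fits inside the tensor: the patch extents become the
+// leading dimensions of the result, and one trailing dimension indexes
+// the patch positions. Sketch matching the 2x2 case checked below:
+//
+//   Tensor<float, 4> t(2,3,5,7);
+//   array<ptrdiff_t, 4> dims{{1,2,2,1}};
+//   Tensor<float, 5> p = t.extract_patches(dims);
+//   // p.dimension(4) == 2*2*4*7, i.e. (2-1+1)*(3-2+1)*(5-2+1)*(7-1+1)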
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +static void test_simple_patch() +{ + Tensor<float, 4> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + Tensor<float, 5> no_patch; + no_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + Tensor<float, 5> single_patch; + single_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + Tensor<float, 5> twod_patch; + twod_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + Tensor<float, 5> threed_patch; + threed_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } + } + } + } + } + } + } +} + + +void test_cxx11_tensor_patch() +{ + CALL_SUBTEST(test_simple_patch()); + // CALL_SUBTEST(test_expr_shuffling()); +} diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp new file mode 100644 index 000000000..8276ae822 --- /dev/null +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
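+
+// setRandom() takes the generator as a template argument. Besides the
+// built-in uniform and normal generators, any copy-constructible
+// functor that maps a coefficient location to a value can be used;
+// MyGenerator below is a minimal instance of that contract (its
+// packetOp() only matters on the vectorized code path). Usage:
+//
+//   Tensor<int, 1> v(6);
+//   v.setRandom<MyGenerator>();  // v(i) == 3*i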
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +static void test_default() +{ + Tensor<float, 1> vec(6); + vec.setRandom(); + + // Fixme: we should check that the generated numbers follow a uniform + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + +static void test_normal() +{ + Tensor<float, 1> vec(6); + vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>(); + + // Fixme: we should check that the generated numbers follow a gaussian + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + + +struct MyGenerator { + MyGenerator() { } + MyGenerator(const MyGenerator&) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + int operator()(Eigen::DenseIndex element_location, Eigen::DenseIndex /*unused*/ = 0) const { + return 3 * element_location; + } + + // Same as above but generates several numbers at a time. + typename internal::packet_traits<int>::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + const int packetSize = internal::packet_traits<int>::size; + EIGEN_ALIGN_DEFAULT int values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = 3 * (packet_location + i); + } + return internal::pload<typename internal::packet_traits<int>::type>(values); + } +}; + + +static void test_custom() +{ + Tensor<int, 1> vec(6); + vec.setRandom<MyGenerator>(); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(vec(i), 3*i); + } +} + +void test_cxx11_tensor_random() +{ + CALL_SUBTEST(test_default()); + CALL_SUBTEST(test_normal()); + CALL_SUBTEST(test_custom()); +} diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp new file mode 100644 index 000000000..0269853a9 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -0,0 +1,420 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
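+
+// A reduction collapses the listed axes and keeps the remaining ones
+// (the axis-free overloads reduce everything, yielding a rank-1 tensor
+// with a single coefficient in this API). Sketch of the sum() case
+// verified first below:
+//
+//   Tensor<float, 4> t(2,3,5,7);
+//   array<ptrdiff_t, 2> axes;
+//   axes[0] = 1; axes[1] = 3;
+//   Tensor<float, 2> s = t.sum(axes);  // s is 2x5
+//   // s(i,j) accumulates t(i,k,j,l) over all k,l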
+ +#include "main.h" +#include <limits> +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <int DataLayout> +static void test_simple_reductions() { + Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7); + tensor.setRandom(); + array<ptrdiff_t, 2> reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + float sum = 0.0f; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor(i, k, j, l); + } + } + VERIFY_IS_APPROX(result(i, j), sum); + } + } + + { + Tensor<float, 1, DataLayout> sum1 = tensor.sum(); + VERIFY_IS_EQUAL(sum1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> sum2 = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(sum2.dimension(0), 1); + + VERIFY_IS_APPROX(sum1(0), sum2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 2; + result = tensor.prod(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 3); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + float prod = 1.0f; + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 5; ++l) { + prod *= tensor(k, i, l, j); + } + } + VERIFY_IS_APPROX(result(i, j), prod); + } + } + + { + Tensor<float, 1, DataLayout> prod1 = tensor.prod(); + VERIFY_IS_EQUAL(prod1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> prod2 = tensor.prod(reduction_axis); + VERIFY_IS_EQUAL(prod2.dimension(0), 1); + + VERIFY_IS_APPROX(prod1(0), prod2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 2; + result = tensor.maximum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 3); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + float max_val = std::numeric_limits<float>::lowest(); + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 5; ++l) { + max_val = (std::max)(max_val, tensor(k, i, l, j)); + } + } + VERIFY_IS_APPROX(result(i, j), max_val); + } + } + + { + Tensor<float, 1, DataLayout> max1 = tensor.maximum(); + VERIFY_IS_EQUAL(max1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> max2 = tensor.maximum(reduction_axis); + VERIFY_IS_EQUAL(max2.dimension(0), 1); + + VERIFY_IS_APPROX(max1(0), max2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 1; + result = tensor.minimum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + float min_val = (std::numeric_limits<float>::max)(); + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 3; ++l) { + min_val = (std::min)(min_val, tensor(k, l, i, j)); + } + } + VERIFY_IS_APPROX(result(i, j), min_val); + } + } + + { + Tensor<float, 1, DataLayout> min1 = tensor.minimum(); + VERIFY_IS_EQUAL(min1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> min2 = 
tensor.minimum(reduction_axis); + VERIFY_IS_EQUAL(min2.dimension(0), 1); + + VERIFY_IS_APPROX(min1(0), min2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 1; + result = tensor.mean(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + float sum = 0.0f; + int count = 0; + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 3; ++l) { + sum += tensor(k, l, i, j); + ++count; + } + } + VERIFY_IS_APPROX(result(i, j), sum / count); + } + } + + { + Tensor<float, 1, DataLayout> mean1 = tensor.mean(); + VERIFY_IS_EQUAL(mean1.dimension(0), 1); + + array<ptrdiff_t, 4> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor<float, 1, DataLayout> mean2 = tensor.mean(reduction_axis); + VERIFY_IS_EQUAL(mean2.dimension(0), 1); + + VERIFY_IS_APPROX(mean1(0), mean2(0)); + } +} + +template <int DataLayout> +static void test_full_reductions() { + Tensor<float, 2, DataLayout> tensor(2, 3); + tensor.setRandom(); + array<ptrdiff_t, 2> reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + + Tensor<float, 1, DataLayout> result = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 1); + + float sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + sum += tensor(i, j); + } + } + VERIFY_IS_APPROX(result(0), sum); + + result = tensor.square().sum(reduction_axis).sqrt(); + VERIFY_IS_EQUAL(result.dimension(0), 1); + + sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + sum += tensor(i, j) * tensor(i, j); + } + } + VERIFY_IS_APPROX(result(0), sqrtf(sum)); +} + +struct UserReducer { + static const bool PacketAccess = false; + UserReducer(float offset) : offset_(offset) {} + void reduce(const float val, float* accum) { *accum += val * val; } + float initialize() const { return 0; } + float finalize(const float accum) const { return 1.0f / (accum + offset_); } + + private: + const float offset_; +}; + +template <int DataLayout> +static void test_user_defined_reductions() { + Tensor<float, 2, DataLayout> tensor(5, 7); + tensor.setRandom(); + array<ptrdiff_t, 1> reduction_axis; + reduction_axis[0] = 1; + + UserReducer reducer(10.0f); + Tensor<float, 1, DataLayout> result = tensor.reduce(reduction_axis, reducer); + VERIFY_IS_EQUAL(result.dimension(0), 5); + for (int i = 0; i < 5; ++i) { + float expected = 10.0f; + for (int j = 0; j < 7; ++j) { + expected += tensor(i, j) * tensor(i, j); + } + expected = 1.0f / expected; + VERIFY_IS_APPROX(result(i), expected); + } +} + +template <int DataLayout> +static void test_tensor_maps() { + int inputs[2 * 3 * 5 * 7]; + TensorMap<Tensor<int, 4, DataLayout> > tensor_map(inputs, 2, 3, 5, 7); + TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const(inputs, 2, 3, 5, + 7); + const TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const_const( + inputs, 2, 3, 5, 7); + + tensor_map.setRandom(); + array<ptrdiff_t, 2> reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + Tensor<int, 2, DataLayout> result = tensor_map.sum(reduction_axis); + Tensor<int, 2, DataLayout> result2 = tensor_map_const.sum(reduction_axis); + Tensor<int, 2, DataLayout> result3 = + tensor_map_const_const.sum(reduction_axis); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + int sum = 0; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor_map(i, k, j, l); + } + } + 
VERIFY_IS_EQUAL(result(i, j), sum);
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_static_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 97);
+  in.setRandom();
+
+#ifndef EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+#else
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<3> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, k, j, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_last_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(97, 113);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#ifndef EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+#else
+  // This triggers the use of packets for ColMajor.
+  Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 97; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 72; ++l) {
+          expected = (std::max)(expected, in(l, k, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_innermost_first_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 53);
+  in.setRandom();
+
+// Reduce on the innermost dimensions.
+#ifndef EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 2;
+  reduction_axis[1] = 3;
+#else
+  // This triggers the use of packets for RowMajor.
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 97; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, j, k, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_reduce_middle_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 113);
+  in.setRandom();
+
+// Reduce on the two middle dimensions.
+#ifndef EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 2;
+#else
+  // This triggers the use of packets for RowMajor.
+ Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2>> reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 113; ++j) { + float expected = -1e10f; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 97; ++l) { + expected = (std::max)(expected, in(i, k, l, j)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + +void test_cxx11_tensor_reduction() { + CALL_SUBTEST(test_simple_reductions<ColMajor>()); + CALL_SUBTEST(test_simple_reductions<RowMajor>()); + CALL_SUBTEST(test_full_reductions<ColMajor>()); + CALL_SUBTEST(test_full_reductions<RowMajor>()); + CALL_SUBTEST(test_user_defined_reductions<ColMajor>()); + CALL_SUBTEST(test_user_defined_reductions<RowMajor>()); + CALL_SUBTEST(test_tensor_maps<ColMajor>()); + CALL_SUBTEST(test_tensor_maps<RowMajor>()); + CALL_SUBTEST(test_static_dims<ColMajor>()); + CALL_SUBTEST(test_static_dims<RowMajor>()); + CALL_SUBTEST(test_innermost_last_dims<ColMajor>()); + CALL_SUBTEST(test_innermost_last_dims<RowMajor>()); + CALL_SUBTEST(test_innermost_first_dims<ColMajor>()); + CALL_SUBTEST(test_innermost_first_dims<RowMajor>()); + CALL_SUBTEST(test_reduce_middle_dims<ColMajor>()); + CALL_SUBTEST(test_reduce_middle_dims<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp new file mode 100644 index 000000000..aa369f278 --- /dev/null +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -0,0 +1,208 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
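+
+// TensorRef binds either to a concrete tensor, aliasing its storage,
+// or to an expression, exposing the expression's coefficients. Both
+// behaviors are exercised below; a minimal sketch:
+//
+//   Tensor<int, 1> a(6), b(6);
+//   TensorRef<Tensor<int, 1>> r1(a);      // r1.data() == a.data()
+//   TensorRef<Tensor<int, 1>> r2(a + b);  // r2(i) == a(i) + b(i)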
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_simple_lvalue_ref() +{ + Tensor<int, 1> input(6); + input.setRandom(); + + TensorRef<Tensor<int, 1>> ref3(input); + TensorRef<Tensor<int, 1>> ref4 = input; + + VERIFY_IS_EQUAL(ref3.data(), input.data()); + VERIFY_IS_EQUAL(ref4.data(), input.data()); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(ref3(i), input(i)); + VERIFY_IS_EQUAL(ref4(i), input(i)); + } + + for (int i = 0; i < 6; ++i) { + ref3.coeffRef(i) = i; + } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(input(i), i); + } + for (int i = 0; i < 6; ++i) { + ref4.coeffRef(i) = -i * 2; + } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(input(i), -i*2); + } +} + + +static void test_simple_rvalue_ref() +{ + Tensor<int, 1> input1(6); + input1.setRandom(); + Tensor<int, 1> input2(6); + input2.setRandom(); + + TensorRef<Tensor<int, 1>> ref3(input1 + input2); + TensorRef<Tensor<int, 1>> ref4 = input1 + input2; + + VERIFY_IS_NOT_EQUAL(ref3.data(), input1.data()); + VERIFY_IS_NOT_EQUAL(ref4.data(), input1.data()); + VERIFY_IS_NOT_EQUAL(ref3.data(), input2.data()); + VERIFY_IS_NOT_EQUAL(ref4.data(), input2.data()); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(ref3(i), input1(i) + input2(i)); + VERIFY_IS_EQUAL(ref4(i), input1(i) + input2(i)); + } +} + + +static void test_multiple_dims() +{ + Tensor<float, 3> input(3,5,7); + input.setRandom(); + + TensorRef<Tensor<float, 3>> ref(input); + VERIFY_IS_EQUAL(ref.data(), input.data()); + VERIFY_IS_EQUAL(ref.dimension(0), 3); + VERIFY_IS_EQUAL(ref.dimension(1), 5); + VERIFY_IS_EQUAL(ref.dimension(2), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(ref(i,j,k), input(i,j,k)); + } + } + } +} + + +static void test_slice() +{ + Tensor<float, 5> tensor(2,3,5,7,11); + tensor.setRandom(); + + Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5); + Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1); + TensorRef<Tensor<float, 5>> slice = tensor.slice(indices, sizes); + VERIFY_IS_EQUAL(slice(0,0,0,0,0), tensor(1,2,3,4,5)); + + Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5); + Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3); + slice = tensor.slice(indices2, sizes2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + + Eigen::DSizes<ptrdiff_t, 5> indices3(0,0,0,0,0); + Eigen::DSizes<ptrdiff_t, 5> sizes3(2,3,1,1,1); + slice = tensor.slice(indices3, sizes3); + VERIFY_IS_EQUAL(slice.data(), tensor.data()); +} + + +static void test_ref_of_ref() +{ + Tensor<float, 3> input(3,5,7); + input.setRandom(); + + TensorRef<Tensor<float, 3>> ref(input); + TensorRef<Tensor<float, 3>> ref_of_ref(ref); + TensorRef<Tensor<float, 3>> ref_of_ref2; + ref_of_ref2 = ref; + + VERIFY_IS_EQUAL(ref_of_ref.data(), input.data()); + VERIFY_IS_EQUAL(ref_of_ref.dimension(0), 3); + VERIFY_IS_EQUAL(ref_of_ref.dimension(1), 5); + VERIFY_IS_EQUAL(ref_of_ref.dimension(2), 7); + + VERIFY_IS_EQUAL(ref_of_ref2.data(), input.data()); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(0), 3); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(1), 5); + VERIFY_IS_EQUAL(ref_of_ref2.dimension(2), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(ref_of_ref(i,j,k), input(i,j,k)); + VERIFY_IS_EQUAL(ref_of_ref2(i,j,k), input(i,j,k)); + } + } + } +} + + +static void test_ref_in_expr() +{ + 
Tensor<float, 3> input(3,5,7); + input.setRandom(); + TensorRef<Tensor<float, 3>> input_ref(input); + + Tensor<float, 3> result(3,5,7); + result.setRandom(); + TensorRef<Tensor<float, 3>> result_ref(result); + + Tensor<float, 3> bias(3,5,7); + bias.setRandom(); + + result_ref = input_ref + bias; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result_ref(i,j,k), input(i,j,k) + bias(i,j,k)); + VERIFY_IS_NOT_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k)); + } + } + } + + result = result_ref; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k)); + } + } + } +} + + +static void test_coeff_ref() +{ + Tensor<float, 5> tensor(2,3,5,7,11); + tensor.setRandom(); + Tensor<float, 5> original = tensor; + + TensorRef<Tensor<float, 4>> slice = tensor.chip(7, 4); + slice.coeffRef(0, 0, 0, 0) = 1.0f; + slice.coeffRef(1, 0, 0, 0) += 2.0f; + + VERIFY_IS_EQUAL(tensor(0,0,0,0,7), 1.0f); + VERIFY_IS_EQUAL(tensor(1,0,0,0,7), original(1,0,0,0,7) + 2.0f); +} + + +void test_cxx11_tensor_ref() +{ + CALL_SUBTEST(test_simple_lvalue_ref()); + CALL_SUBTEST(test_simple_rvalue_ref()); + CALL_SUBTEST(test_multiple_dims()); + CALL_SUBTEST(test_slice()); + CALL_SUBTEST(test_ref_of_ref()); + CALL_SUBTEST(test_ref_in_expr()); + CALL_SUBTEST(test_coeff_ref()); +} diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp new file mode 100644 index 000000000..4c0be35da --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -0,0 +1,167 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com and +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
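+
+// reverse() takes one boolean per dimension and flips the indexing
+// along each dimension flagged true; the sizes are unchanged. Sketch
+// matching the first case below:
+//
+//   Tensor<float, 4> t(2,3,5,7);
+//   array<bool, 4> rev{{false, true, true, false}};
+//   Tensor<float, 4> r = t.reverse(rev);
+//   // t(i,j,k,l) == r(i, 2-j, 4-k, l)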
+ +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::array; + +template <int DataLayout> +static void test_simple_reverse() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + Tensor<float, 4, DataLayout> reversed_tensor; + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + } + } + } + } +} + + +template <int DataLayout> +static void test_expr_reverse() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + + Tensor<float, 4, DataLayout> expected; + expected = tensor.reverse(dim_rev); + + Tensor<float, 4, DataLayout> result(2,3,5,7); + + array<ptrdiff_t, 4> src_slice_dim{{2,3,1,7}}; + array<ptrdiff_t, 4> src_slice_start{{0,0,0,0}}; + array<ptrdiff_t, 4> dst_slice_dim{{2,3,1,7}}; + array<ptrdiff_t, 4> dst_slice_start{{0,0,0,0}}; + + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + src_slice_start[2] += 1; + dst_slice_start[2] += 1; + } + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 3); + VERIFY_IS_EQUAL(result.dimension(2), 5); + VERIFY_IS_EQUAL(result.dimension(3), 7); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } + + dst_slice_start[2] = 0; + result.setRandom(); + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + dst_slice_start[2] += 1; + } + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < 
expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + +void test_cxx11_tensor_reverse() +{ + CALL_SUBTEST(test_simple_reverse<ColMajor>()); + CALL_SUBTEST(test_simple_reverse<RowMajor>()); + CALL_SUBTEST(test_expr_reverse<ColMajor>()); + CALL_SUBTEST(test_expr_reverse<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp new file mode 100644 index 000000000..2f7fd9e50 --- /dev/null +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -0,0 +1,187 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::array; + +template <int DataLayout> +static void test_simple_shuffling() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor<float, 4, DataLayout> no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 5); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor<float, 4, DataLayout> shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 5); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + +template <int DataLayout> +static void test_expr_shuffling() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + + array<ptrdiff_t, 4> shuffles; + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor<float, 4, DataLayout> expected; + expected = tensor.shuffle(shuffles); + + Tensor<float, 4, DataLayout> result(5,7,3,2); + + array<int, 4> src_slice_dim{{2,3,1,7}}; + array<int, 4> src_slice_start{{0,0,0,0}}; + array<int, 4> dst_slice_dim{{1,7,3,2}}; + array<int, 4> dst_slice_start{{0,0,0,0}}; + + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).shuffle(shuffles); + src_slice_start[2] += 1; + dst_slice_start[0] += 1; + } + + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + VERIFY_IS_EQUAL(result.dimension(2), 3); + VERIFY_IS_EQUAL(result.dimension(3), 2); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) 
{ + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } + + dst_slice_start[0] = 0; + result.setRandom(); + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.shuffle(shuffles).slice(dst_slice_start, dst_slice_dim); + dst_slice_start[0] += 1; + } + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + +template <int DataLayout> +static void test_shuffling_as_value() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> shuffles; + shuffles[2] = 0; + shuffles[3] = 1; + shuffles[1] = 2; + shuffles[0] = 3; + Tensor<float, 4, DataLayout> shuffle(5,7,3,2); + shuffle.shuffle(shuffles) = tensor; + + VERIFY_IS_EQUAL(shuffle.dimension(0), 5); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } + + array<ptrdiff_t, 4> no_shuffle; + no_shuffle[0] = 0; + no_shuffle[1] = 1; + no_shuffle[2] = 2; + no_shuffle[3] = 3; + Tensor<float, 4, DataLayout> shuffle2(5,7,3,2); + shuffle2.shuffle(shuffles) = tensor.shuffle(no_shuffle); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 2; ++l) { + VERIFY_IS_EQUAL(shuffle2(i,j,k,l), shuffle(i,j,k,l)); + } + } + } + } +} + +void test_cxx11_tensor_shuffling() +{ + CALL_SUBTEST(test_simple_shuffling<ColMajor>()); + CALL_SUBTEST(test_simple_shuffling<RowMajor>()); + CALL_SUBTEST(test_expr_shuffling<ColMajor>()); + CALL_SUBTEST(test_expr_shuffling<RowMajor>()); + CALL_SUBTEST(test_shuffling_as_value<ColMajor>()); + CALL_SUBTEST(test_shuffling_as_value<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index ea512c9cc..23855fca0 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -32,6 +32,7 @@ static void test_1d() vec1(5) = 42; vec2(5) = 5; vec3(5) = 0; vec4.setZero(); + VERIFY_IS_EQUAL((vec1.rank()), 1); VERIFY_IS_EQUAL((vec1.size()), 6); VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6); @@ -99,10 +100,12 @@ static void test_2d() mat2(1,1) = 4; mat2(1,2) = 5; + VERIFY_IS_EQUAL((mat1.rank()), 2); VERIFY_IS_EQUAL((mat1.size()), 6); VERIFY_IS_EQUAL((mat1.dimensions()[0]), 2); VERIFY_IS_EQUAL((mat1.dimensions()[1]), 3); + VERIFY_IS_EQUAL((mat2.rank()), 2); VERIFY_IS_EQUAL((mat2.size()), 6); VERIFY_IS_EQUAL((mat2.dimensions()[0]), 2); VERIFY_IS_EQUAL((mat2.dimensions()[1]), 3); @@ -163,7 +166,7 @@ static void test_3d() VERIFY_IS_EQUAL((epsilon(0,2,1)), -1); VERIFY_IS_EQUAL((epsilon(1,0,2)), -1); - std::array<Eigen::DenseIndex, 3> dims{{2,3,4}}; + array<Eigen::DenseIndex, 3> dims{{2,3,4}}; Tensor<int, 3> t1(dims); Tensor<int, 3, RowMajor> t2(dims); @@ -244,7 +247,7 @@ static void test_simple_assign() epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1; epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1; - Tensor<int, 3> e2(2,3,1); + Tensor<int, 3> e2(3,3,3); e2.setZero(); VERIFY_IS_EQUAL((e2(1,2,0)), 0); @@ -257,12 +260,38 @@ static void test_simple_assign() VERIFY_IS_EQUAL((e2(1,0,2)), -1); } +static 
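+// The resize() checks below pin down the expected allocation behaviour: +// resizing to a shape with the same total element count (2*3*7 == 3*2*7 == 42) +// keeps the existing buffer, while growing to 3*5*7 forces a reallocation, +// which is why the data() pointer is compared before and after.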
void test_resize() +{ + Tensor<int, 3> epsilon; + epsilon.resize(2,3,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 2); + VERIFY_IS_EQUAL(epsilon.dimension(1), 3); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7); + + const int* old_data = epsilon.data(); + epsilon.resize(3,2,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 3); + VERIFY_IS_EQUAL(epsilon.dimension(1), 2); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7); + VERIFY_IS_EQUAL(epsilon.data(), old_data); + + epsilon.resize(3,5,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 3); + VERIFY_IS_EQUAL(epsilon.dimension(1), 5); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 3ul*5*7); + VERIFY_IS_NOT_EQUAL(epsilon.data(), old_data); +} + void test_cxx11_tensor_simple() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_simple_assign()); + CALL_SUBTEST(test_resize()); } /* diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp new file mode 100644 index 000000000..935b908cc --- /dev/null +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -0,0 +1,119 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<int DataLayout> +static void test_simple_striding() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> strides; + strides[0] = 1; + strides[1] = 1; + strides[2] = 1; + strides[3] = 1; + + Tensor<float, 4, DataLayout> no_stride; + no_stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(no_stride.dimension(0), 2); + VERIFY_IS_EQUAL(no_stride.dimension(1), 3); + VERIFY_IS_EQUAL(no_stride.dimension(2), 5); + VERIFY_IS_EQUAL(no_stride.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); + } + } + } + } + + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + Tensor<float, 4, DataLayout> stride; + stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(stride.dimension(0), 1); + VERIFY_IS_EQUAL(stride.dimension(1), 1); + VERIFY_IS_EQUAL(stride.dimension(2), 3); + VERIFY_IS_EQUAL(stride.dimension(3), 3); + + for (int i = 0; i < 1; ++i) { + for (int j = 0; j < 1; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); + } + } + } + } +} + + +template<int DataLayout> +static void test_striding_as_lvalue() +{ + Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> strides; + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + + Tensor<float, 4, DataLayout> result(3, 12, 10, 21); + result.stride(strides) = tensor; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), result(2*i,4*j,2*k,3*l)); + } + } + } + } + + array<ptrdiff_t, 4> no_strides; + no_strides[0] = 1; + 
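+ // the all-unit strides set up below make tensor.stride(no_strides) an + // identity view, so the following assignment exercises a strided lvalue + // fed from a strided rvalue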
no_strides[1] = 1; + no_strides[2] = 1; + no_strides[3] = 1; + Tensor<float, 4, DataLayout> result2(3, 12, 10, 21); + result2.stride(strides) = tensor.stride(no_strides); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), result2(2*i,4*j,2*k,3*l)); + } + } + } + } +} + + +void test_cxx11_tensor_striding() +{ + CALL_SUBTEST(test_simple_striding<ColMajor>()); + CALL_SUBTEST(test_simple_striding<RowMajor>()); + CALL_SUBTEST(test_striding_as_lvalue<ColMajor>()); + CALL_SUBTEST(test_striding_as_lvalue<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp new file mode 100644 index 000000000..6fe65c7f9 --- /dev/null +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -0,0 +1,270 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + + +#include "main.h" +#include <iostream> +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + + +static void test_multithread_elementwise() +{ + Tensor<float, 3> in1(2,3,7); + Tensor<float, 3> in2(2,3,7); + Tensor<float, 3> out(2,3,7); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11)); + out.device(thread_pool_device) = in1 + in2 * 3.14f; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + } + } + } +} + + +static void test_multithread_compound_assignment() +{ + Tensor<float, 3> in1(2,3,7); + Tensor<float, 3> in2(2,3,7); + Tensor<float, 3> out(2,3,7); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11)); + out.device(thread_pool_device) = in1; + out.device(thread_pool_device) += in2 * 3.14f; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + } + } + } +} + +template<int DataLayout> +static void test_multithread_contraction() +{ + Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31); + Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10); + + t_left.setRandom(); + t_right.setRandom(); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 1147); + MapXf m_right(t_right.data(), 1147, 1400); + Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + Eigen::ThreadPoolDevice thread_pool_device(4); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " 
vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + +template<int DataLayout> +static void test_contraction_corner_cases() +{ + Tensor<float, 2, DataLayout> t_left(32, 500); + Tensor<float, 2, DataLayout> t_right(32, 28*28); + Tensor<float, 2, DataLayout> t_result(500, 28*28); + + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result = t_result.constant(NAN); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}}; + + typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 32, 500); + MapXf m_right(t_right.data(), 32, 28*28); + Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28); + + Eigen::ThreadPoolDevice thread_pool_device(12); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left.transpose() * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!std::isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_result.resize (1, 28*28); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!std::isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 500); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (500, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 500); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!std::isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (1, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!std::isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + +template<int DataLayout> +static void 
test_multithread_contraction_agrees_with_singlethread() { + int contract_size = internal::random<int>(1, 5000); + + Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80), + contract_size, + internal::random<int>(1, 100)); + + Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25), + internal::random<int>(1, 37), + contract_size, + internal::random<int>(1, 51)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for better precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}}); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(2, 11)); + + Tensor<float, 5, DataLayout> st_result; + st_result = left.contract(right, dims); + + Tensor<float, 5, DataLayout> tp_result(st_result.dimensions()); + tp_result.device(thread_pool_device) = left.contract(right, dims); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // only run the relative-precision check when the absolute difference is + // non-negligible: for nearly identical small values VERIFY_IS_APPROX can + // fail spuriously due to numerical precision issues + if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + + +static void test_memcpy() { + + for (int i = 0; i < 5; ++i) { + const int num_threads = internal::random<int>(3, 11); + Eigen::ThreadPoolDevice thread_pool_device(num_threads); + + const int size = internal::random<int>(13, 7632); + Tensor<float, 1> t1(size); + t1.setRandom(); + std::vector<float> result(size); + thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float)); + for (int j = 0; j < size; j++) { + VERIFY_IS_EQUAL(t1(j), result[j]); + } + } +} + + +static void test_multithread_random() +{ + Eigen::ThreadPoolDevice device(2); + Tensor<float, 1> t(1 << 20); + t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>(); +} + + +void test_cxx11_tensor_thread_pool() +{ + CALL_SUBTEST(test_multithread_elementwise()); + CALL_SUBTEST(test_multithread_compound_assignment()); + + CALL_SUBTEST(test_multithread_contraction<ColMajor>()); + CALL_SUBTEST(test_multithread_contraction<RowMajor>()); + + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); + + // Exercise various cases that have been problematic in the past.
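+ // (degenerate outputs: the corner-case test resizes results down to + // single-row and single-column shapes such as (1, 28*28) and (1, 4))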
+ CALL_SUBTEST(test_contraction_corner_cases<ColMajor>()); + CALL_SUBTEST(test_contraction_corner_cases<RowMajor>()); + + CALL_SUBTEST(test_memcpy()); + + CALL_SUBTEST(test_multithread_random()); +} diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index 81b762c37..8b300b78a 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -21,6 +21,7 @@ template<typename T> void test_minres_T() // Diagonal preconditioner MINRES<SparseMatrix<T>, Lower, DiagonalPreconditioner<T> > minres_colmajor_lower_diag; MINRES<SparseMatrix<T>, Upper, DiagonalPreconditioner<T> > minres_colmajor_upper_diag; + MINRES<SparseMatrix<T>, Lower|Upper, DiagonalPreconditioner<T> > minres_colmajor_uplo_diag; // call tests for SPD matrix CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) ); @@ -28,6 +29,7 @@ template<typename T> void test_minres_T() CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag) ); // TO DO: symmetric semi-definite matrix // TO DO: symmetric indefinite matrix
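For reference, a minimal sketch of how the newly tested Lower|Upper instantiation is driven outside the test harness; the solver calls (compute(), solve(), iterations(), error()) are the standard Eigen iterative-solver interface, while the tridiagonal SPD matrix assembled here is purely illustrative:

#include <iostream>
#include <vector>
#include <Eigen/Sparse>
#include <unsupported/Eigen/IterativeSolvers> // provides MINRES

int main()
{
  using namespace Eigen;
  const int n = 100;

  // assemble an illustrative SPD tridiagonal matrix
  std::vector<Triplet<double> > entries;
  for (int i = 0; i < n; ++i) {
    entries.push_back(Triplet<double>(i, i, 2.0));
    if (i + 1 < n) {
      entries.push_back(Triplet<double>(i, i + 1, -1.0));
      entries.push_back(Triplet<double>(i + 1, i, -1.0));
    }
  }
  SparseMatrix<double> A(n, n);
  A.setFromTriplets(entries.begin(), entries.end());
  VectorXd b = VectorXd::Random(n);

  // Lower|Upper tells MINRES to use both triangular parts of A directly,
  // matching the minres_colmajor_uplo_diag configuration added above
  MINRES<SparseMatrix<double>, Lower|Upper, DiagonalPreconditioner<double> > solver;
  solver.compute(A);
  VectorXd x = solver.solve(b);

  std::cout << "#iterations: " << solver.iterations()
            << ", estimated error: " << solver.error() << std::endl;
  return 0;
}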